utiltools/X86MAC64/cuda/samples/0_Simple/inlinePTX/inlinePTX.cu

   1 /*
   2  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5  * with this source code for terms and conditions that govern your use of
   6  * this software. Any use, reproduction, disclosure, or distribution of
   7  * this software and related documentation outside the terms of the EULA
   8  * is strictly prohibited.
   9  *
  10  */
  11
  12 /*
  13  * Demonstration of inline PTX (assembly language) usage in CUDA kernels
  14  */
  15
  16 // System includes
  17 #include <stdio.h>
  18 #include <assert.h>
  19
  20 // CUDA runtime
  21 #include <cuda_runtime.h>
  22
  23 // helper functions and utilities to work with CUDA
  24 #include <helper_functions.h>
  25 #include <helper_cuda.h>
  26
  /**
   * Kernel: stores each thread's warp lane ID into d_ptr at that thread's
   * global index.
   *
   * The global index is blockIdx.x * blockDim.x + threadIdx.x (only the x
   * dimension is consulted). Threads whose index is >= length write nothing.
   * The lane ID is read from the PTX special register %laneid via inline
   * assembly.
   *
   * @param d_ptr   output array; element i receives the lane ID of the thread
   *                whose global index is i
   * @param length  number of elements in d_ptr
   */
  __global__ void sequence_gpu(int *d_ptr, int length)
  {
      const int gid = blockIdx.x * blockDim.x + threadIdx.x;

      // Threads that fall past the end of the array have nothing to do.
      if (gid >= length)
      {
          return;
      }

      // Fetch this thread's lane ID (its position within the current warp)
      // straight from the %laneid special register.
      unsigned int lane;
      asm("mov.u32 %0, %%laneid;" : "=r"(lane));

      d_ptr[gid] = lane;
  }
  39
  40
  /**
   * Host reference: fills h_ptr[i] with i % 32 for every i in [0, length).
   *
   * @param h_ptr   output array with room for at least length ints
   * @param length  number of elements to write; nothing is written when <= 0
   */
  void sequence_cpu(int *h_ptr, int length)
  {
      // Running counter that always equals idx % 32 at the top of an iteration.
      int lane = 0;

      for (int idx = 0; idx < length; ++idx)
      {
          h_ptr[idx] = lane;

          // Advance and wrap back to 0 after 31.
          lane = (lane + 1 == 32) ? 0 : lane + 1;
      }
  }
  48
  /**
   * Entry point of the inline PTX sample.
   *
   * Launches sequence_gpu over N elements, builds the same sequence on the
   * host with sequence_cpu, copies the device result back and compares the
   * two arrays element by element.
   *
   * @param argc  argument count, forwarded to findCudaDevice
   * @param argv  argument vector, forwarded to findCudaDevice
   * @return EXIT_SUCCESS when every element matches; EXIT_FAILURE on a
   *         mismatch or when findCudaDevice returns -1
   */
  int main(int argc, char **argv)
  {
      printf("CUDA inline PTX assembler sample\n");

      const int N = 1000;
      const size_t numBytes = N * sizeof(int);

      const int dev = findCudaDevice(argc, (const char **) argv);

      if (dev == -1)
      {
          return EXIT_FAILURE;
      }

      // Device buffer that the kernel fills with lane IDs.
      int *d_ptr;
      checkCudaErrors(cudaMalloc(&d_ptr, numBytes));

      // Host buffer (allocated with cudaMallocHost) for the CPU reference.
      int *h_ptr;
      checkCudaErrors(cudaMallocHost(&h_ptr, numBytes));

      // One thread per element: 256-thread blocks, block count rounded up.
      dim3 cudaBlockSize(256, 1, 1);
      dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);
      sequence_gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);
      checkCudaErrors(cudaGetLastError());        // errors reported for the launch itself
      checkCudaErrors(cudaDeviceSynchronize());   // errors raised while the kernel ran

      sequence_cpu(h_ptr, N);

      // Host-side copy of the device result.
      int *h_d_ptr;
      checkCudaErrors(cudaMallocHost(&h_d_ptr, numBytes));
      checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, numBytes, cudaMemcpyDeviceToHost));

      // Compare reference and device output, stopping at the first mismatch.
      bool bValid = true;

      for (int i = 0; i < N; ++i)
      {
          if (h_ptr[i] != h_d_ptr[i])
          {
              bValid = false;
              break;
          }
      }

      printf("Test %s.\n", bValid ? "Successful" : "Failed");

      checkCudaErrors(cudaFree(d_ptr));
      checkCudaErrors(cudaFreeHost(h_ptr));
      checkCudaErrors(cudaFreeHost(h_d_ptr));

      checkCudaErrors(cudaDeviceReset());

      if (!bValid)
      {
          return EXIT_FAILURE;
      }

      return EXIT_SUCCESS;
  }