util/X86MAC64/cuda/samples/6_Advanced/concurrentKernels/concurrentKernels.cu

   1 /*
   2  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5  * with this source code for terms and conditions that govern your use of
   6  * this software. Any use, reproduction, disclosure, or distribution of
   7  * this software and related documentation outside the terms of the EULA
   8  * is strictly prohibited.
   9  *
  10  */
  11
  12 //
  13 // This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
  14 // introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
  15 // in CUDA 3.2.
  16 //
  17 // Devices of compute capability 1.x will run the kernels one after another
  18 // Devices of compute capability 2.0 or higher can overlap the kernels
  19 //
  20 #include <stdio.h>
  21 #include <helper_functions.h>
  22 #include <helper_cuda.h>
  23
  24 // This is a kernel that does no real work but runs at least for a specified number of clocks
  25 __global__ void clock_block(clock_t *d_o, clock_t clock_count)
  26 {
  27     unsigned int start_clock = (unsigned int) clock();
  28
  29     clock_t clock_offset = 0;
  30
  31     while (clock_offset < clock_count)
  32     {
  33         unsigned int end_clock = (unsigned int) clock();
  34
  35         // The code below should work like
  36         // this (thanks to modular arithmetics):
  37         //
  38         // clock_offset = (clock_t) (end_clock > start_clock ?
  39         //                           end_clock - start_clock :
  40         //                           end_clock + (0xffffffffu - start_clock));
  41         //
  42         // Indeed, let m = 2^32 then
  43         // end - start = end + m - start (mod m).
  44
  45         clock_offset = (clock_t)(end_clock - start_clock);
  46     }
  47
  48     d_o[0] = clock_offset;
  49 }
  50
  51
  52 // Single warp reduction kernel
  53 __global__ void sum(clock_t *d_clocks, int N)
  54 {
  55     __shared__ clock_t s_clocks[32];
  56
  57     clock_t my_sum = 0;
  58
  59     for (int i = threadIdx.x; i < N; i+= blockDim.x)
  60     {
  61         my_sum += d_clocks[i];
  62     }
  63
  64     s_clocks[threadIdx.x] = my_sum;
  65     syncthreads();
  66
  67     for (int i=16; i>0; i/=2)
  68     {
  69         if (threadIdx.x < i)
  70         {
  71             s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
  72         }
  73
  74         syncthreads();
  75     }
  76
  77     d_clocks[0] = s_clocks[0];
  78 }
  79
  80 int main(int argc, char **argv)
  81 {
  82     int nkernels = 8;               // number of concurrent kernels
  83     int nstreams = nkernels + 1;    // use one more stream than concurrent kernel
  84     int nbytes = nkernels * sizeof(clock_t);   // number of data bytes
  85     float kernel_time = 10; // time the kernel should run in ms
  86     float elapsed_time;   // timing variables
  87     int cuda_device = 0;
  88
  89     printf("[%s] - Starting...\n", argv[0]);
  90
  91     // get number of kernels if overridden on the command line
  92     if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
  93     {
  94         nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
  95         nstreams = nkernels + 1;
  96     }
  97
  98     // use command-line specified CUDA device, otherwise use device with highest Gflops/s
  99     cuda_device = findCudaDevice(argc, (const char **)argv);
 100
 101     cudaDeviceProp deviceProp;
 102     checkCudaErrors(cudaGetDevice(&cuda_device));
 103
 104     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
 105
 106     if ((deviceProp.concurrentKernels == 0))
 107     {
 108         printf("> GPU does not support concurrent kernel execution\n");
 109         printf("  CUDA kernel runs will be serialized\n");
 110     }
 111
 112     printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
 113            deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
 114
 115     // allocate host memory
 116     clock_t *a = 0;                     // pointer to the array data in host memory
 117     checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
 118
 119     // allocate device memory
 120     clock_t *d_a = 0;             // pointers to data and init value in the device memory
 121     checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
 122
 123     // allocate and initialize an array of stream handles
 124     cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
 125
 126     for (int i = 0; i < nstreams; i++)
 127     {
 128         checkCudaErrors(cudaStreamCreate(&(streams[i])));
 129     }
 130
 131     // create CUDA event handles
 132     cudaEvent_t start_event, stop_event;
 133     checkCudaErrors(cudaEventCreate(&start_event));
 134     checkCudaErrors(cudaEventCreate(&stop_event));
 135
 136
 137     // the events are used for synchronization only and hence do not need to record timings
 138     // this also makes events not introduce global sync points when recorded which is critical to get overlap
 139     cudaEvent_t *kernelEvent;
 140     kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
 141
 142     for (int i = 0; i < nkernels; i++)
 143     {
 144         checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
 145     }
 146
 147     //////////////////////////////////////////////////////////////////////
 148     // time execution with nkernels streams
 149     clock_t total_clocks = 0;
 150     clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
 151
 152     cudaEventRecord(start_event, 0);
 153
 154     // queue nkernels in separate streams and record when they are done
 155     for (int i=0; i<nkernels; ++i)
 156     {
 157         clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
 158         total_clocks += time_clocks;
 159         checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
 160
 161         // make the last stream wait for the kernel event to be recorded
 162         checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
 163     }
 164
 165     // queue a sum kernel and a copy back to host in the last stream.
 166     // the commands in this stream get dispatched as soon as all the kernel events have been recorded
 167     sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
 168     checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
 169
 170     // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
 171
 172     // in this sample we just wait until the GPU is done
 173     checkCudaErrors(cudaEventRecord(stop_event, 0));
 174     checkCudaErrors(cudaEventSynchronize(stop_event));
 175     checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
 176
 177     printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
 178     printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
 179     printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
 180
 181     bool bTestResult  = (a[0] > total_clocks);
 182
 183     // release resources
 184     for (int i = 0; i < nkernels; i++)
 185     {
 186         cudaStreamDestroy(streams[i]);
 187         cudaEventDestroy(kernelEvent[i]);
 188     }
 189
 190     free(streams);
 191     free(kernelEvent);
 192
 193     cudaEventDestroy(start_event);
 194     cudaEventDestroy(stop_event);
 195     cudaFreeHost(a);
 196     cudaFree(d_a);
 197
 198     cudaDeviceReset();
 199
 200     if (!bTestResult)
 201     {
 202         printf("Test failed!\n");
 203         exit(EXIT_FAILURE);
 204     }
 205
 206     printf("Test passed\n");
 207     exit(EXIT_SUCCESS);
 208 }