/*
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */
//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.
//
// Devices of compute capability 1.x will run the kernels one after another.
// Devices of compute capability 2.0 or higher can overlap the kernels.
//
#include <stdio.h>
#include <stdlib.h>

#include <helper_functions.h>
#include <helper_cuda.h>
// This is a kernel that does no real work but runs at least for a specified number of clocks.
//
// Launch layout: the sample launches it as <<<1,1>>>. Every launched thread spins
// independently and stores its own elapsed tick count to d_o[0], so d_o must point
// to at least one writable clock_t in device memory.
//
//   d_o         - device pointer that receives the number of ticks actually spent spinning
//   clock_count - minimum number of clock ticks to spin for
__global__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    // The counter is handled as a 32-bit value that wraps around (hence the casts).
    // Unsigned subtraction yields the correct delta modulo 2^32 across a wrap:
    //     now - last == now + 2^32 - last   (mod 2^32)
    // Deltas are accumulated per iteration instead of always measuring from the
    // initial sample; a single (end - start) difference can never exceed 2^32 - 1,
    // so the loop would not terminate for a larger clock_count.
    unsigned int last_clock = (unsigned int) clock();

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int now_clock = (unsigned int) clock();

        clock_offset += (clock_t)(now_clock - last_clock);
        last_clock = now_clock;
    }

    d_o[0] = clock_offset;
}
// Single warp reduction kernel: adds up d_clocks[0..N-1] and stores the total in d_clocks[0].
//
// Launch layout: exactly one block of 32 threads (the sample uses <<<1,32>>>).
// The static shared scratch array has 32 entries and the tree reduction starts
// at an offset of 16, so other block sizes are not supported.
//
//   d_clocks - device array of N values; element 0 is overwritten with the sum
//   N        - number of valid elements in d_clocks
__global__ void sum(clock_t *d_clocks, int N)
{
    __shared__ clock_t s_clocks[32];

    // Each thread first accumulates a private partial sum over a strided slice of the input.
    clock_t my_sum = 0;

    for (int i = threadIdx.x; i < N; i += blockDim.x)
    {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    // All partial sums must be visible before any thread reads a neighbour's slot.
    __syncthreads();

    // Tree reduction: halve the number of active threads on every step.
    for (int i = 16; i > 0; i /= 2)
    {
        // Only the lower half adds; without this guard the upper threads would
        // read s_clocks[threadIdx.x + i] past the end of the 32-entry array.
        if (threadIdx.x < i)
        {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
    }

    // A single writer is enough; all global reads finished before the first barrier.
    if (threadIdx.x == 0)
    {
        d_clocks[0] = s_clocks[0];
    }
}
// Program entry point.
//
// Launches `nkernels` copies of clock_block, each in its own stream, makes one
// extra stream wait on all of them via events, then runs the sum kernel and an
// asynchronous copy back to the host in that extra stream. Finally compares the
// summed tick count against the requested total and reports pass/fail.
//
// Command line: --nkernels=<n> overrides the number of concurrent kernels (default 8).
// Returns EXIT_SUCCESS when the test passes, EXIT_FAILURE otherwise.
int main(int argc, char **argv)
{
    int nkernels = 8;               // number of concurrent kernels
    int nstreams = nkernels + 1;    // use one more stream than concurrent kernel
    float kernel_time = 10;         // time the kernel should run in ms
    float elapsed_time;             // timing variables
    int cuda_device = 0;

    printf("[%s] - Starting...\n", argv[0]);

    // get number of kernels if overridden on the command line
    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
    {
        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");

        if (nkernels < 1)
        {
            printf("nkernels must be at least 1\n");
            return EXIT_FAILURE;
        }

        nstreams = nkernels + 1;
    }

    // Size the buffers only after the command line has been applied; computing
    // this from the default nkernels would under-allocate for larger values.
    const size_t nbytes = (size_t)nkernels * sizeof(clock_t);   // number of data bytes

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    cuda_device = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDevice(&cuda_device));

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    if (deviceProp.concurrentKernels == 0)
    {
        printf("> GPU does not support concurrent kernel execution\n");
        printf("  CUDA kernel runs will be serialized\n");
    }

    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // allocate host memory (pinned, as allocated by cudaMallocHost, for the async copy below)
    clock_t *a = 0;                     // pointer to the array data in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));

    // allocate device memory
    clock_t *d_a = 0;                   // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

    // the events are used for synchronization only and hence do not need to record timings
    // this also makes events not introduce global sync points when recorded which is critical to get overlap
    cudaEvent_t *kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));

    if (streams == NULL || kernelEvent == NULL)
    {
        printf("Failed to allocate host memory for stream/event handles\n");
        free(streams);
        free(kernelEvent);
        cudaFreeHost(a);
        cudaFree(d_a);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    cudaEvent_t start_event, stop_event;
    checkCudaErrors(cudaEventCreate(&start_event));
    checkCudaErrors(cudaEventCreate(&stop_event));

    for (int i = 0; i < nkernels; i++)
    {
        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
    }

    //////////////////////////////////////////////////////////////////////
    // time execution with nkernels streams
    clock_t total_clocks = 0;
    // kernel_time is in ms; clockRate is presumably in kHz (ticks per ms) — confirm against cudaDeviceProp docs
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);

    checkCudaErrors(cudaEventRecord(start_event, 0));

    // queue nkernels in separate streams and record when they are done
    for (int i = 0; i < nkernels; ++i)
    {
        clock_block<<<1, 1, 0, streams[i]>>>(&d_a[i], time_clocks);
        checkCudaErrors(cudaGetLastError());    // launches do not report errors directly
        total_clocks += time_clocks;
        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));

        // make the last stream wait for the kernel event to be recorded
        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i], 0));
    }

    // queue a sum kernel and a copy back to host in the last stream.
    // the commands in this stream get dispatched as soon as all the kernel events have been recorded
    sum<<<1, 32, 0, streams[nstreams-1]>>>(d_a, nkernels);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));

    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel

    // in this sample we just wait until the GPU is done
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    // Make sure the copy into a[0] has completed before the host reads it, without
    // depending on how the default stream orders against the other streams.
    checkCudaErrors(cudaStreamSynchronize(streams[nstreams-1]));

    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);

    bool bTestResult = (a[0] > total_clocks);

    // release resources
    for (int i = 0; i < nkernels; i++)
    {
        checkCudaErrors(cudaEventDestroy(kernelEvent[i]));
    }

    // nstreams, not nkernels: the extra stream used for sum + copy must be destroyed too
    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }

    free(streams);
    free(kernelEvent);

    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    if (!bTestResult)
    {
        printf("Test failed!\n");
        return EXIT_FAILURE;
    }

    printf("Test passed\n");
    return EXIT_SUCCESS;
}