2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
12 #include "../inc/piestimator.h"
19 #include <cuda_runtime.h>
25 __device__ unsigned int reduce_sum(unsigned int in)
27 extern __shared__ unsigned int sdata[];
29 // Perform first level of reduction:
30 // - Write to shared memory
31 unsigned int ltid = threadIdx.x;
36 // Do reduction in shared mem
37 for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
41 sdata[ltid] += sdata[ltid + s];
51 template <typename Real>
52 __global__ void computeValue(unsigned int *const results,
53 const Real *const points,
54 const unsigned int numSims)
56 // Determine thread ID
57 unsigned int bid = blockIdx.x;
58 unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
59 unsigned int step = gridDim.x * blockDim.x;
61 // Shift the input/output pointers
62 const Real *pointx = points + tid;
63 const Real *pointy = pointx + numSims;
65 // Count the number of points which lie inside the unit quarter-circle
66 unsigned int pointsInside = 0;
68 for (unsigned int i = tid ; i < numSims ; i += step, pointx += step, pointy += step)
72 Real l2norm2 = x * x + y * y;
74 if (l2norm2 < static_cast<Real>(1))
80 // Reduce within the block
81 pointsInside = reduce_sum(pointsInside);
86 results[bid] = pointsInside;
90 template <typename Real>
91 PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize)
94 m_threadBlockSize(threadBlockSize)
98 template <typename Real>
99 Real PiEstimator<Real>::operator()()
101 cudaError_t cudaResult = cudaSuccess;
102 struct cudaDeviceProp deviceProperties;
103 struct cudaFuncAttributes funcAttributes;
105 // Get device properties
106 cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
108 if (cudaResult != cudaSuccess)
110 string msg("Could not get device properties: ");
111 msg += cudaGetErrorString(cudaResult);
112 throw std::runtime_error(msg);
115 // Check precision is valid
116 if (typeid(Real) == typeid(double) &&
117 (deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3)))
119 throw std::runtime_error("Device does not have double precision support");
123 cudaResult = cudaSetDevice(m_device);
125 if (cudaResult != cudaSuccess)
127 string msg("Could not set CUDA device: ");
128 msg += cudaGetErrorString(cudaResult);
129 throw std::runtime_error(msg);
132 // Determine how to divide the work between cores
135 block.x = m_threadBlockSize;
136 grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
138 // Aim to launch around ten or more times as many blocks as there
139 // are multiprocessors on the target device.
140 unsigned int blocksPerSM = 10;
141 unsigned int numSMs = deviceProperties.multiProcessorCount;
143 while (grid.x > 2 * blocksPerSM * numSMs)
148 // Get computeValue function properties and check the maximum block size
149 cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);
151 if (cudaResult != cudaSuccess)
153 string msg("Could not get function attributes: ");
154 msg += cudaGetErrorString(cudaResult);
155 throw std::runtime_error(msg);
158 if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
160 throw std::runtime_error("Block X dimension is too large for computeValue kernel");
163 // Check the dimensions are valid
164 if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0])
166 throw std::runtime_error("Block X dimension is too large for device");
169 if (grid.x > (unsigned int)deviceProperties.maxGridSize[0])
171 throw std::runtime_error("Grid X dimension is too large for device");
174 // Allocate memory for points
175 // Each simulation has two random numbers to give X and Y coordinate
177 cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real));
179 if (cudaResult != cudaSuccess)
181 string msg("Could not allocate memory on device for random numbers: ");
182 msg += cudaGetErrorString(cudaResult);
183 throw std::runtime_error(msg);
186 // Allocate memory for result
187 // Each thread block will produce one result
188 unsigned int *d_results = 0;
189 cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int));
191 if (cudaResult != cudaSuccess)
193 string msg("Could not allocate memory on device for partial results: ");
194 msg += cudaGetErrorString(cudaResult);
195 throw std::runtime_error(msg);
198 // Generate random points in unit square
199 curandStatus_t curandResult;
200 curandGenerator_t qrng;
202 if (typeid(Real) == typeid(float))
204 curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
206 else if (typeid(Real) == typeid(double))
208 curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL64);
212 string msg("Could not create random number generator of specified type");
213 throw std::runtime_error(msg);
216 if (curandResult != CURAND_STATUS_SUCCESS)
218 string msg("Could not create quasi-random number generator: ");
220 throw std::runtime_error(msg);
223 curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2);
225 if (curandResult != CURAND_STATUS_SUCCESS)
227 string msg("Could not set number of dimensions for quasi-random number generator: ");
229 throw std::runtime_error(msg);
232 curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
234 if (curandResult != CURAND_STATUS_SUCCESS)
236 string msg("Could not set order for quasi-random number generator: ");
238 throw std::runtime_error(msg);
241 if (typeid(Real) == typeid(float))
243 curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);
245 else if (typeid(Real) == typeid(double))
247 curandResult = curandGenerateUniformDouble(qrng, (double *)d_points, 2 * m_numSims);
251 string msg("Could not generate random numbers of specified type");
252 throw std::runtime_error(msg);
255 if (curandResult != CURAND_STATUS_SUCCESS)
257 string msg("Could not generate quasi-random numbers: ");
259 throw std::runtime_error(msg);
262 curandResult = curandDestroyGenerator(qrng);
264 if (curandResult != CURAND_STATUS_SUCCESS)
266 string msg("Could not destroy quasi-random number generator: ");
268 throw std::runtime_error(msg);
271 // Count the points inside unit quarter-circle
272 computeValue<Real><<<grid, block, block.x *sizeof(unsigned int)>>>(d_results, d_points, m_numSims);
274 // Copy partial results back
275 vector<unsigned int> results(grid.x);
276 cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost);
278 if (cudaResult != cudaSuccess)
280 string msg("Could not copy partial results to host: ");
281 msg += cudaGetErrorString(cudaResult);
282 throw std::runtime_error(msg);
285 // Complete sum-reduction on host
286 Real value = static_cast<Real>(std::accumulate(results.begin(), results.end(), 0));
288 // Determine the proportion of points inside the quarter-circle,
289 // i.e. the area of the unit quarter-circle
292 // Value is currently an estimate of the area of a unit quarter-circle, so we can
293 // scale to a full circle by multiplying by four. Now since the area of a circle
294 // is pi * r^2, and r is one, the value will be an estimate for the value of pi.
311 // Explicit template instantiation
312 template class PiEstimator<float>;
313 template class PiEstimator<double>;