utiltools/X86MAC64/cuda/samples/7_CUDALibraries/MC_EstimatePiQ/src/piestimator.cu

   1 /*
   2  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5  * with this source code for terms and conditions that govern your use of
   6  * this software. Any use, reproduction, disclosure, or distribution of
   7  * this software and related documentation outside the terms of the EULA
   8  * is strictly prohibited.
   9  *
  10  */
  11
  12 #include "../inc/piestimator.h"
  13
  14 #include <string>
  15 #include <vector>
  16 #include <numeric>
  17 #include <stdexcept>
  18 #include <typeinfo>
  19 #include <cuda_runtime.h>
  20 #include <curand.h>
  21
  22 using std::string;
  23 using std::vector;
  24
  25 __device__ unsigned int reduce_sum(unsigned int in)
  26 {
  27     extern __shared__ unsigned int sdata[];
  28
  29     // Perform first level of reduction:
  30     // - Write to shared memory
  31     unsigned int ltid = threadIdx.x;
  32
  33     sdata[ltid] = in;
  34     __syncthreads();
  35
  36     // Do reduction in shared mem
  37     for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
  38     {
  39         if (ltid < s)
  40         {
  41             sdata[ltid] += sdata[ltid + s];
  42         }
  43
  44         __syncthreads();
  45     }
  46
  47     return sdata[0];
  48 }
  49
  50 // Estimator kernel
  51 template <typename Real>
  52 __global__ void computeValue(unsigned int *const results,
  53                              const Real *const points,
  54                              const unsigned int numSims)
  55 {
  56     // Determine thread ID
  57     unsigned int bid = blockIdx.x;
  58     unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
  59     unsigned int step = gridDim.x * blockDim.x;
  60
  61     // Shift the input/output pointers
  62     const Real *pointx = points + tid;
  63     const Real *pointy = pointx + numSims;
  64
  65     // Count the number of points which lie inside the unit quarter-circle
  66     unsigned int pointsInside = 0;
  67
  68     for (unsigned int i = tid ; i < numSims ; i += step, pointx += step, pointy += step)
  69     {
  70         Real x = *pointx;
  71         Real y = *pointy;
  72         Real l2norm2 = x * x + y * y;
  73
  74         if (l2norm2 < static_cast<Real>(1))
  75         {
  76             pointsInside++;
  77         }
  78     }
  79
  80     // Reduce within the block
  81     pointsInside = reduce_sum(pointsInside);
  82
  83     // Store the result
  84     if (threadIdx.x == 0)
  85     {
  86         results[bid] = pointsInside;
  87     }
  88 }
  89
  90 template <typename Real>
  91 PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize)
  92     : m_numSims(numSims),
  93       m_device(device),
  94       m_threadBlockSize(threadBlockSize)
  95 {
  96 }
  97
  98 template <typename Real>
  99 Real PiEstimator<Real>::operator()()
 100 {
 101     cudaError_t cudaResult = cudaSuccess;
 102     struct cudaDeviceProp     deviceProperties;
 103     struct cudaFuncAttributes funcAttributes;
 104
 105     // Get device properties
 106     cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
 107
 108     if (cudaResult != cudaSuccess)
 109     {
 110         string msg("Could not get device properties: ");
 111         msg += cudaGetErrorString(cudaResult);
 112         throw std::runtime_error(msg);
 113     }
 114
 115     // Check precision is valid
 116     if (typeid(Real) == typeid(double) &&
 117         (deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3)))
 118     {
 119         throw std::runtime_error("Device does not have double precision support");
 120     }
 121
 122     // Attach to GPU
 123     cudaResult = cudaSetDevice(m_device);
 124
 125     if (cudaResult != cudaSuccess)
 126     {
 127         string msg("Could not set CUDA device: ");
 128         msg += cudaGetErrorString(cudaResult);
 129         throw std::runtime_error(msg);
 130     }
 131
 132     // Determine how to divide the work between cores
 133     dim3 block;
 134     dim3 grid;
 135     block.x = m_threadBlockSize;
 136     grid.x  = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
 137
 138     // Aim to launch around ten or more times as many blocks as there
 139     // are multiprocessors on the target device.
 140     unsigned int blocksPerSM = 10;
 141     unsigned int numSMs      = deviceProperties.multiProcessorCount;
 142
 143     while (grid.x > 2 * blocksPerSM * numSMs)
 144     {
 145         grid.x >>= 1;
 146     }
 147
 148     // Get computeValue function properties and check the maximum block size
 149     cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);
 150
 151     if (cudaResult != cudaSuccess)
 152     {
 153         string msg("Could not get function attributes: ");
 154         msg += cudaGetErrorString(cudaResult);
 155         throw std::runtime_error(msg);
 156     }
 157
 158     if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
 159     {
 160         throw std::runtime_error("Block X dimension is too large for computeValue kernel");
 161     }
 162
 163     // Check the dimensions are valid
 164     if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0])
 165     {
 166         throw std::runtime_error("Block X dimension is too large for device");
 167     }
 168
 169     if (grid.x > (unsigned int)deviceProperties.maxGridSize[0])
 170     {
 171         throw std::runtime_error("Grid X dimension is too large for device");
 172     }
 173
 174     // Allocate memory for points
 175     // Each simulation has two random numbers to give X and Y coordinate
 176     Real *d_points = 0;
 177     cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real));
 178
 179     if (cudaResult != cudaSuccess)
 180     {
 181         string msg("Could not allocate memory on device for random numbers: ");
 182         msg += cudaGetErrorString(cudaResult);
 183         throw std::runtime_error(msg);
 184     }
 185
 186     // Allocate memory for result
 187     // Each thread block will produce one result
 188     unsigned int *d_results = 0;
 189     cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int));
 190
 191     if (cudaResult != cudaSuccess)
 192     {
 193         string msg("Could not allocate memory on device for partial results: ");
 194         msg += cudaGetErrorString(cudaResult);
 195         throw std::runtime_error(msg);
 196     }
 197
 198     // Generate random points in unit square
 199     curandStatus_t curandResult;
 200     curandGenerator_t qrng;
 201
 202     if (typeid(Real) == typeid(float))
 203     {
 204         curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
 205     }
 206     else if (typeid(Real) == typeid(double))
 207     {
 208         curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL64);
 209     }
 210     else
 211     {
 212         string msg("Could not create random number generator of specified type");
 213         throw std::runtime_error(msg);
 214     }
 215
 216     if (curandResult != CURAND_STATUS_SUCCESS)
 217     {
 218         string msg("Could not create quasi-random number generator: ");
 219         msg += curandResult;
 220         throw std::runtime_error(msg);
 221     }
 222
 223     curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2);
 224
 225     if (curandResult != CURAND_STATUS_SUCCESS)
 226     {
 227         string msg("Could not set number of dimensions for quasi-random number generator: ");
 228         msg += curandResult;
 229         throw std::runtime_error(msg);
 230     }
 231
 232     curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
 233
 234     if (curandResult != CURAND_STATUS_SUCCESS)
 235     {
 236         string msg("Could not set order for quasi-random number generator: ");
 237         msg += curandResult;
 238         throw std::runtime_error(msg);
 239     }
 240
 241     if (typeid(Real) == typeid(float))
 242     {
 243         curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);
 244     }
 245     else if (typeid(Real) == typeid(double))
 246     {
 247         curandResult = curandGenerateUniformDouble(qrng, (double *)d_points, 2 * m_numSims);
 248     }
 249     else
 250     {
 251         string msg("Could not generate random numbers of specified type");
 252         throw std::runtime_error(msg);
 253     }
 254
 255     if (curandResult != CURAND_STATUS_SUCCESS)
 256     {
 257         string msg("Could not generate quasi-random numbers: ");
 258         msg += curandResult;
 259         throw std::runtime_error(msg);
 260     }
 261
 262     curandResult = curandDestroyGenerator(qrng);
 263
 264     if (curandResult != CURAND_STATUS_SUCCESS)
 265     {
 266         string msg("Could not destroy quasi-random number generator: ");
 267         msg += curandResult;
 268         throw std::runtime_error(msg);
 269     }
 270
 271     // Count the points inside unit quarter-circle
 272     computeValue<Real><<<grid, block, block.x *sizeof(unsigned int)>>>(d_results, d_points, m_numSims);
 273
 274     // Copy partial results back
 275     vector<unsigned int> results(grid.x);
 276     cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost);
 277
 278     if (cudaResult != cudaSuccess)
 279     {
 280         string msg("Could not copy partial results to host: ");
 281         msg += cudaGetErrorString(cudaResult);
 282         throw std::runtime_error(msg);
 283     }
 284
 285     // Complete sum-reduction on host
 286     Real value = static_cast<Real>(std::accumulate(results.begin(), results.end(), 0));
 287
 288     // Determine the proportion of points inside the quarter-circle,
 289     // i.e. the area of the unit quarter-circle
 290     value /= m_numSims;
 291
 292     // Value is currently an estimate of the area of a unit quarter-circle, so we can
 293     // scale to a full circle by multiplying by four. Now since the area of a circle
 294     // is pi * r^2, and r is one, the value will be an estimate for the value of pi.
 295     value *= 4;
 296
 297     // Cleanup
 298     if (d_points)
 299     {
 300         cudaFree(d_points);
 301         d_points = 0;
 302     }
 303     if (d_results)
 304     {
 305         cudaFree(d_results);
 306         d_results = 0;
 307     }
 308     return value;
 309 }
 310
 311 // Explicit template instantiation
 312 template class PiEstimator<float>;
 313 template class PiEstimator<double>;