2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
13 * This sample evaluates fair call price for a
14 * given set of European options using Monte Carlo approach.
15 * See supplied whitepaper for more explanations.
24 #include <cuda_runtime.h>
27 #include <helper_functions.h> // Helper functions (utilities, parsing, timing)
28 #include <helper_cuda.h> // helper functions (cuda error checking and intialization)
29 #include <multithreading.h>
31 #include "MonteCarlo_common.h"
// Maps strcasecmp() onto _strcmpi() so the case-insensitive option parsing
// in main() can call strcasecmp() unconditionally.
// NOTE(review): no platform guard (#ifdef) is visible around this define in
// this listing -- confirm it is only active where _strcmpi exists.
37 #define strcasecmp _strcmpi
40 ////////////////////////////////////////////////////////////////////////////////
42 ////////////////////////////////////////////////////////////////////////////////
/// Returns a pseudo-random float interpolated linearly between two bounds.
///
/// Draws one value from rand(), so the sequence is controlled by srand().
///
/// @param low   value returned when the normalised draw is 0
/// @param high  value returned when the normalised draw is 1
/// @return      (1 - t) * low + t * high with t = rand() / RAND_MAX
float randFloat(float low, float high)
{
    // rand() yields an int in [0, RAND_MAX]; normalise it to t in [0, 1].
    float t = (float)rand() / (float)RAND_MAX;

    // Linear interpolation between the two bounds.
    return (1.0f - t) * low + t * high;
}
// Starts from default_nOptions and, for each of the GPU_N devices, queries the
// device properties and derives a core count for that device; nOptions can be
// lowered to half of that core count.
//
// Parameters:
//   GPU_N            - number of devices to inspect (device ids 0 .. GPU_N-1)
//   default_nOptions - option count to start from
//
// NOTE(review): the condition guarding the clamp and the return statement are
// not visible in this listing -- confirm against the complete file. As shown,
// the clamp line on its own would apply to every device.
49 /// Utility function to tweak problem size for small GPUs
50 int adjustProblemSize(int GPU_N, int default_nOptions)
52 int nOptions = default_nOptions;
54 // select problem size
55 for (int i=0; i<GPU_N; i++)
57 cudaDeviceProp deviceProp;
58 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
// presumably _ConvertSMVer2Cores() gives cores per multiprocessor, making this
// the total core count of device i -- TODO confirm against helper_cuda.h
59 int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
60 * deviceProp.multiProcessorCount;
// keep the smaller of the current option count and half the core count
64 nOptions = (nOptions < cudaCores/2 ? nOptions : cudaCores/2);
73 ///////////////////////////////////////////////////////////////////////////////
74 // CPU reference functions
75 ///////////////////////////////////////////////////////////////////////////////
// Forward declaration (C linkage) of the CPU Monte Carlo reference routine;
// the definition is not in this file.
// callValue is taken by reference and optionData by value.
// NOTE(review): the remainder of the parameter list is cut off in this
// listing -- consult the defining source for the full signature.
76 extern "C" void MonteCarloCPU(
77 TOptionValue &callValue,
78 TOptionData optionData,
// Forward declaration (C linkage); the definition is not in this file.
// main() calls it as BlackScholesCall(callValueBS[i], optionData[i]), i.e. a
// float result slot followed by the option description.
// NOTE(review): the first parameter's declaration is not visible in this
// listing -- confirm it is a float reference.
83 //Black-Scholes formula for call options
84 extern "C" void BlackScholesCall(
86 TOptionData optionData
90 ////////////////////////////////////////////////////////////////////////////////
91 // GPU-driving host thread
92 ////////////////////////////////////////////////////////////////////////////////
// Timer handles shared by the solver routines in this file.
// main() allocates one entry per GPU; solverThread() uses
// hTimer[plan->device] and multiSolver() uses hTimer[0].
94 StopWatchInterface **hTimer = NULL;
// Host thread routine that drives a single GPU for one option plan.
//
// Parameters:
//   plan - plan to process; plan->device selects both the CUDA device and the
//          hTimer entry used for timing.
//
// Visible sequence: select the device, start the timer, initialise the
// Monte Carlo state, wait for the device, stop the timer, release the
// Monte Carlo state and print a completion message.
//
// NOTE(review): no call that launches the actual computation is visible
// between initMonteCarloGPU() and cudaDeviceSynchronize() in this listing,
// and no thread-exit statement is visible either -- confirm against the
// complete file.
96 static CUT_THREADPROC solverThread(TOptionPlan *plan)
99 checkCudaErrors(cudaSetDevice(plan->device));
// deviceProp is only used for the device name in the final message
101 cudaDeviceProp deviceProp;
102 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));
// the timed region starts before initialisation, so it includes setup cost
105 sdkStartTimer(&hTimer[plan->device]);
107 // Allocate intermediate memory for MC integrator and initialize
109 initMonteCarloGPU(plan);
// block until all work queued on this device has finished before stopping the timer
114 checkCudaErrors(cudaDeviceSynchronize());
117 sdkStopTimer(&hTimer[plan->device]);
120 closeMonteCarloGPU(plan);
// NOTE(review): unlike the other CUDA calls here, this return value is not
// passed through checkCudaErrors()
122 cudaStreamSynchronize(0);
124 printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name);
129 static void multiSolver(TOptionPlan *plan, int nPlans)
132 // allocate and initialize an array of stream handles
133 cudaStream_t *streams = (cudaStream_t *) malloc(nPlans * sizeof(cudaStream_t));
134 cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t));
136 for (int i = 0; i < nPlans; i++)
138 checkCudaErrors(cudaSetDevice(plan[i].device));
139 checkCudaErrors(cudaStreamCreate(&(streams[i])));
140 checkCudaErrors(cudaEventCreate(&(events[i])));
144 // In CUDA 4.0 we can call cudaSetDevice multiple times to target each device
145 // Set the device desired, then perform initializations on that device
147 for (int i=0 ; i<nPlans ; i++)
149 // set the target device to perform initialization on
150 checkCudaErrors(cudaSetDevice(plan[i].device));
152 cudaDeviceProp deviceProp;
153 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device));
155 // Allocate intermediate memory for MC integrator
156 // and initialize RNG state
157 initMonteCarloGPU(&plan[i]);
161 sdkResetTimer(&hTimer[0]);
162 sdkStartTimer(&hTimer[0]);
164 for (int i=0; i<nPlans; i++)
166 checkCudaErrors(cudaSetDevice(plan[i].device));
169 MonteCarloGPU(&plan[i], streams[i]);
171 checkCudaErrors(cudaEventRecord(events[i]));
174 for (int i=0; i<nPlans; i++)
176 checkCudaErrors(cudaSetDevice(plan[i].device));
177 cudaEventSynchronize(events[i]);
181 sdkStopTimer(&hTimer[0]);
183 for (int i=0 ; i<nPlans ; i++)
185 checkCudaErrors(cudaSetDevice(plan[i].device));
186 closeMonteCarloGPU(&plan[i]);
187 checkCudaErrors(cudaStreamDestroy(streams[i]));
188 checkCudaErrors(cudaEventDestroy(events[i]));
194 ///////////////////////////////////////////////////////////////////////////////
196 ///////////////////////////////////////////////////////////////////////////////
// Compile-time switch.
// NOTE(review): no #ifdef that tests this macro is visible in this listing;
// presumably it gates the per-option result printf calls in main() -- confirm,
// and confirm it is still defined at the point of use.
200 #define PRINT_RESULTS
// Help text describing the --method, --scaling and --help options.
// The defaults stated here are "threaded" and "weak"; main() initialises
// use_threads to true and strongScaling to false accordingly.
// NOTE(review): the header of the enclosing function is not visible in this
// listing.
206 printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n");
207 printf("Method=threaded: 1 CPU thread for each GPU [default]\n");
208 printf(" streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or newer)\n");
209 printf("Scaling=strong : constant problem size\n");
210 printf(" weak : problem size scales with number of available GPUs [default]\n");
// Program entry point.
//
// Visible flow: parse the command line (qatest, method, scaling, h/help),
// size the problem from the device count, generate random option data, split
// the options across the GPUs, run the threaded and/or streamed GPU solver,
// compare the GPU results with Black-Scholes and with a CPU Monte Carlo run,
// release host arrays and exit with EXIT_SUCCESS or EXIT_FAILURE.
//
// Parameters:
//   argc, argv - standard command-line arguments, forwarded to
//                checkCmdLineFlag() and getCmdLineArgumentString().
//
// NOTE(review): this listing omits some lines. For example, no declaration of
// nOptions, GPU_N, i or time is visible, nor the resets/updates of sumDelta
// and sumRef, nor the call that fills callValueCPU. Confirm against the
// complete file before changing any logic here.
214 int main(int argc, char **argv)
// Command-line selections and their defaults.
216 char *multiMethodChoice = NULL;
217 char *scalingChoice = NULL;
218 bool use_threads = true;
219 bool bqatest = false;
220 bool strongScaling = false;
225 printf("%s Starting...\n\n", argv[0]);
// NOTE(review): the body of this branch is not visible; presumably it sets
// bqatest -- confirm.
227 if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
232 getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice);
233 getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice);
235 if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
236 checkCmdLineFlag(argc, (const char **)argv, "help"))
// Method selection; the branch bodies are not visible in this listing.
242 if (multiMethodChoice == NULL)
// strcasecmp() returns 0 on a match, hence the negation
248 if (!strcasecmp(multiMethodChoice, "threaded"))
258 if (use_threads == false)
260 printf("Using single CPU thread for multiple GPUs\n");
// Scaling selection: strongScaling becomes true only for "strong"; a missing
// or any other value leaves it false.
263 if (scalingChoice == NULL)
265 strongScaling = false;
269 if (!strcasecmp(scalingChoice, "strong"))
271 strongScaling = true;
275 strongScaling = false;
280 //GPU number present in the system
282 checkCudaErrors(cudaGetDeviceCount(&GPU_N));
285 nOptions = adjustProblemSize(GPU_N, nOptions);
// Strong scaling keeps the option count fixed; otherwise it is multiplied by
// the number of GPUs.
287 // select problem size
288 int scale = (strongScaling) ? 1 : GPU_N;
289 int OPT_N = nOptions * scale;
291 const unsigned long long SEED = 777;
// One timer per GPU; the streamed path only uses entry 0.
293 // initialize the timers
294 hTimer = new StopWatchInterface*[GPU_N];
296 for (int i=0; i<GPU_N; i++)
298 sdkCreateTimer(&hTimer[i]);
299 sdkResetTimer(&hTimer[i]);
// Host arrays: inputs and results sized by option count, plans and thread
// handles sized by GPU count.
303 TOptionData *optionData = new TOptionData[OPT_N];
304 //Final GPU MC results
305 TOptionValue *callValueGPU = new TOptionValue[OPT_N];
306 //"Theoretical" call values by Black-Scholes formula
307 float *callValueBS = new float[OPT_N];
309 TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
311 CUTThread *threadID = new CUTThread[GPU_N];
313 int gpuBase, gpuIndex;
318 double delta, ref, sumDelta, sumRef, sumReserve;
320 printf("MonteCarloMultiGPU\n");
321 printf("==================\n");
322 printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed");
323 printf("Problem scaling = %s\n", strongScaling? "strong" : "weak");
324 printf("Number of GPUs = %d\n", GPU_N);
325 printf("Total number of options = %d\n", OPT_N);
326 printf("Number of paths = %d\n", PATH_N);
329 printf("main(): generating input data...\n");
// S, X and T are drawn at random per option; R and V are fixed. The GPU
// result slots are pre-filled with -1.0f.
332 for (i=0; i < OPT_N; i++)
334 optionData[i].S = randFloat(5.0f, 50.0f);
335 optionData[i].X = randFloat(10.0f, 25.0f);
336 optionData[i].T = randFloat(1.0f, 5.0f);
337 optionData[i].R = 0.06f;
338 optionData[i].V = 0.10f;
339 callValueGPU[i].Expected = -1.0f;
340 callValueGPU[i].Confidence = -1.0f;
343 printf("main(): starting %i host threads...\n", GPU_N);
// Even split of the options; the first (OPT_N % GPU_N) plans take one extra.
346 //Get option count for each GPU
347 for (i = 0; i < GPU_N; i++)
349 optionSolver[i].optionCount = OPT_N / GPU_N;
352 //Take into account cases with "odd" option counts
353 for (i = 0; i < (OPT_N % GPU_N); i++)
355 optionSolver[i].optionCount++;
// Each plan gets a contiguous slice of the input/result arrays starting at
// gpuBase, which advances by the plan's option count.
// NOTE(review): the initial assignment of gpuBase is not visible here.
358 //Assign GPU option ranges
361 for (i = 0; i < GPU_N; i++)
363 optionSolver[i].device = i;
364 optionSolver[i].optionData = optionData + gpuBase;
365 optionSolver[i].callValue = callValueGPU + gpuBase;
// NOTE(review): the comment below mentions a per-device offset, but only the
// shared SEED is assigned in the visible code -- confirm where the offset is
// applied.
366 // all devices use the same global seed, but start
367 // the sequence at a different offset
368 optionSolver[i].seed = SEED;
369 optionSolver[i].pathN = PATH_N;
370 gpuBase += optionSolver[i].optionCount;
// Threaded path: also taken when bqatest is set, in which case the streamed
// path further down runs as well.
374 if (use_threads || bqatest)
376 //Start CPU thread for each GPU
377 for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
379 threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);
382 printf("main(): waiting for GPU results...\n");
383 cutWaitForThreads(threadID, GPU_N);
385 printf("main(): GPU statistics, threaded\n");
387 for (i = 0; i < GPU_N; i++)
389 cudaDeviceProp deviceProp;
390 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
391 printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
392 printf("Options : %i\n", optionSolver[i].optionCount);
393 printf("Simulation paths: %i\n", optionSolver[i].pathN);
394 time = sdkGetTimerValue(&hTimer[i]);
395 printf("Total time (ms.): %f\n", time);
// NOTE(review): this divides the total OPT_N by one device's time rather than
// that device's optionCount -- confirm this is the intended metric.
396 printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
399 printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
// delta is the absolute difference between the Black-Scholes value and the
// GPU estimate for option i.
404 for (i = 0; i < OPT_N; i++)
406 BlackScholesCall(callValueBS[i], optionData[i]);
407 delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
408 ref = callValueBS[i];
// NOTE(review): any guard against delta == 0 is not visible in this listing.
414 sumReserve += callValueGPU[i].Confidence / delta;
418 printf("BS: %f; delta: %E\n", callValueBS[i], delta);
// Streamed path: one host thread drives all GPUs through multiSolver().
426 if (!use_threads || bqatest)
428 multiSolver(optionSolver, GPU_N);
430 printf("main(): GPU statistics, streamed\n");
432 for (i = 0; i < GPU_N; i++)
434 cudaDeviceProp deviceProp;
435 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
436 printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
437 printf("Options : %i\n", optionSolver[i].optionCount);
438 printf("Simulation paths: %i\n", optionSolver[i].pathN);
// multiSolver() times all devices together on hTimer[0].
441 time = sdkGetTimerValue(&hTimer[0]);
442 printf("\nTotal time (ms.): %f\n", time);
443 printf("\tNote: This is elapsed time for all to compute.\n");
444 printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
446 printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
// Same comparison as in the threaded path.
451 for (i = 0; i < OPT_N; i++)
453 BlackScholesCall(callValueBS[i], optionData[i]);
454 delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
455 ref = callValueBS[i];
461 sumReserve += callValueGPU[i].Confidence / delta;
465 printf("BS: %f; delta: %E\n", callValueBS[i], delta);
// CPU reference comparison.
// NOTE(review): the call that fills callValueCPU is not visible here.
473 printf("main(): running CPU MonteCarlo...\n");
474 TOptionValue callValueCPU;
478 for (i = 0; i < OPT_N; i++)
486 delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
487 ref = callValueCPU.Expected;
490 printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected);
491 printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
494 printf("L1 norm: %E\n", sumDelta / sumRef);
497 printf("Shutting down...\n");
// NOTE(review): this loop starts each timer during shutdown; a timer delete
// call may have been intended -- confirm.
499 for (int i=0; i<GPU_N; i++)
501 sdkStartTimer(&hTimer[i]);
502 checkCudaErrors(cudaSetDevice(i));
// NOTE(review): releases for optionData, threadID and hTimer are not visible
// in this listing -- confirm they are freed.
506 delete[] optionSolver;
507 delete[] callValueBS;
508 delete[] callValueGPU;
513 printf("Test Summary...\n");
514 printf("L1 norm : %E\n", sumDelta / sumRef);
515 printf("Average reserve: %f\n", sumReserve);
// Pass criterion: sumReserve > 1.0f. The selected message is passed as the
// printf format argument; both candidates are literals without conversion
// specifiers.
516 printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
517 exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);