utiltools/X86MAC64/cuda/samples/4_Finance/MonteCarloMultiGPU/MonteCarloMultiGPU.cpp

   1 /**
   2  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5  * with this source code for terms and conditions that govern your use of
   6  * this software. Any use, reproduction, disclosure, or distribution of
   7  * this software and related documentation outside the terms of the EULA
   8  * is strictly prohibited.
   9  *
  10  */
  11
  12 /*
  13  * This sample evaluates fair call price for a
  14  * given set of European options using Monte Carlo approach.
  15  * See supplied whitepaper for more explanations.
  16  */
  17
  18
  19
  20 #include <stdlib.h>
  21 #include <stdio.h>
  22 #include <string.h>
  23 #include <math.h>
  24 #include <cuda_runtime.h>
  25
  26 // includes, project
  27 #include <helper_functions.h> // Helper functions (utilities, parsing, timing)
  28 #include <helper_cuda.h>      // helper functions (cuda error checking and intialization)
  29 #include <multithreading.h>
  30
  31 #include "MonteCarlo_common.h"
  32
  33 int *pArgc = NULL;
  34 char **pArgv = NULL;
  35
  36 #ifdef WIN32
  37 #define strcasecmp _strcmpi
  38 #endif
  39
  40 ////////////////////////////////////////////////////////////////////////////////
  41 // Common functions
  42 ////////////////////////////////////////////////////////////////////////////////
  43 float randFloat(float low, float high)
  44 {
  45     float t = (float)rand() / (float)RAND_MAX;
  46     return (1.0f - t) * low + t * high;
  47 }
  48
  49 /// Utility function to tweak problem size for small GPUs
  50 int adjustProblemSize(int GPU_N, int default_nOptions)
  51 {
  52     int nOptions = default_nOptions;
  53
  54     // select problem size
  55     for (int i=0; i<GPU_N; i++)
  56     {
  57         cudaDeviceProp deviceProp;
  58         checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
  59         int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
  60                         * deviceProp.multiProcessorCount;
  61
  62         if (cudaCores <= 32)
  63         {
  64             nOptions = (nOptions < cudaCores/2 ? nOptions : cudaCores/2);
  65         }
  66     }
  67
  68     return nOptions;
  69 }
  70
  71
  72
  73 ///////////////////////////////////////////////////////////////////////////////
  74 // CPU reference functions
  75 ///////////////////////////////////////////////////////////////////////////////
  76 extern "C" void MonteCarloCPU(
  77     TOptionValue   &callValue,
  78     TOptionData optionData,
  79     float *h_Random,
  80     int pathN
  81 );
  82
  83 //Black-Scholes formula for call options
  84 extern "C" void BlackScholesCall(
  85     float &CallResult,
  86     TOptionData optionData
  87 );
  88
  89
  90 ////////////////////////////////////////////////////////////////////////////////
  91 // GPU-driving host thread
  92 ////////////////////////////////////////////////////////////////////////////////
  93 //Timer
  94 StopWatchInterface **hTimer = NULL;
  95
  96 static CUT_THREADPROC solverThread(TOptionPlan *plan)
  97 {
  98     //Init GPU
  99     checkCudaErrors(cudaSetDevice(plan->device));
 100
 101     cudaDeviceProp deviceProp;
 102     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));
 103
 104     //Start the timer
 105     sdkStartTimer(&hTimer[plan->device]);
 106
 107     // Allocate intermediate memory for MC integrator and initialize
 108     // RNG states
 109     initMonteCarloGPU(plan);
 110
 111     // Main commputation
 112     MonteCarloGPU(plan);
 113
 114     checkCudaErrors(cudaDeviceSynchronize());
 115
 116     //Stop the timer
 117     sdkStopTimer(&hTimer[plan->device]);
 118
 119     //Shut down this GPU
 120     closeMonteCarloGPU(plan);
 121
 122     cudaStreamSynchronize(0);
 123
 124     printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name);
 125     cudaDeviceReset();
 126     CUT_THREADEND;
 127 }
 128
 129 static void multiSolver(TOptionPlan *plan, int nPlans)
 130 {
 131
 132     // allocate and initialize an array of stream handles
 133     cudaStream_t *streams = (cudaStream_t *) malloc(nPlans * sizeof(cudaStream_t));
 134     cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t));
 135
 136     for (int i = 0; i < nPlans; i++)
 137     {
 138         checkCudaErrors(cudaSetDevice(plan[i].device));
 139         checkCudaErrors(cudaStreamCreate(&(streams[i])));
 140         checkCudaErrors(cudaEventCreate(&(events[i])));
 141     }
 142
 143     //Init Each GPU
 144     // In CUDA 4.0 we can call cudaSetDevice multiple times to target each device
 145     // Set the device desired, then perform initializations on that device
 146
 147     for (int i=0 ; i<nPlans ; i++)
 148     {
 149         // set the target device to perform initialization on
 150         checkCudaErrors(cudaSetDevice(plan[i].device));
 151
 152         cudaDeviceProp deviceProp;
 153         checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device));
 154
 155         // Allocate intermediate memory for MC integrator
 156         // and initialize RNG state
 157         initMonteCarloGPU(&plan[i]);
 158     }
 159
 160     //Start the timer
 161     sdkResetTimer(&hTimer[0]);
 162     sdkStartTimer(&hTimer[0]);
 163
 164     for (int i=0; i<nPlans; i++)
 165     {
 166         checkCudaErrors(cudaSetDevice(plan[i].device));
 167
 168         //Main computations
 169         MonteCarloGPU(&plan[i], streams[i]);
 170
 171         checkCudaErrors(cudaEventRecord(events[i]));
 172     }
 173
 174     for (int i=0; i<nPlans; i++)
 175     {
 176         checkCudaErrors(cudaSetDevice(plan[i].device));
 177         cudaEventSynchronize(events[i]);
 178     }
 179
 180     //Stop the timer
 181     sdkStopTimer(&hTimer[0]);
 182
 183     for (int i=0 ; i<nPlans ; i++)
 184     {
 185         checkCudaErrors(cudaSetDevice(plan[i].device));
 186         closeMonteCarloGPU(&plan[i]);
 187         checkCudaErrors(cudaStreamDestroy(streams[i]));
 188         checkCudaErrors(cudaEventDestroy(events[i]));
 189     }
 190 }
 191
 192
 193
 194 ///////////////////////////////////////////////////////////////////////////////
 195 // Main program
 196 ///////////////////////////////////////////////////////////////////////////////
 197 #define DO_CPU
 198 #undef DO_CPU
 199
 200 #define PRINT_RESULTS
 201 #undef PRINT_RESULTS
 202
 203
 204 void usage()
 205 {
 206     printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n");
 207     printf("Method=threaded: 1 CPU thread for each GPU     [default]\n");
 208     printf("       streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or newer)\n");
 209     printf("Scaling=strong : constant problem size\n");
 210     printf("        weak   : problem size scales with number of available GPUs [default]\n");
 211 }
 212
 213
 214 int main(int argc, char **argv)
 215 {
 216     char *multiMethodChoice = NULL;
 217     char *scalingChoice = NULL;
 218     bool use_threads = true;
 219     bool bqatest = false;
 220     bool strongScaling = false;
 221
 222     pArgc = &argc;
 223     pArgv = argv;
 224
 225     printf("%s Starting...\n\n", argv[0]);
 226
 227     if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
 228     {
 229         bqatest = true;
 230     }
 231
 232     getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice);
 233     getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice);
 234
 235     if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
 236         checkCmdLineFlag(argc, (const char **)argv, "help"))
 237     {
 238         usage();
 239         exit(EXIT_SUCCESS);
 240     }
 241
 242     if (multiMethodChoice == NULL)
 243     {
 244         use_threads = true;
 245     }
 246     else
 247     {
 248         if (!strcasecmp(multiMethodChoice, "threaded"))
 249         {
 250             use_threads = true;
 251         }
 252         else
 253         {
 254             use_threads = false;
 255         }
 256     }
 257
 258     if (use_threads == false)
 259     {
 260         printf("Using single CPU thread for multiple GPUs\n");
 261     }
 262
 263     if (scalingChoice == NULL)
 264     {
 265         strongScaling = false;
 266     }
 267     else
 268     {
 269         if (!strcasecmp(scalingChoice, "strong"))
 270         {
 271             strongScaling = true;
 272         }
 273         else
 274         {
 275             strongScaling = false;
 276         }
 277     }
 278
 279
 280     //GPU number present in the system
 281     int GPU_N;
 282     checkCudaErrors(cudaGetDeviceCount(&GPU_N));
 283     int nOptions = 256;
 284
 285     nOptions = adjustProblemSize(GPU_N, nOptions);
 286
 287     // select problem size
 288     int scale = (strongScaling) ? 1 : GPU_N;
 289     int OPT_N = nOptions * scale;
 290     int PATH_N = 262144;
 291     const unsigned long long SEED = 777;
 292
 293     // initialize the timers
 294     hTimer = new StopWatchInterface*[GPU_N];
 295
 296     for (int i=0; i<GPU_N; i++)
 297     {
 298         sdkCreateTimer(&hTimer[i]);
 299         sdkResetTimer(&hTimer[i]);
 300     }
 301
 302     //Input data array
 303     TOptionData  *optionData   = new TOptionData[OPT_N];
 304     //Final GPU MC results
 305     TOptionValue *callValueGPU = new TOptionValue[OPT_N];
 306     //"Theoretical" call values by Black-Scholes formula
 307     float *callValueBS = new float[OPT_N];
 308     //Solver config
 309     TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
 310     //OS thread ID
 311     CUTThread *threadID = new CUTThread[GPU_N];
 312
 313     int gpuBase, gpuIndex;
 314     int i;
 315
 316     float time;
 317
 318     double delta, ref, sumDelta, sumRef, sumReserve;
 319
 320     printf("MonteCarloMultiGPU\n");
 321     printf("==================\n");
 322     printf("Parallelization method  = %s\n", use_threads ? "threaded" : "streamed");
 323     printf("Problem scaling         = %s\n", strongScaling? "strong" : "weak");
 324     printf("Number of GPUs          = %d\n", GPU_N);
 325     printf("Total number of options = %d\n", OPT_N);
 326     printf("Number of paths         = %d\n", PATH_N);
 327
 328
 329     printf("main(): generating input data...\n");
 330     srand(123);
 331
 332     for (i=0; i < OPT_N; i++)
 333     {
 334         optionData[i].S = randFloat(5.0f, 50.0f);
 335         optionData[i].X = randFloat(10.0f, 25.0f);
 336         optionData[i].T = randFloat(1.0f, 5.0f);
 337         optionData[i].R = 0.06f;
 338         optionData[i].V = 0.10f;
 339         callValueGPU[i].Expected   = -1.0f;
 340         callValueGPU[i].Confidence = -1.0f;
 341     }
 342
 343     printf("main(): starting %i host threads...\n", GPU_N);
 344
 345
 346     //Get option count for each GPU
 347     for (i = 0; i < GPU_N; i++)
 348     {
 349         optionSolver[i].optionCount = OPT_N / GPU_N;
 350     }
 351
 352     //Take into account cases with "odd" option counts
 353     for (i = 0; i < (OPT_N % GPU_N); i++)
 354     {
 355         optionSolver[i].optionCount++;
 356     }
 357
 358     //Assign GPU option ranges
 359     gpuBase = 0;
 360
 361     for (i = 0; i < GPU_N; i++)
 362     {
 363         optionSolver[i].device     = i;
 364         optionSolver[i].optionData = optionData   + gpuBase;
 365         optionSolver[i].callValue  = callValueGPU + gpuBase;
 366         // all devices use the same global seed, but start
 367         // the sequence at a different offset
 368         optionSolver[i].seed       = SEED;
 369         optionSolver[i].pathN      = PATH_N;
 370         gpuBase += optionSolver[i].optionCount;
 371     }
 372
 373
 374     if (use_threads || bqatest)
 375     {
 376         //Start CPU thread for each GPU
 377         for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
 378         {
 379             threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);
 380         }
 381
 382         printf("main(): waiting for GPU results...\n");
 383         cutWaitForThreads(threadID, GPU_N);
 384
 385         printf("main(): GPU statistics, threaded\n");
 386
 387         for (i = 0; i < GPU_N; i++)
 388         {
 389             cudaDeviceProp deviceProp;
 390             checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
 391             printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
 392             printf("Options         : %i\n", optionSolver[i].optionCount);
 393             printf("Simulation paths: %i\n", optionSolver[i].pathN);
 394             time = sdkGetTimerValue(&hTimer[i]);
 395             printf("Total time (ms.): %f\n", time);
 396             printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
 397         }
 398
 399         printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
 400         sumDelta   = 0;
 401         sumRef     = 0;
 402         sumReserve = 0;
 403
 404         for (i = 0; i < OPT_N; i++)
 405         {
 406             BlackScholesCall(callValueBS[i], optionData[i]);
 407             delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
 408             ref       = callValueBS[i];
 409             sumDelta += delta;
 410             sumRef   += fabs(ref);
 411
 412             if (delta > 1e-6)
 413             {
 414                 sumReserve += callValueGPU[i].Confidence / delta;
 415             }
 416
 417 #ifdef PRINT_RESULTS
 418             printf("BS: %f; delta: %E\n", callValueBS[i], delta);
 419 #endif
 420
 421         }
 422
 423         sumReserve /= OPT_N;
 424     }
 425
 426     if (!use_threads || bqatest)
 427     {
 428         multiSolver(optionSolver, GPU_N);
 429
 430         printf("main(): GPU statistics, streamed\n");
 431
 432         for (i = 0; i < GPU_N; i++)
 433         {
 434             cudaDeviceProp deviceProp;
 435             checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
 436             printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
 437             printf("Options         : %i\n", optionSolver[i].optionCount);
 438             printf("Simulation paths: %i\n", optionSolver[i].pathN);
 439         }
 440
 441         time = sdkGetTimerValue(&hTimer[0]);
 442         printf("\nTotal time (ms.): %f\n", time);
 443         printf("\tNote: This is elapsed time for all to compute.\n");
 444         printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
 445
 446         printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
 447         sumDelta   = 0;
 448         sumRef     = 0;
 449         sumReserve = 0;
 450
 451         for (i = 0; i < OPT_N; i++)
 452         {
 453             BlackScholesCall(callValueBS[i], optionData[i]);
 454             delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
 455             ref       = callValueBS[i];
 456             sumDelta += delta;
 457             sumRef   += fabs(ref);
 458
 459             if (delta > 1e-6)
 460             {
 461                 sumReserve += callValueGPU[i].Confidence / delta;
 462             }
 463
 464 #ifdef PRINT_RESULTS
 465             printf("BS: %f; delta: %E\n", callValueBS[i], delta);
 466 #endif
 467         }
 468
 469         sumReserve /= OPT_N;
 470     }
 471
 472 #ifdef DO_CPU
 473     printf("main(): running CPU MonteCarlo...\n");
 474     TOptionValue callValueCPU;
 475     sumDelta = 0;
 476     sumRef   = 0;
 477
 478     for (i = 0; i < OPT_N; i++)
 479     {
 480         MonteCarloCPU(
 481             callValueCPU,
 482             optionData[i],
 483             NULL,
 484             PATH_N
 485         );
 486         delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
 487         ref       = callValueCPU.Expected;
 488         sumDelta += delta;
 489         sumRef   += fabs(ref);
 490         printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
 491         printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
 492     }
 493
 494     printf("L1 norm: %E\n", sumDelta / sumRef);
 495 #endif
 496
 497     printf("Shutting down...\n");
 498
 499     for (int i=0; i<GPU_N; i++)
 500     {
 501         sdkStartTimer(&hTimer[i]);
 502         checkCudaErrors(cudaSetDevice(i));
 503         cudaDeviceReset();
 504     }
 505
 506     delete[] optionSolver;
 507     delete[] callValueBS;
 508     delete[] callValueGPU;
 509     delete[] optionData;
 510     delete[] threadID;
 511     delete[] hTimer;
 512
 513     printf("Test Summary...\n");
 514     printf("L1 norm        : %E\n", sumDelta / sumRef);
 515     printf("Average reserve: %f\n", sumReserve);
 516     printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
 517     exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);
 518 }