utiltools/X86MAC64/cuda/samples/0_Simple/simpleMPI/simpleMPI.cu

   1 /*
   2  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5  * with this source code for terms and conditions that govern your use of
   6  * this software. Any use, reproduction, disclosure, or distribution of
   7  * this software and related documentation outside the terms of the EULA
   8  * is strictly prohibited.
   9  *
  10  */
  11
  12
  13 /* Simple example demonstrating how to use MPI with CUDA
  14 *
  15 *  Generate some random numbers on one node.
  16 *  Dispatch them to all nodes.
  17 *  Compute their square root on each node's GPU.
  18 *  Compute the average of the results using MPI.
  19 *
  20 *  simpleMPI.cu: GPU part, compiled with nvcc
  21 */
  22
  23 #include <iostream>
  24 using std::cerr;
  25 using std::endl;
  26
  27 #include "simpleMPI.h"
  28
  29 // Error handling macro
  30 #define CUDA_CHECK(call) \
  31     if((call) != cudaSuccess) { \
  32         cudaError_t err = cudaGetLastError(); \
  33         cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
  34         my_abort(err); }
  35
  36
  37 // Device code
  38 // Very simple GPU Kernel that computes square roots of input numbers
  39 __global__ void simpleMPIKernel(float *input, float *output)
  40 {
  41     int tid = blockIdx.x * blockDim.x + threadIdx.x;
  42     output[tid] = sqrt(input[tid]);
  43 }
  44
  45
  46 // Initialize an array with random data (between 0 and 1)
  47 void initData(float *data, int dataSize)
  48 {
  49     for (int i = 0; i < dataSize; i++)
  50     {
  51         data[i] = (float)rand() / RAND_MAX;
  52     }
  53 }
  54
  55 // CUDA computation on each node
  56 // No MPI here, only CUDA
  57 void computeGPU(float *hostData, int blockSize, int gridSize)
  58 {
  59     int dataSize = blockSize * gridSize;
  60
  61     // Allocate data on GPU memory
  62     float *deviceInputData = NULL;
  63     CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
  64
  65     float *deviceOutputData = NULL;
  66     CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
  67
  68     // Copy to GPU memory
  69     CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));
  70
  71     // Run kernel
  72     simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
  73
  74     // Copy data back to CPU memory
  75     CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize *sizeof(float), cudaMemcpyDeviceToHost));
  76
  77     // Free GPU memory
  78     CUDA_CHECK(cudaFree(deviceInputData));
  79     CUDA_CHECK(cudaFree(deviceOutputData));
  80 }
  81
  82 float sum(float *data, int size)
  83 {
  84     float accum = 0.f;
  85
  86     for (int i = 0; i < size; i++)
  87     {
  88         accum += data[i];
  89     }
  90
  91     return accum;
  92 }