/*
 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
/* Simple example demonstrating how to use MPI with CUDA
 *
 *  Generate some random numbers on one node.
 *  Dispatch them to all nodes.
 *  Compute their square root on each node's GPU.
 *  Compute the average of the results using MPI.
 *
 *  simpleMPI.cu: GPU part, compiled with nvcc
 */
27 #include "simpleMPI.h"
// Error handling macro
//
// Evaluates `call` exactly once. If the returned status is not cudaSuccess,
// prints the failing expression together with the status code and its
// textual description to stderr, then terminates the process.
//
// The status tested is the one returned by `call` itself; it is not re-read
// through cudaGetLastError(), which could report a different (or already
// cleared) error than the one that actually failed.
//
// The do { ... } while (0) wrapper makes the macro behave like a single
// statement, so `if (x) CUDA_CHECK(...); else ...` parses as intended.
//
// NOTE(review): the tail of the original macro is not visible in this
// listing, so the termination step is an assumption. In an MPI job an
// MPI_Abort-based handler may be preferable so that the other ranks are
// torn down as well -- confirm against the project's abort helper.
#define CUDA_CHECK(call)                                                     \
    do                                                                       \
    {                                                                        \
        const cudaError_t cudaCheckStatus_ = (call);                         \
        if (cudaCheckStatus_ != cudaSuccess)                                 \
        {                                                                    \
            std::cerr << "CUDA error calling \"" #call "\", code is "        \
                      << cudaCheckStatus_ << " ("                            \
                      << cudaGetErrorString(cudaCheckStatus_) << ")"         \
                      << std::endl;                                          \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
// Very simple GPU kernel: element-wise square root of the input numbers.
//
// Each thread handles exactly one element, located at the flat index built
// from its block and thread coordinates along x. There is no bounds check,
// so both arrays must hold at least gridDim.x * blockDim.x floats.
__global__ void simpleMPIKernel(float *input, float *output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const float value = input[globalIdx];
    output[globalIdx] = sqrtf(value);
}
// Fill the first dataSize entries of data with pseudo-random values in
// [0, 1], drawn in index order from rand(). Does nothing when dataSize is
// zero or negative. The sequence depends on the current rand() seed.
void initData(float *data, int dataSize)
{
    // Largest value rand() can return, as a float divisor.
    const float scale = static_cast<float>(RAND_MAX);

    int idx = 0;
    while (idx < dataSize)
    {
        const float sample = static_cast<float>(rand());
        data[idx] = sample / scale;
        ++idx;
    }
}
// CUDA computation on each node
// No MPI here, only CUDA
//
// Replaces, in place, the first blockSize * gridSize floats of hostData with
// their square roots. The work is done by simpleMPIKernel, launched with
// gridSize blocks of blockSize threads (one thread per element), on
// temporary device buffers that are allocated and freed inside this call.
//
//   hostData  - host array of at least blockSize * gridSize floats;
//               read as input and overwritten with the results
//   blockSize - threads per block used for the launch
//   gridSize  - number of blocks used for the launch
//
// Every CUDA runtime call, and the kernel launch itself, is checked through
// CUDA_CHECK.
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    // One element per launched thread. The product is formed in size_t so a
    // large launch cannot overflow int before the byte count is computed.
    const size_t dataSize = static_cast<size_t>(blockSize) * static_cast<size_t>(gridSize);

    // A launch with zero blocks or zero threads is an invalid configuration,
    // and there is nothing to compute anyway.
    if (dataSize == 0)
    {
        return;
    }

    const size_t dataBytes = dataSize * sizeof(float);

    // Allocate data on GPU memory
    float *deviceInputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataBytes));

    float *deviceOutputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataBytes));

    // Copy to GPU memory
    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataBytes, cudaMemcpyHostToDevice));

    // Run kernel
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    // A kernel launch returns no status, so a rejected launch configuration
    // would otherwise go unnoticed and stale device memory would be copied
    // back. Peek (rather than get) so the recorded error is not cleared
    // before the error handler has a chance to read it.
    CUDA_CHECK(cudaPeekAtLastError());

    // Copy data back to CPU memory (this blocking copy also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataBytes, cudaMemcpyDeviceToHost));

    // Free GPU memory
    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}
82 float sum(float *data, int size)
86 for (int i = 0; i < size; i++)