////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------------------
// Transpose
//
// This file contains both device and host code for transposing a floating-point
// matrix. It performs several transpose kernels, which incrementally improve performance
// through coalescing, removing shared memory bank conflicts, and eliminating partition
// camping. Several of the kernels perform a copy, used to represent the best-case
// performance that a transpose can achieve.
//
// Please see the whitepaper in the docs folder of the transpose project for a detailed
// description of this performance study.
// ----------------------------------------------------------------------------------------

// Utilities and system includes
#include <helper_string.h>  // helper for string parsing
#include <helper_image.h>   // helper for image and data comparison
#include <helper_cuda.h>    // helper for CUDA error checking functions

const char *sSDKsample = "Transpose";

// Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements
// using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of BLOCK_ROWS

#define TILE_DIM   16
#define BLOCK_ROWS 16
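
// Note: with the default TILE_DIM = 16 and BLOCK_ROWS = 16 above, each thread
// handles TILE_DIM/BLOCK_ROWS = 1 element and the per-kernel loops below execute
// exactly once. As an illustration (not a configuration used by this sample),
// lowering BLOCK_ROWS to 8 would keep the 16x16 tile but launch 16x8 thread
// blocks, with each thread striding through 2 rows of the tile.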

// This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
int MATRIX_SIZE_X = 1024;
int MATRIX_SIZE_Y = 1024;
int MUL_FACTOR    = TILE_DIM;

#define FLOOR(a,b) (a-(a%b))
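
// FLOOR(a,b) rounds a down to the nearest multiple of b. Worked examples
// (illustrative values): FLOOR(1024,512) = 1024 - (1024 % 512) = 1024, and
// FLOOR(1000,384) = 1000 - (1000 % 384) = 768.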

// Compute the tile size necessary to illustrate performance cases for SM12+ hardware
int MAX_TILES_SM12 = (FLOOR(MATRIX_SIZE_X,512) * FLOOR(MATRIX_SIZE_Y,512)) / (TILE_DIM * TILE_DIM);
// Compute the tile size necessary to illustrate performance cases for SM10,SM11 hardware
int MAX_TILES_SM10 = (FLOOR(MATRIX_SIZE_X,384) * FLOOR(MATRIX_SIZE_Y,384)) / (TILE_DIM * TILE_DIM);

// Number of repetitions used for timing. Two sets of repetitions are performed:
// 1) over kernel launches and 2) inside the kernel over just the loads and stores

#define NUM_REPS 100

// -------------------------------------------------------
// Copies
// width and height must be integral multiples of TILE_DIM
// -------------------------------------------------------

__global__ void copy(float *odata, float *idata, int width, int height)
{
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index  = xIndex + width*yIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index+i*width] = idata[index+i*width];
    }
}

__global__ void copySharedMem(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index  = xIndex + width*yIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        if (xIndex < width && yIndex < height)
        {
            tile[threadIdx.y][threadIdx.x] = idata[index];
        }
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        if (xIndex < height && yIndex < width)
        {
            odata[index] = tile[threadIdx.y][threadIdx.x];
        }
    }
}

// -------------------------------------------------------
// Transposes
// width and height must be integral multiples of TILE_DIM
// -------------------------------------------------------

__global__ void transposeNaive(float *odata, float *idata, int width, int height)
{
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

    int index_in  = xIndex + width * yIndex;
    int index_out = yIndex + height * xIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i] = idata[index_in+i*width];
    }
}
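
// In transposeNaive the reads from idata are coalesced (adjacent threads read
// adjacent addresses), but with index_out = yIndex + height*xIndex the writes of
// adjacent threads are 'height' elements apart, so each warp's store is scattered
// across memory. Removing that scatter is what the shared-memory kernels below do.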

// coalesced transpose (with bank conflicts)

__global__ void transposeCoalesced(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
    }
}
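
// In transposeCoalesced the column-wise read tile[threadIdx.x][threadIdx.y+i]
// makes threads of the same warp access shared memory with a stride of
// TILE_DIM = 16 words, so they hit the same bank and the loads serialize.
// The next kernel pads each tile row to TILE_DIM+1 words so that consecutive
// rows of a column fall in distinct banks, eliminating the conflicts.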

// Coalesced transpose with no bank conflicts

__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM+1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
    }
}

// Transpose that effectively reorders execution of thread blocks along diagonals of the
// matrix (also coalesced and has no bank conflicts)
//
// Here blockIdx.x is interpreted as the distance along a diagonal and blockIdx.y as
// corresponding to different diagonals
//
// The blockIdx_x and blockIdx_y expressions map the diagonal coordinates to the more
// commonly used Cartesian coordinates, so that the only changes to the code from the
// coalesced version are the calculation of blockIdx_x and blockIdx_y and the replacement
// of blockIdx.x and blockIdx.y with the subscripted versions in the remaining code
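
// Worked example of the square-case mapping (illustrative 4x4 grid of blocks
// assumed): a block launched with blockIdx = (x=1, y=2) is reassigned
// blockIdx_y = blockIdx.x = 1 and blockIdx_x = (1+2) % 4 = 3. Blocks with the
// same launch-order y coordinate thus land in different tile columns, spreading
// their global-memory traffic across partitions instead of camping on one.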

__global__ void transposeDiagonal(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM+1];

    int blockIdx_x, blockIdx_y;

    // do diagonal reordering
    if (width == height)
    {
        blockIdx_y = blockIdx.x;
        blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
    }
    else
    {
        int bid = blockIdx.x + gridDim.x*blockIdx.y;
        blockIdx_y = bid%gridDim.y;
        blockIdx_x = ((bid/gridDim.y)+blockIdx_y)%gridDim.x;
    }

    // from here on the code is the same as the previous kernel, except that blockIdx_x
    // replaces blockIdx.x and similarly for y

    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
    }
}

// --------------------------------------------------------------------
// Partial transposes
// NB: the coarse- and fine-grained routines only perform part of a
// transpose and will fail the test against the reference solution
//
// They are used to assess performance characteristics of different
// components of a full transpose
// --------------------------------------------------------------------

__global__ void transposeFineGrained(float *odata, float *idata, int width, int height)
{
    __shared__ float block[TILE_DIM][TILE_DIM+1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index = xIndex + (yIndex)*width;

    for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
    {
        block[threadIdx.y+i][threadIdx.x] = idata[index+i*width];
    }

    __syncthreads();

    for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
    {
        odata[index+i*height] = block[threadIdx.x][threadIdx.y+i];
    }
}

__global__ void transposeCoarseGrained(float *odata, float *idata, int width, int height)
{
    __shared__ float block[TILE_DIM][TILE_DIM+1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i += BLOCK_ROWS)
    {
        block[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i += BLOCK_ROWS)
    {
        odata[index_out+i*height] = block[threadIdx.y+i][threadIdx.x];
    }
}

// ---------------------
// host utility routines
// ---------------------

void computeTransposeGold(float *gold, float *idata,
                          const int size_x, const int size_y)
{
    for (int y = 0; y < size_y; ++y)
    {
        for (int x = 0; x < size_x; ++x)
        {
            gold[(x * size_y) + y] = idata[(y * size_x) + x];
        }
    }
}
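
// In row-major terms the loop above computes gold[x][y] = idata[y][x].
// Worked example (illustrative values): with size_x = size_y = 4, the input
// element at row 1, column 2 (idata[1*4+2] = idata[6]) lands at
// gold[2*4+1] = gold[9], i.e. row 2, column 1 of the output.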

void getParams(int argc, char **argv, cudaDeviceProp &deviceProp, int &size_x, int &size_y, int max_tile_dim)
{
    // set matrix size (if the (x,y) dims of the matrix are not square, this will have to be modified)
    if (checkCmdLineFlag(argc, (const char **)argv, "dimX"))
    {
        size_x = getCmdLineArgumentInt(argc, (const char **) argv, "dimX");

        if (size_x > max_tile_dim)
        {
            printf("> MatrixSize X = %d is greater than the recommended size = %d\n", size_x, max_tile_dim);
        }
        else
        {
            printf("> MatrixSize X = %d\n", size_x);
        }
    }
    else
    {
        size_x = max_tile_dim;

        // If this is SM12 hardware, we want to round down to a multiple of 512
        if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1)
        {
            size_x = FLOOR(size_x, 512);
        }
        else // else for SM10,SM11 we round down to a multiple of 384
        {
            size_x = FLOOR(size_x, 384);
        }
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "dimY"))
    {
        size_y = getCmdLineArgumentInt(argc, (const char **) argv, "dimY");

        if (size_y > max_tile_dim)
        {
            printf("> MatrixSize Y = %d is greater than the recommended size = %d\n", size_y, max_tile_dim);
        }
        else
        {
            printf("> MatrixSize Y = %d\n", size_y);
        }
    }
    else
    {
        size_y = max_tile_dim;

        // If this is SM12 hardware, we want to round down to a multiple of 512
        if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1)
        {
            size_y = FLOOR(size_y, 512);
        }
        else // else for SM10,SM11 we round down to a multiple of 384
        {
            size_y = FLOOR(size_y, 384);
        }
    }
}

void showHelp()
{
    printf("\n%s : Command line options\n", sSDKsample);
    printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
    printf("> The default matrix size can be overridden with these parameters\n");
    printf("\t-dimX=row_dim_size (matrix row dimensions)\n");
    printf("\t-dimY=col_dim_size (matrix column dimensions)\n");
}

int
main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        showHelp();
        exit(EXIT_SUCCESS);
    }

    int devID = findCudaDevice(argc, (const char **)argv);
    cudaDeviceProp deviceProp;

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDevice(&devID));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // compute the scaling factor (for GPUs with fewer MPs)
    float scale_factor, total_tiles;
    scale_factor = max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
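
    // Worked example (illustrative device, not queried from real hardware):
    // a GPU with 2 multiprocessors of 48 cores each has 96 total cores, so
    // scale_factor = max(192/96, 1) = 2 and the tile count below is halved;
    // any device with >= 192 total cores runs the full-size test (factor 1).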

    printf("> Device %d: \"%s\"\n", devID, deviceProp.name);
    printf("> SM Capability %d.%d detected:\n", deviceProp.major, deviceProp.minor);

    // Calculate number of tiles we will run for the Matrix Transpose performance tests
    int size_x, size_y, max_matrix_dim, matrix_size_test;

    if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1)
    {
        matrix_size_test = 512;  // we round down max_matrix_dim for this perf test
        total_tiles = (float)MAX_TILES_SM12 / scale_factor;
    }
    else
    {
        matrix_size_test = 384;  // we round down max_matrix_dim for this perf test
        total_tiles = (float)MAX_TILES_SM10 / scale_factor;
    }

    max_matrix_dim = FLOOR((int)(floor(sqrt(total_tiles))* TILE_DIM), matrix_size_test);

    // This is the minimum size allowed
    if (max_matrix_dim == 0)
    {
        max_matrix_dim = matrix_size_test;
    }

    printf("> [%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.name, deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("> Compute performance scaling factor = %4.2f\n", scale_factor);

    // Extract parameters if there are any; the command line -dimX and -dimY flags
    // can override any of these settings
    getParams(argc, argv, deviceProp, size_x, size_y, max_matrix_dim);

    if (size_x != size_y)
    {
        printf("\n[%s] does not support non-square matrices (row_dim_size(%d) != col_dim_size(%d))\nExiting...\n\n",
               sSDKsample, size_x, size_y);
        exit(EXIT_FAILURE);
    }

    if (size_x%TILE_DIM != 0 || size_y%TILE_DIM != 0)
    {
        printf("[%s] Matrix size must be integral multiple of tile size\nExiting...\n\n", sSDKsample);
        exit(EXIT_FAILURE);
    }

    // kernel pointer and descriptor
    void (*kernel)(float *, float *, int, int);
    const char *kernelName;

    // execution configuration parameters
    dim3 grid(size_x/TILE_DIM, size_y/TILE_DIM), threads(TILE_DIM,BLOCK_ROWS);
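
    // For the default 1024x1024 matrix with TILE_DIM = 16 and BLOCK_ROWS = 16,
    // this yields a 64x64 grid of 16x16-thread blocks: 4096 blocks, one per tile.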

    if (grid.x < 1 || grid.y < 1)
    {
        printf("[%s] grid size computation incorrect in test\nExiting...\n\n", sSDKsample);
        exit(EXIT_FAILURE);
    }

    // CUDA events
    cudaEvent_t start, stop;

    // size of memory required to store the matrix
    const int mem_size = sizeof(float) * size_x*size_y;

    if (2*mem_size > deviceProp.totalGlobalMem)
    {
        printf("Input matrix size is larger than the available device memory!\n");
        printf("Please choose a smaller size matrix\n");
        exit(EXIT_FAILURE);
    }

    // allocate host memory
    float *h_idata = (float *) malloc(mem_size);
    float *h_odata = (float *) malloc(mem_size);
    float *transposeGold = (float *) malloc(mem_size);
    float *gold;

    // allocate device memory
    float *d_idata, *d_odata;
    checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
    checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

    // initialize host data
    for (int i = 0; i < (size_x*size_y); ++i)
    {
        h_idata[i] = (float) i;
    }

    // copy host data to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // Compute reference transpose solution
    computeTransposeGold(transposeGold, h_idata, size_x, size_y);

    // print out common data for all kernels
    printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n",
           size_x, size_y, size_x/TILE_DIM, size_y/TILE_DIM, TILE_DIM, TILE_DIM, TILE_DIM, BLOCK_ROWS);

    // initialize events
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // loop over different kernels
    bool success = true;

    for (int k = 0; k<8; k++)
    {
        // set kernel pointer
        switch (k)
        {
            case 0: kernel = &copy;                     kernelName = "simple copy       "; break;
            case 1: kernel = &copySharedMem;            kernelName = "shared memory copy"; break;
            case 2: kernel = &transposeNaive;           kernelName = "naive             "; break;
            case 3: kernel = &transposeCoalesced;       kernelName = "coalesced         "; break;
            case 4: kernel = &transposeNoBankConflicts; kernelName = "optimized         "; break;
            case 5: kernel = &transposeCoarseGrained;   kernelName = "coarse-grained    "; break;
            case 6: kernel = &transposeFineGrained;     kernelName = "fine-grained      "; break;
            case 7: kernel = &transposeDiagonal;        kernelName = "diagonal          "; break;
        }

        // set reference solution
        if (kernel == &copy || kernel == &copySharedMem)
        {
            gold = h_idata;
        }
        else if (kernel == &transposeCoarseGrained || kernel == &transposeFineGrained)
        {
            gold = h_odata;  // fine- and coarse-grained kernels are not full transposes, so bypass check
        }
        else
        {
            gold = transposeGold;
        }

        // Clear error status
        checkCudaErrors(cudaGetLastError());

        // warmup to avoid timing startup
        kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y);

        // take measurements for loop over kernel launches
        checkCudaErrors(cudaEventRecord(start, 0));

        for (int i=0; i < NUM_REPS; i++)
        {
            kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y);
            // Ensure no launch failure
            checkCudaErrors(cudaGetLastError());
        }

        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        float kernelTime;
        checkCudaErrors(cudaEventElapsedTime(&kernelTime, start, stop));

        checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
        bool res = compareData(gold, h_odata, size_x*size_y, 0.01f, 0.0f);

        if (res == false)
        {
            printf("*** %s kernel FAILED ***\n", kernelName);
            success = false;
        }

        // take measurements for loop inside kernel
        checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
        res = compareData(gold, h_odata, size_x*size_y, 0.01f, 0.0f);

        if (res == false)
        {
            printf("*** %s kernel FAILED ***\n", kernelName);
            success = false;
        }
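
        // The effective-bandwidth formula below counts each element as read once
        // and written once (factor 2.0f), converts bytes to GB (1024^3) and ms to
        // seconds (factor 1000.0f). Worked example with assumed numbers: for a
        // 1024x1024 matrix, mem_size = 4 MiB = 4194304 bytes, so an average launch
        // time of 0.1 ms gives 2 * 1000 * 4194304 / 2^30 / 0.1 = 78.125 GB/s.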

        // report effective bandwidths
        float kernelBandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(kernelTime/NUM_REPS);
        printf("transpose %s, Throughput = %.4f GB/s, Time = %.5f ms, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
               kernelName, kernelBandwidth, kernelTime/NUM_REPS,
               (size_x *size_y), 1, TILE_DIM *BLOCK_ROWS);
    }

    // cleanup
    free(h_idata);
    free(h_odata);
    free(transposeGold);
    cudaFree(d_idata);
    cudaFree(d_odata);

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (!success)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}