////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------------------
// Transpose
//
// This file contains both device and host code for transposing a floating-point
// matrix. It performs several transpose kernels, which incrementally improve performance
// through coalescing, removing shared memory bank conflicts, and eliminating partition
// camping. Several of the kernels perform a copy, used to represent the best-case
// performance that a transpose can achieve.
//
// Please see the whitepaper in the docs folder of the transpose project for a detailed
// description of this performance study.
// ----------------------------------------------------------------------------------------

// Utilities and system includes
#include <helper_string.h>  // helper for string parsing
#include <helper_image.h>   // helper for image and data comparison
#include <helper_cuda.h>    // helper for CUDA error checking functions

const char *sSDKsample = "Transpose";

// Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements
// using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of BLOCK_ROWS

#define TILE_DIM   16
#define BLOCK_ROWS 16
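
// Note: with the default TILE_DIM = 16 and BLOCK_ROWS = 16 above, each thread
// handles TILE_DIM/BLOCK_ROWS = 1 element and the per-kernel loops below execute
// exactly once. As an illustration (not a configuration used by this sample),
// lowering BLOCK_ROWS to 8 would keep the 16x16 tile but launch 16x8 thread
// blocks, with each thread striding through 2 rows of the tile.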

// This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
int MATRIX_SIZE_X = 1024;
int MATRIX_SIZE_Y = 1024;
int MUL_FACTOR    = TILE_DIM;

#define FLOOR(a,b) (a-(a%b))
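
// FLOOR(a,b) rounds a down to the nearest multiple of b. Worked examples
// (illustrative values): FLOOR(1024,512) = 1024 - (1024 % 512) = 1024, and
// FLOOR(1000,384) = 1000 - (1000 % 384) = 768.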

// Compute the tile size necessary to illustrate performance cases for SM12+ hardware
int MAX_TILES_SM12 = (FLOOR(MATRIX_SIZE_X,512) * FLOOR(MATRIX_SIZE_Y,512)) / (TILE_DIM * TILE_DIM);
// Compute the tile size necessary to illustrate performance cases for SM10,SM11 hardware
int MAX_TILES_SM10 = (FLOOR(MATRIX_SIZE_X,384) * FLOOR(MATRIX_SIZE_Y,384)) / (TILE_DIM * TILE_DIM);

// Number of repetitions used for timing. Two sets of repetitions are performed:
// 1) over kernel launches and 2) inside the kernel over just the loads and stores

#define NUM_REPS 100

// -------------------------------------------------------
// Copies
// width and height must be integral multiples of TILE_DIM
// -------------------------------------------------------

__global__ void copy(float *odata, float *idata, int width, int height)
{
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index  = xIndex + width*yIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index+i*width] = idata[index+i*width];
    }
}

__global__ void copySharedMem(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index  = xIndex + width*yIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        if (xIndex < width && yIndex < height)
        {
            tile[threadIdx.y][threadIdx.x] = idata[index];
        }
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        if (xIndex < height && yIndex < width)
        {
            odata[index] = tile[threadIdx.y][threadIdx.x];
        }
    }
}

// -------------------------------------------------------
// Transposes
// width and height must be integral multiples of TILE_DIM
// -------------------------------------------------------

__global__ void transposeNaive(float *odata, float *idata, int width, int height)
{
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

    int index_in  = xIndex + width * yIndex;
    int index_out = yIndex + height * xIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i] = idata[index_in+i*width];
    }
}
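
// In transposeNaive the reads from idata are coalesced (adjacent threads read
// adjacent addresses), but with index_out = yIndex + height*xIndex the writes of
// adjacent threads are 'height' elements apart, so each warp's store is scattered
// across memory. Removing that scatter is what the shared-memory kernels below do.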

// coalesced transpose (with bank conflicts)

__global__ void transposeCoalesced(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
    }
}
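
// In transposeCoalesced the column-wise read tile[threadIdx.x][threadIdx.y+i]
// makes threads of the same warp access shared memory with a stride of
// TILE_DIM = 16 words, so they hit the same bank and the loads serialize.
// The next kernel pads each tile row to TILE_DIM+1 words so that consecutive
// rows of a column fall in distinct banks, eliminating the conflicts.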

// Coalesced transpose with no bank conflicts

__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM+1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
    }
}

// Transpose that effectively reorders execution of thread blocks along diagonals of the
// matrix (also coalesced and has no bank conflicts)
//
// Here blockIdx.x is interpreted as the distance along a diagonal and blockIdx.y as
// corresponding to different diagonals
//
// The blockIdx_x and blockIdx_y expressions map the diagonal coordinates to the more
// commonly used Cartesian coordinates, so that the only changes to the code from the
// coalesced version are the calculation of blockIdx_x and blockIdx_y and the replacement
// of blockIdx.x and blockIdx.y with the subscripted versions in the remaining code
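
// Worked example of the square-case mapping (illustrative 4x4 grid of blocks
// assumed): a block launched with blockIdx = (x=1, y=2) is reassigned
// blockIdx_y = blockIdx.x = 1 and blockIdx_x = (1+2) % 4 = 3. Blocks with the
// same launch-order y coordinate thus land in different tile columns, spreading
// their global-memory traffic across partitions instead of camping on one.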

__global__ void transposeDiagonal(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM+1];

    int blockIdx_x, blockIdx_y;

    // do diagonal reordering
    if (width == height)
    {
        blockIdx_y = blockIdx.x;
        blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
    }
    else
    {
        int bid = blockIdx.x + gridDim.x*blockIdx.y;
        blockIdx_y = bid%gridDim.y;
        blockIdx_x = ((bid/gridDim.y)+blockIdx_y)%gridDim.x;
    }

    // from here on the code is the same as the previous kernel, except that blockIdx_x
    // replaces blockIdx.x and similarly for y

    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
    {
        odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
    }
}

// --------------------------------------------------------------------
// Partial transposes
// NB: the coarse- and fine-grained routines only perform part of a
// transpose and will fail the test against the reference solution
//
// They are used to assess performance characteristics of different
// components of a full transpose
// --------------------------------------------------------------------

__global__ void transposeFineGrained(float *odata, float *idata, int width, int height)
{
    __shared__ float block[TILE_DIM][TILE_DIM+1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index = xIndex + (yIndex)*width;

    for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
    {
        block[threadIdx.y+i][threadIdx.x] = idata[index+i*width];
    }

    __syncthreads();

    for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
    {
        odata[index+i*height] = block[threadIdx.x][threadIdx.y+i];
    }
}

__global__ void transposeCoarseGrained(float *odata, float *idata, int width, int height)
{
    __shared__ float block[TILE_DIM][TILE_DIM+1];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)*width;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)*height;

    for (int i=0; i<TILE_DIM; i += BLOCK_ROWS)
    {
        block[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
    }

    __syncthreads();

    for (int i=0; i<TILE_DIM; i += BLOCK_ROWS)
    {
        odata[index_out+i*height] = block[threadIdx.y+i][threadIdx.x];
    }
}

// ---------------------
// host utility routines
// ---------------------

void computeTransposeGold(float *gold, float *idata,
                          const int size_x, const int size_y)
{
    for (int y = 0; y < size_y; ++y)
    {
        for (int x = 0; x < size_x; ++x)
        {
            gold[(x * size_y) + y] = idata[(y * size_x) + x];
        }
    }
}
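
// In row-major terms the loop above computes gold[x][y] = idata[y][x].
// Worked example (illustrative values): with size_x = size_y = 4, the input
// element at row 1, column 2 (idata[1*4+2] = idata[6]) lands at
// gold[2*4+1] = gold[9], i.e. row 2, column 1 of the output.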

void getParams(int argc, char **argv, cudaDeviceProp &deviceProp, int &size_x, int &size_y, int max_tile_dim)
{
    // set matrix size (if the (x,y) dims of the matrix are not square, this will have to be modified)
    if (checkCmdLineFlag(argc, (const char **)argv, "dimX"))
    {
        size_x = getCmdLineArgumentInt(argc, (const char **) argv, "dimX");

        if (size_x > max_tile_dim)
        {
            printf("> MatrixSize X = %d is greater than the recommended size = %d\n", size_x, max_tile_dim);
        }
        else
        {
            printf("> MatrixSize X = %d\n", size_x);
        }
    }
    else
    {
        size_x = max_tile_dim;

        // If this is SM12 hardware, we want to round down to a multiple of 512
        if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1)
        {
            size_x = FLOOR(size_x, 512);
        }
        else // else for SM10,SM11 we round down to a multiple of 384
        {
            size_x = FLOOR(size_x, 384);
        }
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "dimY"))
    {
        size_y = getCmdLineArgumentInt(argc, (const char **) argv, "dimY");

        if (size_y > max_tile_dim)
        {
            printf("> MatrixSize Y = %d is greater than the recommended size = %d\n", size_y, max_tile_dim);
        }
        else
        {
            printf("> MatrixSize Y = %d\n", size_y);
        }
    }
    else
    {
        size_y = max_tile_dim;

        // If this is SM12 hardware, we want to round down to a multiple of 512
        if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1)
        {
            size_y = FLOOR(size_y, 512);
        }
        else // else for SM10,SM11 we round down to a multiple of 384
        {
            size_y = FLOOR(size_y, 384);
        }
    }
}

void showHelp()
{
    printf("\n%s : Command line options\n", sSDKsample);
    printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
    printf("> The default matrix size can be overridden with these parameters\n");
    printf("\t-dimX=row_dim_size (matrix row dimensions)\n");
    printf("\t-dimY=col_dim_size (matrix column dimensions)\n");
}

int
main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        showHelp();
        exit(EXIT_SUCCESS);
    }

    int devID = findCudaDevice(argc, (const char **)argv);
    cudaDeviceProp deviceProp;

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDevice(&devID));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // compute the scaling factor (for GPUs with fewer MPs)
    float scale_factor, total_tiles;
    scale_factor = max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
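
    // Worked example (illustrative device, not queried from real hardware):
    // a GPU with 2 multiprocessors of 48 cores each has 96 total cores, so
    // scale_factor = max(192/96, 1) = 2 and the tile count below is halved;
    // any device with >= 192 total cores runs the full-size test (factor 1).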

    printf("> Device %d: \"%s\"\n", devID, deviceProp.name);
    printf("> SM Capability %d.%d detected:\n", deviceProp.major, deviceProp.minor);

    // Calculate number of tiles we will run for the Matrix Transpose performance tests
    int size_x, size_y, max_matrix_dim, matrix_size_test;

    if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1)
    {
        matrix_size_test = 512;  // we round down max_matrix_dim for this perf test
        total_tiles = (float)MAX_TILES_SM12 / scale_factor;
    }
    else
    {
        matrix_size_test = 384;  // we round down max_matrix_dim for this perf test
        total_tiles = (float)MAX_TILES_SM10 / scale_factor;
    }

    max_matrix_dim = FLOOR((int)(floor(sqrt(total_tiles))* TILE_DIM), matrix_size_test);

    // This is the minimum size allowed
    if (max_matrix_dim == 0)
    {
        max_matrix_dim = matrix_size_test;
    }

    printf("> [%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.name, deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("> Compute performance scaling factor = %4.2f\n", scale_factor);

    // Extract parameters if there are any; the command line -dimX and -dimY flags
    // can override any of these settings
    getParams(argc, argv, deviceProp, size_x, size_y, max_matrix_dim);

    if (size_x != size_y)
    {
        printf("\n[%s] does not support non-square matrices (row_dim_size(%d) != col_dim_size(%d))\nExiting...\n\n",
               sSDKsample, size_x, size_y);
        exit(EXIT_FAILURE);
    }

    if (size_x%TILE_DIM != 0 || size_y%TILE_DIM != 0)
    {
        printf("[%s] Matrix size must be integral multiple of tile size\nExiting...\n\n", sSDKsample);
        exit(EXIT_FAILURE);
    }

    // kernel pointer and descriptor
    void (*kernel)(float *, float *, int, int);
    const char *kernelName;

    // execution configuration parameters
    dim3 grid(size_x/TILE_DIM, size_y/TILE_DIM), threads(TILE_DIM,BLOCK_ROWS);
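
    // For the default 1024x1024 matrix with TILE_DIM = 16 and BLOCK_ROWS = 16,
    // this yields a 64x64 grid of 16x16-thread blocks: 4096 blocks, one per tile.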

    if (grid.x < 1 || grid.y < 1)
    {
        printf("[%s] grid size computation incorrect in test\nExiting...\n\n", sSDKsample);
        exit(EXIT_FAILURE);
    }

    // CUDA events
    cudaEvent_t start, stop;

    // size of memory required to store the matrix
    const int mem_size = sizeof(float) * size_x*size_y;

    if (2*mem_size > deviceProp.totalGlobalMem)
    {
        printf("Input matrix size is larger than the available device memory!\n");
        printf("Please choose a smaller size matrix\n");
        exit(EXIT_FAILURE);
    }

    // allocate host memory
    float *h_idata = (float *) malloc(mem_size);
    float *h_odata = (float *) malloc(mem_size);
    float *transposeGold = (float *) malloc(mem_size);
    float *gold;

    // allocate device memory
    float *d_idata, *d_odata;
    checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
    checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

    // initialize host data
    for (int i = 0; i < (size_x*size_y); ++i)
    {
        h_idata[i] = (float) i;
    }

    // copy host data to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // Compute reference transpose solution
    computeTransposeGold(transposeGold, h_idata, size_x, size_y);

    // print out common data for all kernels
    printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n",
           size_x, size_y, size_x/TILE_DIM, size_y/TILE_DIM, TILE_DIM, TILE_DIM, TILE_DIM, BLOCK_ROWS);

    // initialize events
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // loop over different kernels
    bool success = true;

    for (int k = 0; k<8; k++)
    {
        // set kernel pointer
        switch (k)
        {
            case 0: kernel = &copy;                     kernelName = "simple copy       "; break;
            case 1: kernel = &copySharedMem;            kernelName = "shared memory copy"; break;
            case 2: kernel = &transposeNaive;           kernelName = "naive             "; break;
            case 3: kernel = &transposeCoalesced;       kernelName = "coalesced         "; break;
            case 4: kernel = &transposeNoBankConflicts; kernelName = "optimized         "; break;
            case 5: kernel = &transposeCoarseGrained;   kernelName = "coarse-grained    "; break;
            case 6: kernel = &transposeFineGrained;     kernelName = "fine-grained      "; break;
            case 7: kernel = &transposeDiagonal;        kernelName = "diagonal          "; break;
        }

        // set reference solution
        if (kernel == &copy || kernel == &copySharedMem)
        {
            gold = h_idata;
        }
        else if (kernel == &transposeCoarseGrained || kernel == &transposeFineGrained)
        {
            gold = h_odata;  // fine- and coarse-grained kernels are not full transposes, so bypass check
        }
        else
        {
            gold = transposeGold;
        }

        // Clear error status
        checkCudaErrors(cudaGetLastError());

        // warmup to avoid timing startup
        kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y);

        // take measurements for loop over kernel launches
        checkCudaErrors(cudaEventRecord(start, 0));

        for (int i=0; i < NUM_REPS; i++)
        {
            kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y);
            // Ensure no launch failure
            checkCudaErrors(cudaGetLastError());
        }

        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        float kernelTime;
        checkCudaErrors(cudaEventElapsedTime(&kernelTime, start, stop));

        checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
        bool res = compareData(gold, h_odata, size_x*size_y, 0.01f, 0.0f);

        if (res == false)
        {
            printf("*** %s kernel FAILED ***\n", kernelName);
            success = false;
        }

        // take measurements for loop inside kernel
        checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
        res = compareData(gold, h_odata, size_x*size_y, 0.01f, 0.0f);

        if (res == false)
        {
            printf("*** %s kernel FAILED ***\n", kernelName);
            success = false;
        }
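
        // The effective-bandwidth formula below counts each element as read once
        // and written once (factor 2.0f), converts bytes to GB (1024^3) and ms to
        // seconds (factor 1000.0f). Worked example with assumed numbers: for a
        // 1024x1024 matrix, mem_size = 4 MiB = 4194304 bytes, so an average launch
        // time of 0.1 ms gives 2 * 1000 * 4194304 / 2^30 / 0.1 = 78.125 GB/s.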

        // report effective bandwidths
        float kernelBandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(kernelTime/NUM_REPS);
        printf("transpose %s, Throughput = %.4f GB/s, Time = %.5f ms, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
               kernelName, kernelBandwidth, kernelTime/NUM_REPS,
               (size_x *size_y), 1, TILE_DIM *BLOCK_ROWS);
    }

    // cleanup
    free(h_idata);
    free(h_odata);
    free(transposeGold);
    cudaFree(d_idata);
    cudaFree(d_odata);

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (!success)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}