utiltools/X86MAC64/cuda/samples/0_Simple/simpleCubemapTexture/simpleCubemapTexture.cu

   1 /*
   2 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3 *
   4 * Please refer to the NVIDIA end user license agreement (EULA) associated
   5 * with this source code for terms and conditions that govern your use of
   6 * this software. Any use, reproduction, disclosure, or distribution of
   7 * this software and related documentation outside the terms of the EULA
   8 * is strictly prohibited.
   9 *
  10 */
  11
  12 /*
  13 * This sample demonstrates how to use texture fetches from layered 2D textures in CUDA C
  14 *
  15 * This sample first generates a 3D input data array for the layered texture
  16 * and the expected output. Then it starts CUDA C kernels, one for each layer,
  17 * which fetch their layer's texture data (using normalized texture coordinates)
  18 * transform it to the expected output, and write it to a 3D output data array.
  19 */
  20
  21 // includes, system
  22 #include <stdlib.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <math.h>
  26
  27 // includes CUDA
  28 #include <cuda_runtime.h>
  29
  30 // helper functions and utilities to work with CUDA
  31 #include <helper_functions.h>
  32 #include <helper_cuda.h>
  33
  34 static char *sSDKname = "simpleCubemapTexture";
  35
  36 // includes, kernels
  37 // declare texture reference for layered 2D float texture
  38 // Note: The "dim" field in the texture reference template is now deprecated.
  39 // Instead, please use a texture type macro such as cudaTextureType1D, etc.
  40
  41 texture<float, cudaTextureTypeCubemap> tex;
  42
  43 ////////////////////////////////////////////////////////////////////////////////
  44 //! Transform a cubemap face of a linear buffe using cubemap texture lookups
  45 //! @param g_odata  output data in global memory
  46 ////////////////////////////////////////////////////////////////////////////////
  47 __global__ void
  48 transformKernel(float *g_odata, int width)
  49 {
  50     // calculate this thread's data point
  51     unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
  52     unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
  53
  54     // 0.5f offset and division are necessary to access the original data points
  55     // in the texture (such that bilinear interpolation will not be activated).
  56     // For details, see also CUDA Programming Guide, Appendix D
  57
  58     float u = ((x+0.5f) / (float) width) * 2.f - 1.f;
  59     float v = ((y+0.5f) / (float) width) * 2.f - 1.f;
  60
  61     float cx, cy, cz;
  62
  63     for (unsigned int face = 0; face < 6; face ++)
  64     {
  65         //Layer 0 is positive X face
  66         if (face == 0)
  67         {
  68             cx = 1;
  69             cy = -v;
  70             cz = -u;
  71         }
  72         //Layer 1 is negative X face
  73         else if (face == 1)
  74         {
  75             cx = -1;
  76             cy = -v;
  77             cz = u;
  78         }
  79         //Layer 2 is positive Y face
  80         else if (face == 2)
  81         {
  82             cx = u;
  83             cy = 1;
  84             cz = v;
  85         }
  86         //Layer 3 is negative Y face
  87         else if (face == 3)
  88         {
  89             cx = u;
  90             cy = -1;
  91             cz = -v;
  92         }
  93         //Layer 4 is positive Z face
  94         else if (face == 4)
  95         {
  96             cx = u;
  97             cy = -v;
  98             cz = 1;
  99         }
 100         //Layer 4 is negative Z face
 101         else if (face == 5)
 102         {
 103             cx = -u;
 104             cy = -v;
 105             cz = -1;
 106         }
 107
 108         // read from texture, do expected transformation and write to global memory
 109         g_odata[face*width*width + y*width + x] = -texCubemap(tex, cx, cy, cz);
 110     }
 111 }
 112
 113
 114
 115 ////////////////////////////////////////////////////////////////////////////////
 116 // Program main
 117 ////////////////////////////////////////////////////////////////////////////////
 118 int
 119 main(int argc, char **argv)
 120 {
 121     // use command-line specified CUDA device, otherwise use device with highest Gflops/s
 122     int devID = findCudaDevice(argc, (const char **)argv);
 123
 124     bool bResult = true;
 125
 126     // get number of SMs on this GPU
 127     cudaDeviceProp deviceProps;
 128
 129     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
 130     printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
 131     printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
 132
 133     if (deviceProps.major < 2)
 134     {
 135         printf("%s requires SM 2.0 or higher for support of Texture Arrays.  Test will exit... \n", sSDKname);
 136         cudaDeviceReset();
 137         exit(EXIT_SUCCESS);
 138     }
 139
 140     // generate input data for layered texture
 141     unsigned int width=64, num_faces = 6, num_layers = 1;
 142     unsigned int cubemap_size = width * width * num_faces;
 143     unsigned int size = cubemap_size * num_layers * sizeof(float);
 144     float *h_data = (float *) malloc(size);
 145
 146     for (int i = 0; i < (int)(cubemap_size * num_layers); i++)
 147     {
 148         h_data[i] = (float)i;
 149     }
 150
 151
 152
 153     // this is the expected transformation of the input data (the expected output)
 154     float *h_data_ref = (float *) malloc(size);
 155
 156     for (unsigned int layer = 0; layer < num_layers; layer++)
 157     {
 158         for (int i = 0; i < (int)(cubemap_size); i++)
 159         {
 160             h_data_ref[layer*cubemap_size + i] = -h_data[layer*cubemap_size + i] + layer;
 161         }
 162     }
 163
 164     // allocate device memory for result
 165     float *d_data = NULL;
 166     checkCudaErrors(cudaMalloc((void **) &d_data, size));
 167
 168     // allocate array and copy image data
 169     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 170     cudaArray *cu_3darray;
 171     //    checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
 172     checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
 173     cudaMemcpy3DParms myparms = {0};
 174     myparms.srcPos = make_cudaPos(0,0,0);
 175     myparms.dstPos = make_cudaPos(0,0,0);
 176     myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
 177     myparms.dstArray = cu_3darray;
 178     myparms.extent = make_cudaExtent(width, width, num_faces);
 179     myparms.kind = cudaMemcpyHostToDevice;
 180     checkCudaErrors(cudaMemcpy3D(&myparms));
 181
 182     // set texture parameters
 183     tex.addressMode[0] = cudaAddressModeWrap;
 184     tex.addressMode[1] = cudaAddressModeWrap;
 185     tex.filterMode = cudaFilterModeLinear;
 186     tex.normalized = true;  // access with normalized texture coordinates
 187
 188     // Bind the array to the texture
 189     checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));
 190
 191     dim3 dimBlock(8, 8, 1);
 192     dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);
 193
 194     printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
 195            width, num_layers, dimGrid.x, dimGrid.y);
 196
 197     transformKernel<<< dimGrid, dimBlock >>>(d_data, width);  // warmup (for better timing)
 198
 199     // check if kernel execution generated an error
 200     getLastCudaError("warmup Kernel execution failed");
 201
 202     checkCudaErrors(cudaDeviceSynchronize());
 203
 204     StopWatchInterface *timer = NULL;
 205     sdkCreateTimer(&timer);
 206     sdkStartTimer(&timer);
 207
 208     // execute the kernel
 209     transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width);
 210
 211     // check if kernel execution generated an error
 212     getLastCudaError("Kernel execution failed");
 213
 214     checkCudaErrors(cudaDeviceSynchronize());
 215     sdkStopTimer(&timer);
 216     printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
 217     printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
 218     sdkDeleteTimer(&timer);
 219
 220     // allocate mem for the result on host side
 221     float *h_odata = (float *) malloc(size);
 222     // copy result from device to host
 223     checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
 224
 225     // write regression file if necessary
 226     if (checkCmdLineFlag(argc, (const char **)argv, "regression"))
 227     {
 228         // write file for regression test
 229         sdkWriteFile<float>("./data/regression.dat", h_odata, width*width, 0.0f, false);
 230     }
 231     else
 232     {
 233         printf("Comparing kernel output to expected data\n");
 234
 235 #define MIN_EPSILON_ERROR 5e-3f
 236         bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
 237     }
 238
 239     // cleanup memory
 240     free(h_data);
 241     free(h_data_ref);
 242     free(h_odata);
 243
 244     checkCudaErrors(cudaFree(d_data));
 245     checkCudaErrors(cudaFreeArray(cu_3darray));
 246
 247     cudaDeviceReset();
 248     exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 249 }