2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
13 * Walsh transforms belong to a class of generalized Fourier transformations.
14 * They have applications in various fields of electrical engineering
 * and number theory. In this sample we demonstrate an efficient implementation
16 * of naturally-ordered Walsh transform
17 * (also known as Walsh-Hadamard or Hadamard transform) in CUDA and its
18 * particular application to dyadic convolution computation.
 * Refer to Jorg Arndt's excellent "Algorithms for Programmers" textbook
20 * http://www.jjj.de/fxt/fxtbook.pdf (Chapter 22)
22 * Victor Podlozhnyuk (vpodlozhnyuk@nvidia.com)
30 #include <helper_functions.h>
31 #include <helper_cuda.h>
////////////////////////////////////////////////////////////////////////////////
// CPU reference functions, defined in a separate translation unit (declared
// with C linkage). Both write a 2^log2N-element Walsh transform of h_Input
// into h_Output; fwtCPU is presumably the fast and slowWTcpu the direct
// (slow) variant — names suggest this; confirm in the CPU implementation file.
////////////////////////////////////////////////////////////////////////////////
extern"C" void fwtCPU(float *h_Output, float *h_Input, int log2N);
extern"C" void slowWTcpu(float *h_Output, float *h_Input, int log2N);
39 extern "C" void dyadicConvolutionCPU(
48 ////////////////////////////////////////////////////////////////////////////////
50 ////////////////////////////////////////////////////////////////////////////////
51 #include "fastWalshTransform_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Problem sizes as powers of two: 2^7 = 128-element kernel,
// 2^23 = 8M-element data array.
const int log2Kernel = 7;
const int log2Data = 23;

// Element counts derived from the exponents above.
const int dataN = 1 << log2Data;
const int kernelN = 1 << log2Kernel;

// Allocation sizes in bytes (32 MB and 512 B respectively). These fit in
// int at the current sizes, but would overflow int for log2Data >= 29 —
// NOTE(review): consider size_t if sizes ever grow.
const int DATA_SIZE = dataN * sizeof(float);
const int KERNEL_SIZE = kernelN * sizeof(float);

// Nominal operation count used by main() for the GOP/s report; presumably
// 3 transforms x (dataN * log2Data / 2) butterfly stages — TODO confirm
// against fastWalshTransform_kernel.cuh.
const double NOPS = 3.0 * (double)dataN * (double)log2Data / 2.0;
////////////////////////////////////////////////////////////////////////////////
// Main program: computes a dyadic convolution on the GPU via three Fast
// Walsh Transforms plus an element-wise modulation, times the GPU pass,
// then validates it against a direct CPU implementation using a relative
// L2 norm (pass threshold 1e-6).
// NOTE(review): this excerpt is incomplete — the local declarations
// (h_Kernel, h_Data, h_ResultCPU, h_ResultGPU, d_Kernel, d_Data, i), the
// function's braces, the sum_delta2/sum_ref2 zero-initialization, the
// host-side free() calls and the final return/exit are not visible here.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
    // Scalars for the accuracy check and GPU timing.
    double delta, ref, sum_delta2, sum_ref2, L2norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    printf("...allocating CPU memory\n");
    h_Kernel = (float *)malloc(KERNEL_SIZE);
    h_Data = (float *)malloc(DATA_SIZE);
    h_ResultCPU = (float *)malloc(DATA_SIZE);
    h_ResultGPU = (float *)malloc(DATA_SIZE);
    printf("...allocating GPU memory\n");
    // d_Kernel is deliberately allocated at DATA_SIZE (not KERNEL_SIZE):
    // it is zero-padded below and transformed at the full data length.
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, DATA_SIZE));
    checkCudaErrors(cudaMalloc((void **)&d_Data, DATA_SIZE));

    printf("...generating data\n");
    printf("Data length: %i; kernel length: %i\n", dataN, kernelN);

    // Fill kernel and data with uniform random values in [0, 1].
    // NOTE(review): rand() is not seeded in the visible code, so runs are
    // repeatable with the implementation's default seed.
    for (i = 0; i < kernelN; i++)
        h_Kernel[i] = (float)rand() / (float)RAND_MAX;

    for (i = 0; i < dataN; i++)
        h_Data[i] = (float)rand() / (float)RAND_MAX;

    // Zero-pad the kernel buffer to dataN elements, then upload both arrays.
    checkCudaErrors(cudaMemset(d_Kernel, 0, DATA_SIZE));
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice));

    printf("Running GPU dyadic convolution using Fast Walsh Transform...\n");
    // Synchronize before starting the host timer so any pending GPU work
    // is excluded from the measurement.
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    // Dyadic convolution: transform both operands, multiply element-wise,
    // transform again (the Walsh-Hadamard transform is its own inverse up
    // to scaling — presumably normalization happens inside modulateGPU or
    // the kernels; confirm in fastWalshTransform_kernel.cuh).
    fwtBatchGPU(d_Data, 1, log2Data);
    fwtBatchGPU(d_Kernel, 1, log2Data);
    modulateGPU(d_Data, d_Kernel, dataN);
    fwtBatchGPU(d_Data, 1, log2Data);
    // Kernel launches are asynchronous; synchronize before stopping the timer.
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer);
    printf("GPU time: %f ms; GOP/s: %f\n", gpuTime, NOPS / (gpuTime * 0.001 * 1E+9));

    printf("Reading back GPU results...\n");
    // Blocking copy — also guarantees the GPU result is complete before use.
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_Data, DATA_SIZE, cudaMemcpyDeviceToHost));

    printf("Running straightforward CPU dyadic convolution...\n");
    dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel);

    printf("Comparing the results...\n");

    // Accumulate squared error and squared reference magnitude for the
    // relative L2 norm. NOTE(review): the loop braces and the
    // sum_delta2/sum_ref2 = 0 initialization are on lines missing from
    // this excerpt — verify against the full file.
    for (i = 0; i < dataN; i++)
        delta = h_ResultCPU[i] - h_ResultGPU[i];
        ref = h_ResultCPU[i];
        sum_delta2 += delta * delta;
        sum_ref2 += ref * ref;

    L2norm = sqrt(sum_delta2 / sum_ref2);

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_Data));
    checkCudaErrors(cudaFree(d_Kernel));

    // Pass iff the relative L2 error is below 1e-6.
    printf("L2 norm: %E\n", L2norm);
    printf(L2norm < 1e-6 ? "Test passed\n" : "Test failed!\n");