2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
13 * Walsh transforms belong to a class of generalized Fourier transformations.
14 * They have applications in various fields of electrical engineering
 * and number theory. In this sample we demonstrate an efficient implementation
16 * of naturally-ordered Walsh transform
17 * (also known as Walsh-Hadamard or Hadamard transform) in CUDA and its
18 * particular application to dyadic convolution computation.
 * Refer to Jorg Arndt's excellent "Algorithms for Programmers" textbook
20 * http://www.jjj.de/fxt/fxtbook.pdf (Chapter 22)
22 * Victor Podlozhnyuk (vpodlozhnyuk@nvidia.com)
30 #include <helper_functions.h>
31 #include <helper_cuda.h>
////////////////////////////////////////////////////////////////////////////////
// CPU reference functions, defined in a separate translation unit (declared
// with C linkage). Both write a 2^log2N-element Walsh transform of h_Input
// into h_Output; fwtCPU is presumably the fast and slowWTcpu the direct
// (slow) variant — names suggest this; confirm in the CPU implementation file.
////////////////////////////////////////////////////////////////////////////////
extern"C" void fwtCPU(float *h_Output, float *h_Input, int log2N);
extern"C" void slowWTcpu(float *h_Output, float *h_Input, int log2N);
39 extern "C" void dyadicConvolutionCPU(
48 ////////////////////////////////////////////////////////////////////////////////
50 ////////////////////////////////////////////////////////////////////////////////
51 #include "fastWalshTransform_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Problem sizes as powers of two: 2^7 = 128-element kernel,
// 2^23 = 8M-element data array.
const int log2Kernel = 7;
const int log2Data = 23;

// Element counts derived from the exponents above.
const int dataN = 1 << log2Data;
const int kernelN = 1 << log2Kernel;

// Allocation sizes in bytes (32 MB and 512 B respectively). These fit in
// int at the current sizes, but would overflow int for log2Data >= 29 —
// NOTE(review): consider size_t if sizes ever grow.
const int DATA_SIZE = dataN * sizeof(float);
const int KERNEL_SIZE = kernelN * sizeof(float);

// Nominal operation count used by main() for the GOP/s report; presumably
// 3 transforms x (dataN * log2Data / 2) butterfly stages — TODO confirm
// against fastWalshTransform_kernel.cuh.
const double NOPS = 3.0 * (double)dataN * (double)log2Data / 2.0;
////////////////////////////////////////////////////////////////////////////////
// Main program: computes a dyadic convolution on the GPU via three Fast
// Walsh Transforms plus an element-wise modulation, times the GPU pass,
// then validates it against a direct CPU implementation using a relative
// L2 norm (pass threshold 1e-6).
// NOTE(review): this excerpt is incomplete — the local declarations
// (h_Kernel, h_Data, h_ResultCPU, h_ResultGPU, d_Kernel, d_Data, i), the
// function's braces, the sum_delta2/sum_ref2 zero-initialization, the
// host-side free() calls and the final return/exit are not visible here.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
    // Scalars for the accuracy check and GPU timing.
    double delta, ref, sum_delta2, sum_ref2, L2norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    printf("...allocating CPU memory\n");
    h_Kernel = (float *)malloc(KERNEL_SIZE);
    h_Data = (float *)malloc(DATA_SIZE);
    h_ResultCPU = (float *)malloc(DATA_SIZE);
    h_ResultGPU = (float *)malloc(DATA_SIZE);
    printf("...allocating GPU memory\n");
    // d_Kernel is deliberately allocated at DATA_SIZE (not KERNEL_SIZE):
    // it is zero-padded below and transformed at the full data length.
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, DATA_SIZE));
    checkCudaErrors(cudaMalloc((void **)&d_Data, DATA_SIZE));

    printf("...generating data\n");
    printf("Data length: %i; kernel length: %i\n", dataN, kernelN);

    // Fill kernel and data with uniform random values in [0, 1].
    // NOTE(review): rand() is not seeded in the visible code, so runs are
    // repeatable with the implementation's default seed.
    for (i = 0; i < kernelN; i++)
        h_Kernel[i] = (float)rand() / (float)RAND_MAX;

    for (i = 0; i < dataN; i++)
        h_Data[i] = (float)rand() / (float)RAND_MAX;

    // Zero-pad the kernel buffer to dataN elements, then upload both arrays.
    checkCudaErrors(cudaMemset(d_Kernel, 0, DATA_SIZE));
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice));

    printf("Running GPU dyadic convolution using Fast Walsh Transform...\n");
    // Synchronize before starting the host timer so any pending GPU work
    // is excluded from the measurement.
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    // Dyadic convolution: transform both operands, multiply element-wise,
    // transform again (the Walsh-Hadamard transform is its own inverse up
    // to scaling — presumably normalization happens inside modulateGPU or
    // the kernels; confirm in fastWalshTransform_kernel.cuh).
    fwtBatchGPU(d_Data, 1, log2Data);
    fwtBatchGPU(d_Kernel, 1, log2Data);
    modulateGPU(d_Data, d_Kernel, dataN);
    fwtBatchGPU(d_Data, 1, log2Data);
    // Kernel launches are asynchronous; synchronize before stopping the timer.
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer);
    printf("GPU time: %f ms; GOP/s: %f\n", gpuTime, NOPS / (gpuTime * 0.001 * 1E+9));

    printf("Reading back GPU results...\n");
    // Blocking copy — also guarantees the GPU result is complete before use.
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_Data, DATA_SIZE, cudaMemcpyDeviceToHost));

    printf("Running straightforward CPU dyadic convolution...\n");
    dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel);

    printf("Comparing the results...\n");

    // Accumulate squared error and squared reference magnitude for the
    // relative L2 norm. NOTE(review): the loop braces and the
    // sum_delta2/sum_ref2 = 0 initialization are on lines missing from
    // this excerpt — verify against the full file.
    for (i = 0; i < dataN; i++)
        delta = h_ResultCPU[i] - h_ResultGPU[i];
        ref = h_ResultCPU[i];
        sum_delta2 += delta * delta;
        sum_ref2 += ref * ref;

    L2norm = sqrt(sum_delta2 / sum_ref2);

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_Data));
    checkCudaErrors(cudaFree(d_Kernel));

    // Pass iff the relative L2 error is below 1e-6.
    printf("L2 norm: %E\n", L2norm);
    printf(L2norm < 1e-6 ? "Test passed\n" : "Test failed!\n");