/*
 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */
//USE_TEXTURE selects texture fetches over plain global loads for read-only
//source data; POWER_OF_TWO enables the shift/mask fast path in udivmod().
#define USE_TEXTURE 1
#define POWER_OF_TWO 1

#if(USE_TEXTURE)
//Texture path: the host must bind the source pointer via SET_FLOAT_BASE
//before launching any kernel that uses LOAD_FLOAT.
texture<float, 1, cudaReadModeElementType> texFloat;
#define  LOAD_FLOAT(i) tex1Dfetch(texFloat, i)
#define SET_FLOAT_BASE checkCudaErrors( cudaBindTexture(0, texFloat, d_Src) )
#else
//Plain global-memory path: LOAD_FLOAT reads d_Src directly, no binding needed.
#define  LOAD_FLOAT(i) d_Src[i]
#define SET_FLOAT_BASE
#endif
////////////////////////////////////////////////////////////////////////////////
// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
// Scatter the kernelH x kernelW convolution kernel into the (zero-initialized)
// fftH x fftW FFT buffer, cyclically shifting it so that the kernel center
// (kernelY, kernelX) lands at element (0, 0).
// Expects a 2D launch covering kernelH x kernelW; excess threads are guarded.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void padKernel_kernel(
    float *d_Dst,
    float *d_Src,
    int fftH,
    int fftW,
    int kernelH,
    int kernelW,
    int kernelY,
    int kernelX
)
{
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (y < kernelH && x < kernelW)
    {
        //Wrap negative offsets around the FFT buffer (cyclic shift)
        int ky = y - kernelY;

        if (ky < 0)
        {
            ky += fftH;
        }

        int kx = x - kernelX;

        if (kx < 0)
        {
            kx += fftW;
        }

        d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
// Copy the dataH x dataW image into the fftH x fftW FFT buffer.  The apron of
// size (kernelY, kernelX) just past the image replicates the nearest edge
// pixel; any remaining padding wraps back to row/column 0.
// Expects a 2D launch covering fftH x fftW; excess threads are guarded.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void padDataClampToBorder_kernel(
    float *d_Dst,
    float *d_Src,
    int fftH,
    int fftW,
    int dataH,
    int dataW,
    int kernelH,
    int kernelW,
    int kernelY,
    int kernelX
)
{
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int borderH = dataH + kernelY;
    const int borderW = dataW + kernelX;

    if (y < fftH && x < fftW)
    {
        int dy, dx;

        //Inside the image: read the pixel itself
        if (y < dataH)
        {
            dy = y;
        }

        if (x < dataW)
        {
            dx = x;
        }

        //Bottom/right apron: clamp to the last image row/column
        if (y >= dataH && y < borderH)
        {
            dy = dataH - 1;
        }

        if (x >= dataW && x < borderW)
        {
            dx = dataW - 1;
        }

        //Remaining padding: wrap to the first row/column
        if (y >= borderH)
        {
            dy = 0;
        }

        if (x >= borderW)
        {
            dx = 0;
        }

        d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
// a = c * (a * b), complex multiplication; result written back into a.
inline __device__ void mulAndScale(fComplex &a, const fComplex &b, const float &c)
{
    fComplex t = {c *(a.x * b.x - a.y * b.y), c *(a.y * b.x + a.x * b.y)};
    a = t;
}
// Pointwise multiply the spectrum in d_Dst by the spectrum in d_Src and scale
// by c (the 1 / FFT-size normalization factor); result overwrites d_Dst.
// 1D launch over dataSize elements; excess threads return early.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void modulateAndNormalize_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    int dataSize,
    float c
)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i >= dataSize)
    {
        return;
    }

    fComplex a = d_Src[i];
    fComplex b = d_Dst[i];

    mulAndScale(a, b, c);

    d_Dst[i] = a;
}
////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
#if(USE_TEXTURE)
//Texture path: one- and two-source complex accessors.  The host must bind the
//matching SET_FCOMPLEX_BASE* macro before launching a kernel that uses them.
texture<fComplex, 1, cudaReadModeElementType> texComplexA;
texture<fComplex, 1, cudaReadModeElementType> texComplexB;
#define    LOAD_FCOMPLEX(i) tex1Dfetch(texComplexA, i)
#define  LOAD_FCOMPLEX_A(i) tex1Dfetch(texComplexA, i)
#define  LOAD_FCOMPLEX_B(i) tex1Dfetch(texComplexB, i)

#define   SET_FCOMPLEX_BASE checkCudaErrors( cudaBindTexture(0, texComplexA, d_Src) )
#define SET_FCOMPLEX_BASE_A checkCudaErrors( cudaBindTexture(0, texComplexA, d_SrcA) )
#define SET_FCOMPLEX_BASE_B checkCudaErrors( cudaBindTexture(0, texComplexB, d_SrcB) )
#else
//Plain global-memory path: direct reads, no binding needed.
#define    LOAD_FCOMPLEX(i) d_Src[i]
#define  LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define  LOAD_FCOMPLEX_B(i) d_SrcB[i]

#define   SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#endif
// Postprocessing butterfly: combines the symmetric pair (D1, D2) of a packed
// complex FFT with the given twiddle factor, writing both results in place.
// Used to convert C2C output into the R2C spectrum layout.
inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle)
{
    float A1 = 0.5f * (D1.x + D2.x);
    float B1 = 0.5f * (D1.y - D2.y);
    float A2 = 0.5f * (D1.y + D2.y);
    float B2 = 0.5f * (D1.x - D2.x);

    D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
    D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
    D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
    D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
}
//Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
// Preprocessing butterfly (inverse of spPostprocessC2C up to the factor of 2):
// recombines the symmetric pair (D1, D2) with the twiddle factor in place.
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle)
{
    //0.5f factors intentionally dropped — see the premultiply note above
    float A1 = /* 0.5f * */ (D1.x + D2.x);
    float B1 = /* 0.5f * */ (D1.y - D2.y);
    float A2 = /* 0.5f * */ (D1.y + D2.y);
    float B2 = /* 0.5f * */ (D1.x - D2.x);

    D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
    D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
    D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
    D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
}
// twiddle = exp(j * phase): x = cos(phase), y = sin(phase).
// Uses the fast (reduced-precision) __sincosf intrinsic.
inline __device__ void getTwiddle(fComplex &twiddle, float phase)
{
    __sincosf(phase, &twiddle.y, &twiddle.x);
}
// Negate an index modulo DA without a division.
inline __device__ uint mod(uint a, uint DA)
{
    //(DA - a) % DA, assuming a <= DA
    return a ? (DA - a) : a;
}
// Divide out all factors of two from n: log2N receives the number of factors
// removed, the return value is the remaining odd part (0 when n == 0).
// The explicit n == 0 guard is required — without it the visible loop
// condition (0 % 2 == 0) never becomes false and the function spins forever.
static inline uint factorRadix2(uint &log2N, uint n)
{
    if (!n)
    {
        log2N = 0;
        return 0;
    }
    else
    {
        for (log2N = 0; n % 2 == 0; n /= 2, log2N++);

        return n;
    }
}
// In place: rem = dividend % divisor; dividend /= divisor.
// With POWER_OF_TWO the divisor MUST be a power of two — the mask/shift fast
// path (__ffs finds the single set bit) replaces the expensive div/mod.
// Fixes the mojibake parameter name "÷nd" (an "&divide;" entity mis-decode
// of "&dividend") and restores the dropped preprocessor conditional.
inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem)
{
#if(!POWER_OF_TWO)
    rem = dividend % divisor;
    dividend /= divisor;
#else
    rem = dividend & (divisor - 1);
    dividend >>= (__ffs(divisor) - 1);
#endif
}
// Batched 2D postprocessing: converts packed C2C FFT output (DY x DX per
// batch) into the padded R2C-style layout in d_Dst, processing symmetric
// element pairs.  1D launch over threadCount threads, one thread per
// (batch, row, column-in-lower-half) triple.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void spPostprocess2D_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    uint DY,
    uint DX,
    uint threadCount,
    uint padding,
    float phaseBase
)
{
    const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadId >= threadCount)
    {
        return;
    }

    //Decompose the flat thread id into (batch i, row y, column x)
    uint x, y, i = threadId;
    udivmod(i, DX / 2, x);
    udivmod(i, DY, y);

    const uint srcOffset = i * DY * DX;
    const uint dstOffset = i * DY * (DX + padding);

    //Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
    {
        const uint loadPos1 = srcOffset + y * DX + x;
        const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
        const uint storePos1 = dstOffset + y * (DX + padding) + x;
        const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        fComplex twiddle;
        getTwiddle(twiddle, phaseBase * (float)x);
        spPostprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }

    //Process x = DX / 2 separately (self-paired column); only the x == 0
    //thread of each row handles it to avoid duplicate writes
    if (x == 0)
    {
        const uint loadPos1 = srcOffset + y * DX + DX / 2;
        const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;
        const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;
        const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        //twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
        fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
        spPostprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }
}
// Batched 2D preprocessing: converts the padded R2C-style layout in d_Src
// back into packed C2C input (DY x DX per batch) in d_Dst, processing
// symmetric element pairs.  1D launch over threadCount threads.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void spPreprocess2D_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    uint DY,
    uint DX,
    uint threadCount,
    uint padding,
    float phaseBase
)
{
    const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadId >= threadCount)
    {
        return;
    }

    //Decompose the flat thread id into (batch i, row y, column x)
    uint x, y, i = threadId;
    udivmod(i, DX / 2, x);
    udivmod(i, DY, y);

    //Avoid overwrites in columns 0 and DX / 2 by different threads (lower and upper halves)
    if ((x == 0) && (y > DY / 2))
    {
        return;
    }

    const uint srcOffset = i * DY * (DX + padding);
    const uint dstOffset = i * DY * DX;

    //Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
    {
        const uint loadPos1 = srcOffset + y * (DX + padding) + x;
        const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);
        const uint storePos1 = dstOffset + y * DX + x;
        const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        fComplex twiddle;
        getTwiddle(twiddle, phaseBase * (float)x);
        spPreprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }

    //Process x = DX / 2 separately (self-paired column); only the x == 0
    //thread of each row handles it
    if (x == 0)
    {
        const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;
        const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;
        const uint storePos1 = dstOffset + y * DX + DX / 2;
        const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        //twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
        fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
        spPreprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }
}
381 ////////////////////////////////////////////////////////////////////////////////
382 // Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D
383 ////////////////////////////////////////////////////////////////////////////////
// Fused kernel: R2C-postprocess the data spectrum (A) and the kernel
// spectrum (B), multiply them with scale c, then C2R-preprocess the product —
// all in one pass over symmetric element pairs, writing into d_Dst.
// NOTE(review): this extract is missing lines (parameter list, braces, the
// final stores); the comments below describe only the visible statements.
384 __global__ void spProcess2D_kernel(
395     const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
// Guard against excess threads of the 1D launch.
397     if (threadId >= threadCount)
// Decompose the flat id into (batch i, row y, column x); the x decomposition
// step is not visible here — presumably udivmod(i, DX, x) precedes this line.
402     uint x, y, i = threadId;
404     udivmod(i, DY / 2, y);
406     const uint offset = i * DY * DX;
408     //Avoid overwrites in rows 0 and DY / 2 by different threads (left and right halves)
409     //Otherwise correctness for in-place transformations is affected
410     if ((y == 0) && (x > DX / 2))
417     //Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
// pos2 is the point-symmetric partner of pos1 within the batch.
419         const uint pos1 = offset + y * DX + x;
420         const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);
// D* = data spectrum pair (source A), K* = kernel spectrum pair (source B).
422         fComplex D1 = LOAD_FCOMPLEX_A(pos1);
423         fComplex D2 = LOAD_FCOMPLEX_A(pos2);
424         fComplex K1 = LOAD_FCOMPLEX_B(pos1);
425         fComplex K2 = LOAD_FCOMPLEX_B(pos2);
426         getTwiddle(twiddle, phaseBase * (float)x);
// Unpack both spectra, multiply-and-scale pointwise, repack the product.
428         spPostprocessC2C(D1, D2, twiddle);
429         spPostprocessC2C(K1, K2, twiddle);
430         mulAndScale(D1, K1, c);
431         mulAndScale(D2, K2, c);
432         spPreprocessC2C(D1, D2, twiddle);
// Second section: the self-paired row y = DY / 2, same pipeline.
440         const uint pos1 = offset + (DY / 2) * DX + x;
441         const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);
443         fComplex D1 = LOAD_FCOMPLEX_A(pos1);
444         fComplex D2 = LOAD_FCOMPLEX_A(pos2);
445         fComplex K1 = LOAD_FCOMPLEX_B(pos1);
446         fComplex K2 = LOAD_FCOMPLEX_B(pos2);
448         spPostprocessC2C(D1, D2, twiddle);
449         spPostprocessC2C(K1, K2, twiddle);
450         mulAndScale(D1, K1, c);
451         mulAndScale(D2, K2, c);
452         spPreprocessC2C(D1, D2, twiddle);