/*
 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */
//USE_TEXTURE selects texture fetches over plain global loads for read-only
//source data; POWER_OF_TWO enables the shift/mask fast path in udivmod().
#define USE_TEXTURE 1
#define POWER_OF_TWO 1

#if(USE_TEXTURE)
//Texture path: the host must bind the source pointer via SET_FLOAT_BASE
//before launching any kernel that uses LOAD_FLOAT.
texture<float, 1, cudaReadModeElementType> texFloat;
#define  LOAD_FLOAT(i) tex1Dfetch(texFloat, i)
#define SET_FLOAT_BASE checkCudaErrors( cudaBindTexture(0, texFloat, d_Src) )
#else
//Plain global-memory path: LOAD_FLOAT reads d_Src directly, no binding needed.
#define  LOAD_FLOAT(i) d_Src[i]
#define SET_FLOAT_BASE
#endif
////////////////////////////////////////////////////////////////////////////////
// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
// Scatter the kernelH x kernelW convolution kernel into the (zero-initialized)
// fftH x fftW FFT buffer, cyclically shifting it so that the kernel center
// (kernelY, kernelX) lands at element (0, 0).
// Expects a 2D launch covering kernelH x kernelW; excess threads are guarded.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void padKernel_kernel(
    float *d_Dst,
    float *d_Src,
    int fftH,
    int fftW,
    int kernelH,
    int kernelW,
    int kernelY,
    int kernelX
)
{
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (y < kernelH && x < kernelW)
    {
        //Wrap negative offsets around the FFT buffer (cyclic shift)
        int ky = y - kernelY;

        if (ky < 0)
        {
            ky += fftH;
        }

        int kx = x - kernelX;

        if (kx < 0)
        {
            kx += fftW;
        }

        d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
// Copy the dataH x dataW image into the fftH x fftW FFT buffer.  The apron of
// size (kernelY, kernelX) just past the image replicates the nearest edge
// pixel; any remaining padding wraps back to row/column 0.
// Expects a 2D launch covering fftH x fftW; excess threads are guarded.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void padDataClampToBorder_kernel(
    float *d_Dst,
    float *d_Src,
    int fftH,
    int fftW,
    int dataH,
    int dataW,
    int kernelH,
    int kernelW,
    int kernelY,
    int kernelX
)
{
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int borderH = dataH + kernelY;
    const int borderW = dataW + kernelX;

    if (y < fftH && x < fftW)
    {
        int dy, dx;

        //Inside the image: read the pixel itself
        if (y < dataH)
        {
            dy = y;
        }

        if (x < dataW)
        {
            dx = x;
        }

        //Bottom/right apron: clamp to the last image row/column
        if (y >= dataH && y < borderH)
        {
            dy = dataH - 1;
        }

        if (x >= dataW && x < borderW)
        {
            dx = dataW - 1;
        }

        //Remaining padding: wrap to the first row/column
        if (y >= borderH)
        {
            dy = 0;
        }

        if (x >= borderW)
        {
            dx = 0;
        }

        d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
// a = c * (a * b), complex multiplication; result written back into a.
inline __device__ void mulAndScale(fComplex &a, const fComplex &b, const float &c)
{
    fComplex t = {c *(a.x * b.x - a.y * b.y), c *(a.y * b.x + a.x * b.y)};
    a = t;
}
// Pointwise multiply the spectrum in d_Dst by the spectrum in d_Src and scale
// by c (the 1 / FFT-size normalization factor); result overwrites d_Dst.
// 1D launch over dataSize elements; excess threads return early.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void modulateAndNormalize_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    int dataSize,
    float c
)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i >= dataSize)
    {
        return;
    }

    fComplex a = d_Src[i];
    fComplex b = d_Dst[i];

    mulAndScale(a, b, c);

    d_Dst[i] = a;
}
////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
#if(USE_TEXTURE)
//Texture path: one- and two-source complex accessors.  The host must bind the
//matching SET_FCOMPLEX_BASE* macro before launching a kernel that uses them.
texture<fComplex, 1, cudaReadModeElementType> texComplexA;
texture<fComplex, 1, cudaReadModeElementType> texComplexB;
#define    LOAD_FCOMPLEX(i) tex1Dfetch(texComplexA, i)
#define  LOAD_FCOMPLEX_A(i) tex1Dfetch(texComplexA, i)
#define  LOAD_FCOMPLEX_B(i) tex1Dfetch(texComplexB, i)

#define   SET_FCOMPLEX_BASE checkCudaErrors( cudaBindTexture(0, texComplexA, d_Src) )
#define SET_FCOMPLEX_BASE_A checkCudaErrors( cudaBindTexture(0, texComplexA, d_SrcA) )
#define SET_FCOMPLEX_BASE_B checkCudaErrors( cudaBindTexture(0, texComplexB, d_SrcB) )
#else
//Plain global-memory path: direct reads, no binding needed.
#define    LOAD_FCOMPLEX(i) d_Src[i]
#define  LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define  LOAD_FCOMPLEX_B(i) d_SrcB[i]

#define   SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#endif
// Postprocessing butterfly: combines the symmetric pair (D1, D2) of a packed
// complex FFT with the given twiddle factor, writing both results in place.
// Used to convert C2C output into the R2C spectrum layout.
inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle)
{
    float A1 = 0.5f * (D1.x + D2.x);
    float B1 = 0.5f * (D1.y - D2.y);
    float A2 = 0.5f * (D1.y + D2.y);
    float B2 = 0.5f * (D1.x - D2.x);

    D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
    D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
    D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
    D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
}
//Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
// Preprocessing butterfly (inverse of spPostprocessC2C up to the factor of 2):
// recombines the symmetric pair (D1, D2) with the twiddle factor in place.
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle)
{
    //0.5f factors intentionally dropped — see the premultiply note above
    float A1 = /* 0.5f * */ (D1.x + D2.x);
    float B1 = /* 0.5f * */ (D1.y - D2.y);
    float A2 = /* 0.5f * */ (D1.y + D2.y);
    float B2 = /* 0.5f * */ (D1.x - D2.x);

    D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
    D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
    D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
    D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
}
// twiddle = exp(j * phase): x = cos(phase), y = sin(phase).
// Uses the fast (reduced-precision) __sincosf intrinsic.
inline __device__ void getTwiddle(fComplex &twiddle, float phase)
{
    __sincosf(phase, &twiddle.y, &twiddle.x);
}
// Negate an index modulo DA without a division.
inline __device__ uint mod(uint a, uint DA)
{
    //(DA - a) % DA, assuming a <= DA
    return a ? (DA - a) : a;
}
// Divide out all factors of two from n: log2N receives the number of factors
// removed, the return value is the remaining odd part (0 when n == 0).
// The explicit n == 0 guard is required — without it the visible loop
// condition (0 % 2 == 0) never becomes false and the function spins forever.
static inline uint factorRadix2(uint &log2N, uint n)
{
    if (!n)
    {
        log2N = 0;
        return 0;
    }
    else
    {
        for (log2N = 0; n % 2 == 0; n /= 2, log2N++);

        return n;
    }
}
// In place: rem = dividend % divisor; dividend /= divisor.
// With POWER_OF_TWO the divisor MUST be a power of two — the mask/shift fast
// path (__ffs finds the single set bit) replaces the expensive div/mod.
// Fixes the mojibake parameter name "÷nd" (an "&divide;" entity mis-decode
// of "&dividend") and restores the dropped preprocessor conditional.
inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem)
{
#if(!POWER_OF_TWO)
    rem = dividend % divisor;
    dividend /= divisor;
#else
    rem = dividend & (divisor - 1);
    dividend >>= (__ffs(divisor) - 1);
#endif
}
// Batched 2D postprocessing: converts packed C2C FFT output (DY x DX per
// batch) into the padded R2C-style layout in d_Dst, processing symmetric
// element pairs.  1D launch over threadCount threads, one thread per
// (batch, row, column-in-lower-half) triple.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void spPostprocess2D_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    uint DY,
    uint DX,
    uint threadCount,
    uint padding,
    float phaseBase
)
{
    const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadId >= threadCount)
    {
        return;
    }

    //Decompose the flat thread id into (batch i, row y, column x)
    uint x, y, i = threadId;
    udivmod(i, DX / 2, x);
    udivmod(i, DY, y);

    const uint srcOffset = i * DY * DX;
    const uint dstOffset = i * DY * (DX + padding);

    //Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
    {
        const uint loadPos1 = srcOffset + y * DX + x;
        const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
        const uint storePos1 = dstOffset + y * (DX + padding) + x;
        const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        fComplex twiddle;
        getTwiddle(twiddle, phaseBase * (float)x);
        spPostprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }

    //Process x = DX / 2 separately (self-paired column); only the x == 0
    //thread of each row handles it to avoid duplicate writes
    if (x == 0)
    {
        const uint loadPos1 = srcOffset + y * DX + DX / 2;
        const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;
        const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;
        const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        //twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
        fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
        spPostprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }
}
// Batched 2D preprocessing: converts the padded R2C-style layout in d_Src
// back into packed C2C input (DY x DX per batch) in d_Dst, processing
// symmetric element pairs.  1D launch over threadCount threads.
// NOTE(review): parameter list reconstructed from the visible body — confirm
// against the host-side launch wrapper.
__global__ void spPreprocess2D_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    uint DY,
    uint DX,
    uint threadCount,
    uint padding,
    float phaseBase
)
{
    const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadId >= threadCount)
    {
        return;
    }

    //Decompose the flat thread id into (batch i, row y, column x)
    uint x, y, i = threadId;
    udivmod(i, DX / 2, x);
    udivmod(i, DY, y);

    //Avoid overwrites in columns 0 and DX / 2 by different threads (lower and upper halves)
    if ((x == 0) && (y > DY / 2))
    {
        return;
    }

    const uint srcOffset = i * DY * (DX + padding);
    const uint dstOffset = i * DY * DX;

    //Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
    {
        const uint loadPos1 = srcOffset + y * (DX + padding) + x;
        const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);
        const uint storePos1 = dstOffset + y * DX + x;
        const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        fComplex twiddle;
        getTwiddle(twiddle, phaseBase * (float)x);
        spPreprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }

    //Process x = DX / 2 separately (self-paired column); only the x == 0
    //thread of each row handles it
    if (x == 0)
    {
        const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;
        const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;
        const uint storePos1 = dstOffset + y * DX + DX / 2;
        const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;

        fComplex D1 = LOAD_FCOMPLEX(loadPos1);
        fComplex D2 = LOAD_FCOMPLEX(loadPos2);

        //twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
        fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
        spPreprocessC2C(D1, D2, twiddle);

        d_Dst[storePos1] = D1;
        d_Dst[storePos2] = D2;
    }
}
381 ////////////////////////////////////////////////////////////////////////////////
382 // Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D
383 ////////////////////////////////////////////////////////////////////////////////
// Fused kernel: R2C-postprocess the data spectrum (A) and the kernel
// spectrum (B), multiply them with scale c, then C2R-preprocess the product —
// all in one pass over symmetric element pairs, writing into d_Dst.
// NOTE(review): this extract is missing lines (parameter list, braces, the
// final stores); the comments below describe only the visible statements.
384 __global__ void spProcess2D_kernel(
395     const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
// Guard against excess threads of the 1D launch.
397     if (threadId >= threadCount)
// Decompose the flat id into (batch i, row y, column x); the x decomposition
// step is not visible here — presumably udivmod(i, DX, x) precedes this line.
402     uint x, y, i = threadId;
404     udivmod(i, DY / 2, y);
406     const uint offset = i * DY * DX;
408     //Avoid overwrites in rows 0 and DY / 2 by different threads (left and right halves)
409     //Otherwise correctness for in-place transformations is affected
410     if ((y == 0) && (x > DX / 2))
417     //Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
// pos2 is the point-symmetric partner of pos1 within the batch.
419         const uint pos1 = offset + y * DX + x;
420         const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);
// D* = data spectrum pair (source A), K* = kernel spectrum pair (source B).
422         fComplex D1 = LOAD_FCOMPLEX_A(pos1);
423         fComplex D2 = LOAD_FCOMPLEX_A(pos2);
424         fComplex K1 = LOAD_FCOMPLEX_B(pos1);
425         fComplex K2 = LOAD_FCOMPLEX_B(pos2);
426         getTwiddle(twiddle, phaseBase * (float)x);
// Unpack both spectra, multiply-and-scale pointwise, repack the product.
428         spPostprocessC2C(D1, D2, twiddle);
429         spPostprocessC2C(K1, K2, twiddle);
430         mulAndScale(D1, K1, c);
431         mulAndScale(D2, K2, c);
432         spPreprocessC2C(D1, D2, twiddle);
// Second section: the self-paired row y = DY / 2, same pipeline.
440         const uint pos1 = offset + (DY / 2) * DX + x;
441         const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);
443         fComplex D1 = LOAD_FCOMPLEX_A(pos1);
444         fComplex D2 = LOAD_FCOMPLEX_A(pos2);
445         fComplex K1 = LOAD_FCOMPLEX_B(pos1);
446         fComplex K2 = LOAD_FCOMPLEX_B(pos2);
448         spPostprocessC2C(D1, D2, twiddle);
449         spPostprocessC2C(K1, K2, twiddle);
450         mulAndScale(D1, K1, c);
451         mulAndScale(D2, K2, c);
452         spPreprocessC2C(D1, D2, twiddle);