utiltools/X86MAC64/cuda/samples/3_Imaging/dct8x8/dct8x8_kernel2.cuh

   1 /*
   2  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5  * with this source code for terms and conditions that govern your use of
   6  * this software. Any use, reproduction, disclosure, or distribution of
   7  * this software and related documentation outside the terms of the EULA
   8  * is strictly prohibited.
   9  *
  10  */
  11
  12 /**
  13 **************************************************************************
  14 * \file dct8x8_kernel2.cu
  15 * \brief Contains 2nd kernel implementations of DCT and IDCT routines, used in
  16 *        JPEG internal data processing. Optimized device code.
  17 *
  18 * This code implements traditional approach to forward and inverse Discrete
  19 * Cosine Transform to blocks of image pixels (of 8x8 size), as in JPEG standard.
  20 * The data processing is done using floating point representation.
  21 * The routine that performs quantization of coefficients can be found in
  22 * dct8x8_kernel_quantization.cu file.
  23 */
  24
  25 #pragma once
  26
  27 #include "Common.h"
  28
  29
  30 #define C_a 1.387039845322148f //!< a = (2^0.5) * cos(    pi / 16);  Used in forward and inverse DCT.
  31 #define C_b 1.306562964876377f //!< b = (2^0.5) * cos(    pi /  8);  Used in forward and inverse DCT.
  32 #define C_c 1.175875602419359f //!< c = (2^0.5) * cos(3 * pi / 16);  Used in forward and inverse DCT.
  33 #define C_d 0.785694958387102f //!< d = (2^0.5) * cos(5 * pi / 16);  Used in forward and inverse DCT.
  34 #define C_e 0.541196100146197f //!< e = (2^0.5) * cos(3 * pi /  8);  Used in forward and inverse DCT.
  35 #define C_f 0.275899379282943f //!< f = (2^0.5) * cos(7 * pi / 16);  Used in forward and inverse DCT.
  36
  37
  38 /**
  39 *  Normalization constant that is used in forward and inverse DCT
  40 */
  41 #define C_norm 0.3535533905932737f // 1 / (8^0.5)
  42
  43
  44 /**
  45 *  Width of data block (2nd kernel)
  46 */
  47 #define KER2_BLOCK_WIDTH          32
  48
  49
  50 /**
  51 *  Height of data block (2nd kernel)
  52 */
  53 #define KER2_BLOCK_HEIGHT         16
  54
  55
  56 /**
  57 *  LOG2 of width of data block (2nd kernel)
  58 */
  59 #define KER2_BW_LOG2              5
  60
  61
  62 /**
  63 *  LOG2 of height of data block (2nd kernel)
  64 */
  65 #define KER2_BH_LOG2              4
  66
  67
  68 /**
  69 *  Stride of shared memory buffer (2nd kernel)
  70 */
  71 #define KER2_SMEMBLOCK_STRIDE     (KER2_BLOCK_WIDTH+1)
  72
  73
  74 /**
  75 **************************************************************************
  76 *  Performs in-place DCT of vector of 8 elements.
  77 *
  78 * \param Vect0          [IN/OUT] - Pointer to the first element of vector
  79 * \param Step           [IN/OUT] - Value to add to ptr to access other elements
  80 *
  81 * \return None
  82 */
  83 __device__ void CUDAsubroutineInplaceDCTvector(float *Vect0, int Step)
  84 {
  85     float *Vect1 = Vect0 + Step;
  86     float *Vect2 = Vect1 + Step;
  87     float *Vect3 = Vect2 + Step;
  88     float *Vect4 = Vect3 + Step;
  89     float *Vect5 = Vect4 + Step;
  90     float *Vect6 = Vect5 + Step;
  91     float *Vect7 = Vect6 + Step;
  92
  93     float X07P = (*Vect0) + (*Vect7);
  94     float X16P = (*Vect1) + (*Vect6);
  95     float X25P = (*Vect2) + (*Vect5);
  96     float X34P = (*Vect3) + (*Vect4);
  97
  98     float X07M = (*Vect0) - (*Vect7);
  99     float X61M = (*Vect6) - (*Vect1);
 100     float X25M = (*Vect2) - (*Vect5);
 101     float X43M = (*Vect4) - (*Vect3);
 102
 103     float X07P34PP = X07P + X34P;
 104     float X07P34PM = X07P - X34P;
 105     float X16P25PP = X16P + X25P;
 106     float X16P25PM = X16P - X25P;
 107
 108     (*Vect0) = C_norm * (X07P34PP + X16P25PP);
 109     (*Vect2) = C_norm * (C_b * X07P34PM + C_e * X16P25PM);
 110     (*Vect4) = C_norm * (X07P34PP - X16P25PP);
 111     (*Vect6) = C_norm * (C_e * X07P34PM - C_b * X16P25PM);
 112
 113     (*Vect1) = C_norm * (C_a * X07M - C_c * X61M + C_d * X25M - C_f * X43M);
 114     (*Vect3) = C_norm * (C_c * X07M + C_f * X61M - C_a * X25M + C_d * X43M);
 115     (*Vect5) = C_norm * (C_d * X07M + C_a * X61M + C_f * X25M - C_c * X43M);
 116     (*Vect7) = C_norm * (C_f * X07M + C_d * X61M + C_c * X25M + C_a * X43M);
 117 }
 118
 119
 120 /**
 121 **************************************************************************
 122 *  Performs in-place IDCT of vector of 8 elements.
 123 *
 124 * \param Vect0          [IN/OUT] - Pointer to the first element of vector
 125 * \param Step           [IN/OUT] - Value to add to ptr to access other elements
 126 *
 127 * \return None
 128 */
 129 __device__ void CUDAsubroutineInplaceIDCTvector(float *Vect0, int Step)
 130 {
 131     float *Vect1 = Vect0 + Step;
 132     float *Vect2 = Vect1 + Step;
 133     float *Vect3 = Vect2 + Step;
 134     float *Vect4 = Vect3 + Step;
 135     float *Vect5 = Vect4 + Step;
 136     float *Vect6 = Vect5 + Step;
 137     float *Vect7 = Vect6 + Step;
 138
 139     float Y04P   = (*Vect0) + (*Vect4);
 140     float Y2b6eP = C_b * (*Vect2) + C_e * (*Vect6);
 141
 142     float Y04P2b6ePP = Y04P + Y2b6eP;
 143     float Y04P2b6ePM = Y04P - Y2b6eP;
 144     float Y7f1aP3c5dPP = C_f * (*Vect7) + C_a * (*Vect1) + C_c * (*Vect3) + C_d * (*Vect5);
 145     float Y7a1fM3d5cMP = C_a * (*Vect7) - C_f * (*Vect1) + C_d * (*Vect3) - C_c * (*Vect5);
 146
 147     float Y04M   = (*Vect0) - (*Vect4);
 148     float Y2e6bM = C_e * (*Vect2) - C_b * (*Vect6);
 149
 150     float Y04M2e6bMP = Y04M + Y2e6bM;
 151     float Y04M2e6bMM = Y04M - Y2e6bM;
 152     float Y1c7dM3f5aPM = C_c * (*Vect1) - C_d * (*Vect7) - C_f * (*Vect3) - C_a * (*Vect5);
 153     float Y1d7cP3a5fMM = C_d * (*Vect1) + C_c * (*Vect7) - C_a * (*Vect3) + C_f * (*Vect5);
 154
 155     (*Vect0) = C_norm * (Y04P2b6ePP + Y7f1aP3c5dPP);
 156     (*Vect7) = C_norm * (Y04P2b6ePP - Y7f1aP3c5dPP);
 157     (*Vect4) = C_norm * (Y04P2b6ePM + Y7a1fM3d5cMP);
 158     (*Vect3) = C_norm * (Y04P2b6ePM - Y7a1fM3d5cMP);
 159
 160     (*Vect1) = C_norm * (Y04M2e6bMP + Y1c7dM3f5aPM);
 161     (*Vect5) = C_norm * (Y04M2e6bMM - Y1d7cP3a5fMM);
 162     (*Vect2) = C_norm * (Y04M2e6bMM + Y1d7cP3a5fMM);
 163     (*Vect6) = C_norm * (Y04M2e6bMP - Y1c7dM3f5aPM);
 164 }
 165
 166
 167 /**
 168 **************************************************************************
 169 *  Performs 8x8 block-wise Forward Discrete Cosine Transform of the given
 170 *  image plane and outputs result to the array of coefficients. 2nd implementation.
 171 *  This kernel is designed to process image by blocks of blocks8x8 that
 172 *  utilizes maximum warps capacity, assuming that it is enough of 8 threads
 173 *  per block8x8.
 174 *
 175 * \param SrcDst                     [OUT] - Coefficients plane
 176 * \param ImgStride                  [IN] - Stride of SrcDst
 177 *
 178 * \return None
 179 */
 180
 181 __global__ void CUDAkernel2DCT(float *dst, float *src, int ImgStride)
 182 {
 183     __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE];
 184
 185     int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x;
 186     int OffsThreadInCol = threadIdx.z * BLOCK_SIZE;
 187     src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
 188     dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
 189     float *bl_ptr = block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow;
 190
 191 #pragma unroll
 192
 193     for (unsigned int i = 0; i < BLOCK_SIZE; i++)
 194         bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride];
 195
 196     //process rows
 197     CUDAsubroutineInplaceDCTvector(block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow - threadIdx.x, 1);
 198
 199     //process columns
 200     CUDAsubroutineInplaceDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE);
 201
 202     for (unsigned int i = 0; i < BLOCK_SIZE; i++)
 203         dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE];
 204 }
 205
 206
 207 /**
 208 **************************************************************************
 209 *  Performs 8x8 block-wise Inverse Discrete Cosine Transform of the given
 210 *  coefficients plane and outputs result to the image. 2nd implementation.
 211 *  This kernel is designed to process image by blocks of blocks8x8 that
 212 *  utilizes maximum warps capacity, assuming that it is enough of 8 threads
 213 *  per block8x8.
 214 *
 215 * \param SrcDst                     [OUT] - Coefficients plane
 216 * \param ImgStride                  [IN] - Stride of SrcDst
 217 *
 218 * \return None
 219 */
 220
 221 __global__ void CUDAkernel2IDCT(float *dst, float *src, int ImgStride)
 222 {
 223     __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE];
 224
 225     int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x;
 226     int OffsThreadInCol = threadIdx.z * BLOCK_SIZE;
 227     src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
 228     dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
 229     float *bl_ptr = block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow;
 230
 231 #pragma unroll
 232
 233     for (unsigned int i = 0; i < BLOCK_SIZE; i++)
 234         bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride];
 235
 236     //process rows
 237     CUDAsubroutineInplaceIDCTvector(block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow - threadIdx.x, 1);
 238
 239     //process columns
 240     CUDAsubroutineInplaceIDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE);
 241
 242     for (unsigned int i = 0; i < BLOCK_SIZE; i++)
 243         dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE];
 244 }