2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
13 **************************************************************************
14 * \file dct8x8_kernel2.cu
15 * \brief Contains 2nd kernel implementations of DCT and IDCT routines, used in
16 * JPEG internal data processing. Optimized device code.
 * This code applies the traditional forward and inverse Discrete Cosine
 * Transform to 8x8 blocks of image pixels, as in the JPEG standard.
 * The data processing is done using floating-point representation.
 * The routine that performs quantization of coefficients can be found in
 * the dct8x8_kernel_quantization.cu file.
/**
 *  Rotation coefficients used by both the forward and the inverse 8-point DCT.
 *  Each one equals (2^0.5) * cos(k * pi / 16) for the k shown beside it.
 */
#define C_a 1.387039845322148f //!< a = (2^0.5) * cos(    pi / 16)
#define C_b 1.306562964876377f //!< b = (2^0.5) * cos(    pi /  8)
#define C_c 1.175875602419359f //!< c = (2^0.5) * cos(3 * pi / 16)
#define C_d 0.785694958387102f //!< d = (2^0.5) * cos(5 * pi / 16)
#define C_e 0.541196100146197f //!< e = (2^0.5) * cos(3 * pi /  8)
#define C_f 0.275899379282943f //!< f = (2^0.5) * cos(7 * pi / 16)

/**
 *  Normalization constant applied to every output of the forward and inverse DCT
 */
#define C_norm 0.3535533905932737f // 1 / (8^0.5)

/**
 *  Width, in elements, of the data tile handled by one thread block (2nd kernel)
 */
#define KER2_BLOCK_WIDTH 32

/**
 *  Height, in elements, of the data tile handled by one thread block (2nd kernel)
 */
#define KER2_BLOCK_HEIGHT 16

/**
 *  LOG2 of KER2_BLOCK_WIDTH (2nd kernel)
 */
#define KER2_BW_LOG2 5

/**
 *  LOG2 of KER2_BLOCK_HEIGHT (2nd kernel)
 */
#define KER2_BH_LOG2 4

/**
 *  Row stride, in floats, of the shared memory tile (2nd kernel).
 *  One element wider than the tile itself (presumably padding that keeps
 *  column-wise accesses from colliding on shared memory banks).
 */
#define KER2_SMEMBLOCK_STRIDE (KER2_BLOCK_WIDTH+1)
/**
 **************************************************************************
 *  Performs an in-place forward DCT of a vector of 8 elements.
 *
 *  The elements live at Vect0[0], Vect0[Step], ..., Vect0[7 * Step]; all eight
 *  are read before any result is written back to the same locations.
 *
 * \param Vect0   [IN/OUT] - Pointer to the first element of the vector
 * \param Step    [IN]     - Distance, in floats, between consecutive elements
 *
 * \return None
 */
__device__ void CUDAsubroutineInplaceDCTvector(float *Vect0, int Step)
{
    // Snapshot the inputs so the stores below never feed back into the math.
    const float in0 = Vect0[0];
    const float in1 = Vect0[1 * Step];
    const float in2 = Vect0[2 * Step];
    const float in3 = Vect0[3 * Step];
    const float in4 = Vect0[4 * Step];
    const float in5 = Vect0[5 * Step];
    const float in6 = Vect0[6 * Step];
    const float in7 = Vect0[7 * Step];

    // First butterfly stage: sums and differences of mirrored pairs.
    const float sum07  = in0 + in7;
    const float sum16  = in1 + in6;
    const float sum25  = in2 + in5;
    const float sum34  = in3 + in4;

    const float diff07 = in0 - in7;
    const float diff61 = in6 - in1;
    const float diff25 = in2 - in5;
    const float diff43 = in4 - in3;

    // Second butterfly stage, feeding the even-indexed outputs only.
    const float outerSum  = sum07 + sum34;
    const float outerDiff = sum07 - sum34;
    const float innerSum  = sum16 + sum25;
    const float innerDiff = sum16 - sum25;

    // Even-indexed coefficients.
    Vect0[0]        = C_norm * (outerSum + innerSum);
    Vect0[2 * Step] = C_norm * (C_b * outerDiff + C_e * innerDiff);
    Vect0[4 * Step] = C_norm * (outerSum - innerSum);
    Vect0[6 * Step] = C_norm * (C_e * outerDiff - C_b * innerDiff);

    // Odd-indexed coefficients.
    Vect0[1 * Step] = C_norm * (C_a * diff07 - C_c * diff61 + C_d * diff25 - C_f * diff43);
    Vect0[3 * Step] = C_norm * (C_c * diff07 + C_f * diff61 - C_a * diff25 + C_d * diff43);
    Vect0[5 * Step] = C_norm * (C_d * diff07 + C_a * diff61 + C_f * diff25 - C_c * diff43);
    Vect0[7 * Step] = C_norm * (C_f * diff07 + C_d * diff61 + C_c * diff25 + C_a * diff43);
}
/**
 **************************************************************************
 *  Performs an in-place inverse DCT of a vector of 8 elements.
 *
 *  The coefficients live at Vect0[0], Vect0[Step], ..., Vect0[7 * Step]; all
 *  eight are read before any result is written back to the same locations.
 *
 * \param Vect0   [IN/OUT] - Pointer to the first element of the vector
 * \param Step    [IN]     - Distance, in floats, between consecutive elements
 *
 * \return None
 */
__device__ void CUDAsubroutineInplaceIDCTvector(float *Vect0, int Step)
{
    // Snapshot the coefficients so the stores below never feed back into the math.
    const float c0 = Vect0[0];
    const float c1 = Vect0[1 * Step];
    const float c2 = Vect0[2 * Step];
    const float c3 = Vect0[3 * Step];
    const float c4 = Vect0[4 * Step];
    const float c5 = Vect0[5 * Step];
    const float c6 = Vect0[6 * Step];
    const float c7 = Vect0[7 * Step];

    // Even-coefficient terms feeding outputs 0, 7, 4 and 3.
    const float sum04     = c0 + c4;
    const float rot26P    = C_b * c2 + C_e * c6;
    const float evenPP    = sum04 + rot26P;
    const float evenPM    = sum04 - rot26P;

    // Odd-coefficient terms feeding outputs 0/7 and 4/3 respectively.
    const float odd07     = C_f * c7 + C_a * c1 + C_c * c3 + C_d * c5;
    const float odd43     = C_a * c7 - C_f * c1 + C_d * c3 - C_c * c5;

    // Even-coefficient terms feeding outputs 1, 6, 2 and 5.
    const float diff04    = c0 - c4;
    const float rot26M    = C_e * c2 - C_b * c6;
    const float evenMP    = diff04 + rot26M;
    const float evenMM    = diff04 - rot26M;

    // Odd-coefficient terms feeding outputs 1/6 and 2/5 respectively.
    const float odd16     = C_c * c1 - C_d * c7 - C_f * c3 - C_a * c5;
    const float odd25     = C_d * c1 + C_c * c7 - C_a * c3 + C_f * c5;

    // Each even/odd pair yields two mirrored outputs (sum and difference).
    Vect0[0]        = C_norm * (evenPP + odd07);
    Vect0[7 * Step] = C_norm * (evenPP - odd07);
    Vect0[4 * Step] = C_norm * (evenPM + odd43);
    Vect0[3 * Step] = C_norm * (evenPM - odd43);

    Vect0[1 * Step] = C_norm * (evenMP + odd16);
    Vect0[5 * Step] = C_norm * (evenMM - odd25);
    Vect0[2 * Step] = C_norm * (evenMM + odd25);
    Vect0[6 * Step] = C_norm * (evenMP - odd16);
}
/**
 **************************************************************************
 *  Performs 8x8 block-wise Forward Discrete Cosine Transform of the given
 *  image plane and outputs result to the array of coefficients. 2nd implementation.
 *
 *  Each thread block stages a KER2_BLOCK_WIDTH x KER2_BLOCK_HEIGHT tile in
 *  shared memory. Within the tile, threadIdx.y selects the 8x8 block
 *  horizontally, threadIdx.z selects it vertically and threadIdx.x selects the
 *  column (load/column pass/store) or the row (row pass) inside that block.
 *
 *  Launch requirements (NOTE(review): derived from the indexing below; confirm
 *  against the host launch code and the definition of BLOCK_SIZE):
 *    - blockDim is expected to be
 *      (BLOCK_SIZE, KER2_BLOCK_WIDTH / BLOCK_SIZE, KER2_BLOCK_HEIGHT / BLOCK_SIZE);
 *    - the kernel performs no bounds checks, so the grid must cover only whole
 *      tiles that lie inside both planes.
 *
 * \param dst         [OUT] - Coefficients plane
 * \param src         [IN]  - Source image plane
 * \param ImgStride   [IN]  - Row stride, in floats, of both src and dst
 *
 * \return None
 */
__global__ void CUDAkernel2DCT(float *dst, float *src, int ImgStride)
{
    __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE];

    int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x;
    int OffsThreadInCol = threadIdx.z * BLOCK_SIZE;
    src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
    dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
    float *bl_ptr = block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow;

    // Each thread copies one column of its 8x8 block into the shared tile.
    for (unsigned int i = 0; i < BLOCK_SIZE; i++)
        bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride];

    // The row pass reads elements loaded by the other threads of the same
    // 8x8 block; implicit warp-synchronous execution is not guaranteed, so an
    // explicit barrier is required. All threads reach it (no divergent flow).
    __syncthreads();

    // process rows: thread x of the block transforms row x of that block
    CUDAsubroutineInplaceDCTvector(block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow - threadIdx.x, 1);

    // The column pass reads row results produced by the other threads.
    __syncthreads();

    // process columns: thread x of the block transforms column x of that block
    CUDAsubroutineInplaceDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE);

    // No barrier needed here: each thread stores the column it just transformed.
    for (unsigned int i = 0; i < BLOCK_SIZE; i++)
        dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE];
}
/**
 **************************************************************************
 *  Performs 8x8 block-wise Inverse Discrete Cosine Transform of the given
 *  coefficients plane and outputs result to the image. 2nd implementation.
 *
 *  Each thread block stages a KER2_BLOCK_WIDTH x KER2_BLOCK_HEIGHT tile in
 *  shared memory. Within the tile, threadIdx.y selects the 8x8 block
 *  horizontally, threadIdx.z selects it vertically and threadIdx.x selects the
 *  column (load/column pass/store) or the row (row pass) inside that block.
 *
 *  Launch requirements (NOTE(review): derived from the indexing below; confirm
 *  against the host launch code and the definition of BLOCK_SIZE):
 *    - blockDim is expected to be
 *      (BLOCK_SIZE, KER2_BLOCK_WIDTH / BLOCK_SIZE, KER2_BLOCK_HEIGHT / BLOCK_SIZE);
 *    - the kernel performs no bounds checks, so the grid must cover only whole
 *      tiles that lie inside both planes.
 *
 * \param dst         [OUT] - Reconstructed image plane
 * \param src         [IN]  - Coefficients plane
 * \param ImgStride   [IN]  - Row stride, in floats, of both src and dst
 *
 * \return None
 */
__global__ void CUDAkernel2IDCT(float *dst, float *src, int ImgStride)
{
    __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE];

    int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x;
    int OffsThreadInCol = threadIdx.z * BLOCK_SIZE;
    src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
    dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow;
    float *bl_ptr = block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow;

    // Each thread copies one column of its 8x8 block into the shared tile.
    for (unsigned int i = 0; i < BLOCK_SIZE; i++)
        bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride];

    // The row pass reads elements loaded by the other threads of the same
    // 8x8 block; implicit warp-synchronous execution is not guaranteed, so an
    // explicit barrier is required. All threads reach it (no divergent flow).
    __syncthreads();

    // process rows: thread x of the block transforms row x of that block
    CUDAsubroutineInplaceIDCTvector(block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow - threadIdx.x, 1);

    // The column pass reads row results produced by the other threads.
    __syncthreads();

    // process columns: thread x of the block transforms column x of that block
    CUDAsubroutineInplaceIDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE);

    // No barrier needed here: each thread stores the column it just transformed.
    for (unsigned int i = 0; i < BLOCK_SIZE; i++)
        dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE];
}