OSDN Git Service

optimize DCT coefficients decoding.
author: Noumi Akira <noumiakira@users.sourceforge.jp>
Wed, 9 Sep 2009 02:58:09 +0000 (11:58 +0900)
committer: Noumi Akira <noumiakira@users.sourceforge.jp>
Wed, 9 Sep 2009 02:58:09 +0000 (11:58 +0900)
Lib/QTheoraEx/FrameReconstructor_MMX.c
Lib/QTheoraEx/FrameReconstructor_SSE2.c

index adab225..6533ce6 100644 (file)
@@ -529,17 +529,6 @@ static __inline void IDCT_C_8_4_MMX(
 
 /* */
 
-static const UINT8 TZZ[64] = {
-        0,  2,  3,  9, 10, 20, 21, 35,
-        1,  4,  8, 11, 19, 22, 34, 36,
-        5,  7, 12, 18, 23, 33, 37, 48,
-        6, 13, 17, 24, 32, 38, 47, 49,
-       14, 16, 25, 31, 39, 46, 50, 57,
-       15, 26, 30, 40, 45, 51, 56, 58,
-       27, 29, 41, 44, 52, 55, 59, 62,
-       28, 42, 43, 53, 54, 60, 61, 63
-};
-
 static __inline void DequantizeIDCT8x8_MMX(
        const INT16* block,
        const INT16* matrix,
@@ -547,34 +536,30 @@ static __inline void DequantizeIDCT8x8_MMX(
 {
        ALIGN(0x10) INT16 c0[64];
 
-       { /* Reorder */
-               const UINT8* t = TZZ;
-
-               INT16* c = c0;
-               INT16* e = c + 64;
-               for (; c < e; c += 8, t += 8) {
-                       c[0] = block[t[0]];
-                       c[1] = block[t[1]];
-                       c[2] = block[t[2]];
-                       c[3] = block[t[3]];
-                       c[4] = block[t[4]];
-                       c[5] = block[t[5]];
-                       c[6] = block[t[6]];
-                       c[7] = block[t[7]];
-               }
-       }
-
        { /* Dequantize */
-               __m64*       d = (__m64*) c0;
-               __m64*       e = (__m64*)(c0 + 64);
+               const __m64* b = (const __m64*)block;
                const __m64* m = (const __m64*)matrix;
+               __m64*       d = (__m64*) c0;
 
-               for (; d < e; d += 4, m += 4) {
-                       d[0] = _mm_mullo_pi16(d[0], m[0]);
-                       d[1] = _mm_mullo_pi16(d[1], m[1]);
-                       d[2] = _mm_mullo_pi16(d[2], m[2]);
-                       d[3] = _mm_mullo_pi16(d[3], m[3]);
-               }
+               d[ 0] = _mm_mullo_pi16(b[ 0], m[ 0]);
+               d[ 1] = _mm_mullo_pi16(b[ 1], m[ 1]);
+               d[ 2] = _mm_mullo_pi16(b[ 2], m[ 2]);
+               d[ 3] = _mm_mullo_pi16(b[ 3], m[ 3]);
+
+               d[ 4] = _mm_mullo_pi16(b[ 4], m[ 4]);
+               d[ 5] = _mm_mullo_pi16(b[ 5], m[ 5]);
+               d[ 6] = _mm_mullo_pi16(b[ 6], m[ 6]);
+               d[ 7] = _mm_mullo_pi16(b[ 7], m[ 7]);
+
+               d[ 8] = _mm_mullo_pi16(b[ 8], m[ 8]);
+               d[ 9] = _mm_mullo_pi16(b[ 9], m[ 9]);
+               d[10] = _mm_mullo_pi16(b[10], m[10]);
+               d[11] = _mm_mullo_pi16(b[11], m[11]);
+
+               d[12] = _mm_mullo_pi16(b[12], m[12]);
+               d[13] = _mm_mullo_pi16(b[13], m[13]);
+               d[14] = _mm_mullo_pi16(b[14], m[14]);
+               d[15] = _mm_mullo_pi16(b[15], m[15]);
        }
 
        /* iDCT Row */
@@ -596,36 +581,15 @@ static __inline void DequantizeIDCT8x8_16_MMX(
 {
        ALIGN(0x10) INT16 c0[64];
 
-       const __m64 z = _mm_setzero_si64();
-
-       *((__m64*)(c0 + 0x00)) = z;
-       *((__m64*)(c0 + 0x08)) = z;
-       *((__m64*)(c0 + 0x10)) = z;
-       *((__m64*)(c0 + 0x18)) = z;
-
-       /* Reorder */
-       c0[ 0 + 0] = block[TZZ[ 0 + 0]];
-       c0[ 0 + 1] = block[TZZ[ 0 + 1]];
-       c0[ 0 + 2] = block[TZZ[ 0 + 2]];
-       c0[ 0 + 3] = block[TZZ[ 0 + 3]];
-
-       c0[ 8 + 0] = block[TZZ[ 8 + 0]];
-       c0[ 8 + 1] = block[TZZ[ 8 + 1]];
-       c0[ 8 + 2] = block[TZZ[ 8 + 2]];
-
-       c0[16 + 0] = block[TZZ[16 + 0]];
-       c0[16 + 1] = block[TZZ[16 + 1]];
-
-       c0[24 + 0] = block[TZZ[24 + 0]];
-
        { /* Dequantize */
+               const __m64* b = (const __m64*)block;
                const __m64* m = (const __m64*)matrix;
                __m64*       d = (__m64*)c0;
 
-               d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
-               d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
-               d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
-               d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
+               d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]);
+               d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]);
+               d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]);
+               d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]);
        }
 
        /* iDCT Row */
@@ -679,28 +643,50 @@ struct DecodeCoefficientsContext {
 
 typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
 
+ALIGN(0x10) static const UINT8 IZZ[64] = {
+        0,  8,  1,  2,  9, 16, 24, 17,
+       10,  3,  4, 11, 18, 25, 32, 40,
+       33, 26, 19, 12,  5,  6, 13, 20,
+       27, 34, 41, 48, 56, 49, 42, 35,
+       28, 21, 14,  7, 15, 22, 29, 36,
+       43, 50, 57, 58, 51, 44, 37, 30,
+       23, 31, 38, 45, 52, 59, 60, 53,
+       46, 39, 47, 54, 61, 62, 55, 63
+};
+
 static INT32 DecodeCoefficients_MMX(
        FrameDecoder_t*              t,
        DecodeCoefficientsContext_t* ctx,
        INT16*                       block)
 {
-       INT16* b = block;
-       INT16* e = b + 64;
+       const INT8* bi = IZZ;
+       const INT8* ei = IZZ + 64;
 
        DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
 
        const __m64 z = _mm_setzero_si64();
 
-       for (; b < e; b += 16) {
-               *((__m64*)(b +  0)) = z;
-               *((__m64*)(b +  4)) = z;
-               *((__m64*)(b +  8)) = z;
-               *((__m64*)(b + 12)) = z;
-       }
+       *((__m64*)(block + 0x00)) = z;
+       *((__m64*)(block + 0x04)) = z;
+       *((__m64*)(block + 0x08)) = z;
+       *((__m64*)(block + 0x0c)) = z;
+
+       *((__m64*)(block + 0x10)) = z;
+       *((__m64*)(block + 0x14)) = z;
+       *((__m64*)(block + 0x18)) = z;
+       *((__m64*)(block + 0x1c)) = z;
 
-       b = block;
+       *((__m64*)(block + 0x20)) = z;
+       *((__m64*)(block + 0x24)) = z;
+       *((__m64*)(block + 0x28)) = z;
+       *((__m64*)(block + 0x2c)) = z;
 
-       while (b < e) {
+       *((__m64*)(block + 0x30)) = z;
+       *((__m64*)(block + 0x34)) = z;
+       *((__m64*)(block + 0x38)) = z;
+       *((__m64*)(block + 0x3c)) = z;
+
+       while (bi < ei) {
                if (leaf->EOB_Run > 0) {
                        leaf->EOB_Run -= 1;
                        break;
@@ -713,19 +699,16 @@ static INT32 DecodeCoefficients_MMX(
                                leaf->EOB_Run = coeff;
 
                        } else {
-                               b += run;
-                               if (b >= e) {
-                                       break;
-                               }
+                               bi += run;
 
-                               *(b++) = coeff;
+                               block[*(bi++)] = coeff;
 
-                               leaf = ctx->Leaf + (b - block);
+                               leaf = ctx->Leaf + (bi - IZZ);
                        }
                }
        }
 
-       return b - block;
+       return bi - IZZ;
 }
 
 /* */
@@ -741,7 +724,7 @@ static void Reconstruct_IntraBlock(
        Plane_t*                     r,
        DecodeCoefficientsContext_t* ctx)
 {
-       ALIGN(0x10) INT16 block[64];
+       ALIGN(0x10) INT16 block[64 + 64];
        ALIGN(0x10) INT16 coeff[64];
 
        const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];
@@ -783,7 +766,7 @@ static void Reconstruct_InterBlock(
        Plane_t*                     r,
        DecodeCoefficientsContext_t* ctx)
 {
-       ALIGN(0x10) INT16 block[64];
+       ALIGN(0x10) INT16 block[64 + 64];
        ALIGN(0x10) INT16 coeff[64];
 
        const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];
index 1fec564..db52246 100644 (file)
@@ -537,17 +537,6 @@ static __inline void Transpose_U_SSE2(
 
 /* */
 
-static const UINT8 TZZ[64] = {
-        0,  2,  3,  9, 10, 20, 21, 35,
-        1,  4,  8, 11, 19, 22, 34, 36,
-        5,  7, 12, 18, 23, 33, 37, 48,
-        6, 13, 17, 24, 32, 38, 47, 49,
-       14, 16, 25, 31, 39, 46, 50, 57,
-       15, 26, 30, 40, 45, 51, 56, 58,
-       27, 29, 41, 44, 52, 55, 59, 62,
-       28, 42, 43, 53, 54, 60, 61, 63
-};
-
 static __inline void DequantizeIDCT8x8_SSE2(
        const INT16* block,
        const INT16* matrix,
@@ -555,35 +544,19 @@ static __inline void DequantizeIDCT8x8_SSE2(
 {
        ALIGN(0x10) INT16 c0[64];
 
-       { /* Reorder */
-               const UINT8* t = TZZ;
-
-               INT16* c = c0;
-               INT16* e = c + 64;
-               for (; c < e; c += 8, t += 8) {
-                       c[0] = block[t[0]];
-                       c[1] = block[t[1]];
-                       c[2] = block[t[2]];
-                       c[3] = block[t[3]];
-                       c[4] = block[t[4]];
-                       c[5] = block[t[5]];
-                       c[6] = block[t[6]];
-                       c[7] = block[t[7]];
-               }
-       }
-
        { /* Dequantize */
+               const __m128i* b = (const __m128i*)block;
                const __m128i* m = (const __m128i*)matrix;
                __m128i*       d = (__m128i*)c0;
 
-               d[0] = _mm_mullo_epi16(d[0], m[0]);
-               d[1] = _mm_mullo_epi16(d[1], m[1]);
-               d[2] = _mm_mullo_epi16(d[2], m[2]);
-               d[3] = _mm_mullo_epi16(d[3], m[3]);
-               d[4] = _mm_mullo_epi16(d[4], m[4]);
-               d[5] = _mm_mullo_epi16(d[5], m[5]);
-               d[6] = _mm_mullo_epi16(d[6], m[6]);
-               d[7] = _mm_mullo_epi16(d[7], m[7]);
+               d[0] = _mm_mullo_epi16(b[0], m[0]);
+               d[1] = _mm_mullo_epi16(b[1], m[1]);
+               d[2] = _mm_mullo_epi16(b[2], m[2]);
+               d[3] = _mm_mullo_epi16(b[3], m[3]);
+               d[4] = _mm_mullo_epi16(b[4], m[4]);
+               d[5] = _mm_mullo_epi16(b[5], m[5]);
+               d[6] = _mm_mullo_epi16(b[6], m[6]);
+               d[7] = _mm_mullo_epi16(b[7], m[7]);
        }
 
        /* iDCT Row */
@@ -605,36 +578,15 @@ static __inline void DequantizeIDCT8x8_16_SSE2(
 {
        ALIGN(0x10) INT16 c0[64];
 
-       const __m128i z = _mm_setzero_si128();
-
-       _mm_store_si128((__m128i*)(c0 + 0x00), z);
-       _mm_store_si128((__m128i*)(c0 + 0x08), z);
-       _mm_store_si128((__m128i*)(c0 + 0x10), z);
-       _mm_store_si128((__m128i*)(c0 + 0x18), z);
-
-       /* Reorder */
-       c0[ 0 + 0] = block[TZZ[ 0 + 0]];
-       c0[ 0 + 1] = block[TZZ[ 0 + 1]];
-       c0[ 0 + 2] = block[TZZ[ 0 + 2]];
-       c0[ 0 + 3] = block[TZZ[ 0 + 3]];
-
-       c0[ 8 + 0] = block[TZZ[ 8 + 0]];
-       c0[ 8 + 1] = block[TZZ[ 8 + 1]];
-       c0[ 8 + 2] = block[TZZ[ 8 + 2]];
-
-       c0[16 + 0] = block[TZZ[16 + 0]];
-       c0[16 + 1] = block[TZZ[16 + 1]];
-
-       c0[24 + 0] = block[TZZ[24 + 0]];
-
        { /* Dequantize */
+               const __m64* b = (__m64*)block;
                const __m64* m = (const __m64*)matrix;
                __m64*       d = (__m64*)c0;
 
-               d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
-               d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
-               d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
-               d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
+               d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]);
+               d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]);
+               d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]);
+               d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]);
        }
 
        /* iDCT Row */
@@ -688,13 +640,24 @@ struct DecodeCoefficientsContext {
 
 typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
 
+ALIGN(0x10) static const UINT8 IZZ[64] = {
+        0,  8,  1,  2,  9, 16, 24, 17,
+       10,  3,  4, 11, 18, 25, 32, 40,
+       33, 26, 19, 12,  5,  6, 13, 20,
+       27, 34, 41, 48, 56, 49, 42, 35,
+       28, 21, 14,  7, 15, 22, 29, 36,
+       43, 50, 57, 58, 51, 44, 37, 30,
+       23, 31, 38, 45, 52, 59, 60, 53,
+       46, 39, 47, 54, 61, 62, 55, 63
+};
+
 static INT32 DecodeCoefficients_SSE2(
        FrameDecoder_t*              t,
        DecodeCoefficientsContext_t* ctx,
        INT16*                       block)
 {
-       INT16* b = block;
-       INT16* e = b + 64;
+       const INT8* bi = IZZ;
+       const INT8* ei = IZZ + 64;
 
        DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
 
@@ -709,7 +672,7 @@ static INT32 DecodeCoefficients_SSE2(
        _mm_store_si128((__m128i*)(block + 0x30), z);
        _mm_store_si128((__m128i*)(block + 0x38), z);
 
-       while (b < e) {
+       while (bi < ei) {
                if (leaf->EOB_Run > 0) {
                        leaf->EOB_Run -= 1;
                        break;
@@ -722,19 +685,16 @@ static INT32 DecodeCoefficients_SSE2(
                                leaf->EOB_Run = coeff;
 
                        } else {
-                               b += run;
-                               if (b >= e) {
-                                       break;
-                               }
+                               bi += run;
 
-                               *(b++) = coeff;
+                               block[*(bi++)] = coeff;
 
-                               leaf = ctx->Leaf + (b - block);
+                               leaf = ctx->Leaf + (bi - IZZ);
                        }
                }
        }
 
-       return b - block;
+       return bi - IZZ;
 }
 
 /* */
@@ -750,7 +710,7 @@ static void Reconstruct_IntraBlock(
        Plane_t*                     r,
        DecodeCoefficientsContext_t* ctx)
 {
-       ALIGN(0x10) INT16 block[64];
+       ALIGN(0x10) INT16 block[64 + 64];
        ALIGN(0x10) INT16 coeff[64];
 
        const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];
@@ -790,7 +750,7 @@ static void Reconstruct_InterBlock(
        Plane_t*                     r,
        DecodeCoefficientsContext_t* ctx)
 {
-       ALIGN(0x10) INT16 block[64];
+       ALIGN(0x10) INT16 block[64 + 64];
        ALIGN(0x10) INT16 coeff[64];
 
        const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];