optimize Block DCT coefficients decoding.

author Noumi Akira <noumiakira@users.sourceforge.jp>

Wed, 8 Jul 2009 04:50:13 +0000 (13:50 +0900)

committer Noumi Akira <noumiakira@users.sourceforge.jp>

Wed, 8 Jul 2009 04:50:13 +0000 (13:50 +0900)
author Noumi Akira <noumiakira@users.sourceforge.jp>
Wed, 8 Jul 2009 04:50:13 +0000 (13:50 +0900)
committer Noumi Akira <noumiakira@users.sourceforge.jp>
Wed, 8 Jul 2009 04:50:13 +0000 (13:50 +0900)
diff --git a/Lib/QTheoraEx/FrameReconstructor_SSE2.c b/Lib/QTheoraEx/FrameReconstructor_SSE2.c

index bfc08a6..edd3ee2 100644 (file)
--- a/Lib/QTheoraEx/FrameReconstructor_SSE2.c
+++ b/Lib/QTheoraEx/FrameReconstructor_SSE2.c
@@ -320,6 +320,63 @@ static __inline void IDCT_R_8_SSE2(
         Y[7] = _mm_sub_epi16(t0, t7);
  }
  
+static __inline void IDCT_R_8_4_SSE2( /* Row iDCT variant that reads only X[0]..X[3] of the input and writes all of Y[0]..Y[7]. */
+       const INT16* x, /* input, accessed as __m128i vectors; only the first four are read */
+       INT16*       y) /* output, accessed as __m128i vectors; all eight are written */
+{
+       const __m128i* C = (const __m128i*)COS[0]; /* not referenced by name below; presumably consumed inside MUL0/MUL1 (defined outside this hunk) - TODO confirm */
+       const __m128i* X = (const __m128i*)x;
+       __m128i*       Y = (__m128i*)y;
+
+       __m128i s0; /* temporary holding a difference: t4-t5, then t7-t6, then t0-t3 */
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+       /* Stage.1: scale X[0]..X[3] through MUL0/MUL1; no other input vector is read. */
+
+       t1 = t0 = MUL1(4, X[0]); /* t0 and t1 start from the same product; presumably because X[4] is taken as zero - TODO confirm against IDCT_R_8_SSE2 */
+
+       t2 = MUL0(6, X[2]);
+       t3 = MUL1(2, X[2]);
+
+       t4 = MUL0(7, X[1]);
+       t5 = _mm_sub_epi16(_mm_setzero_si128(), MUL1(5, X[3])); /* negated product: 0 - MUL1(5, X[3]) */
+
+       t6 = MUL1(3, X[3]);
+       t7 = MUL1(1, X[1]);
+
+       /* Stage.2: sum/difference of (t4,t5) and of (t7,t6); each difference is then passed through MUL1(4, ...). */
+
+       s0 = _mm_sub_epi16(t4, t5);
+       t4 = _mm_add_epi16(t4, t5);
+       t5 = MUL1(4, s0);
+
+       s0 = _mm_sub_epi16(t7, t6);
+       t7 = _mm_add_epi16(t7, t6);
+       t6 = MUL1(4, s0);
+
+       /* Stage.3: sum/difference of (t0,t3), (t1,t2) and (t6,t5). */
+
+       s0 = _mm_sub_epi16(t0, t3); /* s0 stays live until Stage.4 (feeds Y[3] and Y[4]) */
+       t0 = _mm_add_epi16(t0, t3);
+
+       t3 = _mm_sub_epi16(t1, t2);
+       t1 = _mm_add_epi16(t1, t2);
+
+       t2 = _mm_sub_epi16(t6, t5);
+       t6 = _mm_add_epi16(t6, t5);
+
+       /* Stage.4: Y[k] and Y[7-k] are the sum and the difference of the same operand pair. */
+
+       Y[0] = _mm_add_epi16(t0, t7);
+       Y[1] = _mm_add_epi16(t1, t6);
+       Y[2] = _mm_add_epi16(t3, t2);
+       Y[3] = _mm_add_epi16(s0, t4);
+       Y[4] = _mm_sub_epi16(s0, t4);
+       Y[5] = _mm_sub_epi16(t3, t2);
+       Y[6] = _mm_sub_epi16(t1, t6);
+       Y[7] = _mm_sub_epi16(t0, t7);
+}
+
  static __inline void IDCT_C_8_SSE2(
         const INT16* x,
         INT16*       y)
@@ -482,6 +539,59 @@ static __inline void DequantizeIDCT8x8_SSE2(
  
  /* */
  
+static __inline void DequantizeIDCT8x8_16_SSE2( /* Dequantize + row/column iDCT that loads only 16 input coefficients (entries 0..3 of the first four 8-wide rows). */
+       const INT16* block, /* source coefficients, indexed through the TZZ table */
+       const INT16* matrix, /* multipliers; only the first four __m128i vectors (32 values) are read */
+       INT16*       coeff) /* receives the final result; also holds the row-pass output before the transpose */
+{
+       ALIGN(0x10) INT16 c0[64]; /* 16-byte aligned work buffer, accessed with aligned SSE2 stores below */
+
+       const __m128i z = _mm_setzero_si128();
+
+       _mm_store_si128((__m128i*)(c0 + 0x00), z); /* clear all 64 entries so positions not loaded below stay zero */
+       _mm_store_si128((__m128i*)(c0 + 0x08), z);
+       _mm_store_si128((__m128i*)(c0 + 0x10), z);
+       _mm_store_si128((__m128i*)(c0 + 0x18), z);
+       _mm_store_si128((__m128i*)(c0 + 0x20), z);
+       _mm_store_si128((__m128i*)(c0 + 0x28), z);
+       _mm_store_si128((__m128i*)(c0 + 0x30), z);
+       _mm_store_si128((__m128i*)(c0 + 0x38), z);
+
+       { /* Reorder: c0[k] = block[TZZ[k]] for entries 0..3 of each of the first four rows */
+               const UINT8* t = TZZ;
+
+               INT16* c = c0;
+               INT16* e = c + 32; /* stop after four rows of 8 */
+               for (; c < e; c += 8, t += 8) { /* advance one 8-entry row per iteration */
+                       c[0] = block[t[0]];
+                       c[1] = block[t[1]];
+                       c[2] = block[t[2]];
+                       c[3] = block[t[3]];
+               }
+       }
+
+       { /* Dequantize: element-wise 16-bit multiply (low 16 bits kept) of the first four rows; rows 4..7 are left at zero */
+               const __m128i* m = (const __m128i*)matrix;
+               __m128i*       d = (__m128i*)c0;
+
+               d[0] = _mm_mullo_epi16(d[0], m[0]);
+               d[1] = _mm_mullo_epi16(d[1], m[1]);
+               d[2] = _mm_mullo_epi16(d[2], m[2]);
+               d[3] = _mm_mullo_epi16(d[3], m[3]);
+       }
+
+       /* iDCT Row: reduced variant that reads only the first four vectors of c0 */
+       IDCT_R_8_4_SSE2(c0, coeff);
+
+       /* Transpose: the next step reads its input from c0 */
+       Transpose_U_SSE2(coeff, c0);
+
+       /* iDCT Column */
+       IDCT_C_8_SSE2(c0, coeff);
+}
+
+/* */
+
  static __inline void DequantizeIDCT8x8_0_SSE2(
         INT16        dc,
         const INT16* matrix,
@@ -502,12 +612,20 @@ static __inline void DequantizeIDCT8x8_0_SSE2(
  
  /* */
  
-struct DecodeCoefficientsContext {
+struct DecodeCoefficientsLeaf { /* per-position decode state; DecodeCoefficientsContext holds 64 of these */
  
-       INT32 EOB_Run[64];
+       INT32 EOB_Run; /* while > 0, DecodeCoefficients_SSE2 decrements it and stops decoding the block; set from the coeff value when the run read is negative */
  
-       INT8*  Run  [64];
-       INT16* Coeff[64];
+       INT8*  Run; /* read cursor; DecodeCoefficients_SSE2 reads one value and advances it per token */
+       INT16* Coeff; /* read cursor; advanced together with Run */
+
+}; /* DecodeCoefficientsLeaf */
+
+typedef struct DecodeCoefficientsLeaf DecodeCoefficientsLeaf_t;
+
+struct DecodeCoefficientsContext {
+
+       DecodeCoefficientsLeaf_t Leaf[64]; /* DecodeCoefficients_SSE2 selects the entry by the current offset (b - block) within the block */
  
  }; /* DecodeCoefficientsContext */
  
@@ -521,7 +639,7 @@ static INT32 DecodeCoefficients_SSE2(
         INT16* b = block;
         INT16* e = b + 64;
  
-       INT32 i = 0;
+       DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
  
         const __m128i z = _mm_setzero_si128();
  
@@ -535,16 +653,16 @@ static INT32 DecodeCoefficients_SSE2(
         _mm_store_si128((__m128i*)(block + 0x38), z);
  
         while (b < e) {
-               if (ctx->EOB_Run[i] > 0) {
-                       ctx->EOB_Run[i] -= 1;
+               if (leaf->EOB_Run > 0) {
+                       leaf->EOB_Run -= 1;
                         break;
  
                 } else {
-                       INT32 run   = *((ctx->Run  [i])++);
-                       INT16 coeff = *((ctx->Coeff[i])++);
+                       INT32 run   = *((leaf->Run  )++);
+                       INT32 coeff = *((leaf->Coeff)++);
  
                         if (run < 0) {
-                               ctx->EOB_Run[i] = coeff;
+                               leaf->EOB_Run = coeff;
  
                         } else {
                                 b += run;
@@ -554,12 +672,12 @@ static INT32 DecodeCoefficients_SSE2(
  
                                 *(b++) = coeff;
  
-                               i = b - block;
+                               leaf = ctx->Leaf + (b - block);
                         }
                 }
         }
  
-       return i;
+       return b - block;
  }
  
  /* */
@@ -579,15 +697,23 @@ static void Reconstruct_IntraBlock(
  
         const INT16 (*mat)[64] = t->Reconstructor->Matrix[0];
  
+       INT32 cs;
+
         if (dc == NOT_CODED) {
                 Block_CopyPlane8x8_SSE2(p, x, y, r);
                 return;
         }
  
-       if (DecodeCoefficients_SSE2(t, ctx, block) >= 2) {
+       cs = DecodeCoefficients_SSE2(t, ctx, block);
+
+       if (cs >= 10) {
                 block[0] = dc;
                 DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
  
+       } else if (cs >= 2) {
+               block[0] = dc;
+               DequantizeIDCT8x8_16_SSE2(block, mat[plane], coeff);
+
         } else {
                 DequantizeIDCT8x8_0_SSE2(dc, mat[plane], coeff);
         }
@@ -610,6 +736,8 @@ static void Reconstruct_InterBlock(
  
         const INT16 (*mat)[64] = t->Reconstructor->Matrix[1];
  
+       INT32 cs;
+
         if (dc == NOT_CODED) {
                 if (r != NULL) {
                         Block_CopyPlane8x8_SSE2(p, x, y, r);
@@ -617,10 +745,16 @@ static void Reconstruct_InterBlock(
                 return;
         }
  
-       if (DecodeCoefficients_SSE2(t, ctx, block) >= 2) {
+       cs = DecodeCoefficients_SSE2(t, ctx, block);
+
+       if (cs >= 10) {
                 block[0] = dc;
                 DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
  
+       } else if (cs >= 2) {
+               block[0] = dc;
+               DequantizeIDCT8x8_16_SSE2(block, mat[plane], coeff);
+
         } else {
                 DequantizeIDCT8x8_0_SSE2(dc, mat[plane], coeff);
         }
@@ -680,12 +814,12 @@ static void Reconstruct_YPlane_SSE2(
         const UINT8*          mm = t->MBMode;
         const MotionVector_t* mv = t->MV;
  
-       DecodeCoefficientsContext_t ctx = { 0 };
+       ALIGN(0x10) DecodeCoefficientsContext_t ctx = { 0 };
  
         INT32 i;
         for (i = 0; i < 64; i++) {
-               ctx.Run  [i] = t->BRun  [0][i];
-               ctx.Coeff[i] = t->BCoeff[0][i];
+               ctx.Leaf[i].Run   = t->BRun  [0][i];
+               ctx.Leaf[i].Coeff = t->BCoeff[0][i];
         }
  
         for (y = 0; y < sy; y += 32) {
@@ -809,15 +943,15 @@ static void Reconstruct_CPlane_SSE2(
  
         const UINT8* m = t->BMode + t->Index->BC[0];
  
-       DecodeCoefficientsContext_t ctx[2] = { 0 };
+       ALIGN(0x10) DecodeCoefficientsContext_t ctx[2] = { 0 };
  
         INT32 i;
         for (i = 0; i < 64; i++) {
-               ctx[0].Run  [i] = t->BRun  [1][i];
-               ctx[0].Coeff[i] = t->BCoeff[1][i];
+               ctx[0].Leaf[i].Run   = t->BRun  [1][i];
+               ctx[0].Leaf[i].Coeff = t->BCoeff[1][i];
  
-               ctx[1].Run  [i] = t->BRun  [2][i];
-               ctx[1].Coeff[i] = t->BCoeff[2][i];
+               ctx[1].Leaf[i].Run   = t->BRun  [2][i];
+               ctx[1].Leaf[i].Coeff = t->BCoeff[2][i];
         }
  
         for (y = 0; y < sy; y += 32) {
author	Noumi Akira <noumiakira@users.sourceforge.jp>
	Wed, 8 Jul 2009 04:50:13 +0000 (13:50 +0900)
committer	Noumi Akira <noumiakira@users.sourceforge.jp>
	Wed, 8 Jul 2009 04:50:13 +0000 (13:50 +0900)