--- /dev/null
+/* FrameReconstructor_MMX.c */
+/* 2009/07/09 */
+
+#include "StdAfx.h"
+
+#include "FrameReconstructor.h"
+
+#include "MotionComp_MMX.h"
+
+/* */
+
+#pragma warning(disable : 4799)
+
+/* */
+
+/* Transpose an 8x8 INT16 matrix from x into y.
+   A row is 8 INT16 = two __m64 words: X[2*r] is the left half of row r,
+   X[2*r + 1] the right half.  The transpose is performed as four
+   independent 4x4 quadrant transposes (16-bit unpack, then 32-bit unpack).
+   NOTE(review): y must not alias x - the stores for the bottom-left
+   quadrant overwrite rows the next quadrant still reads. */
+static __inline void Transpose_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* X = (const __m64*)x;
+ __m64* Y = (__m64*)y;
+
+ __m64 t0, t1, t2, t3;
+ __m64 u0, u1, u2, u3;
+
+ /* Top-left 4x4 of x -> top-left of y */
+
+ t0 = X[2 * 0];
+ t1 = X[2 * 1];
+ t2 = X[2 * 2];
+ t3 = X[2 * 3];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 0] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 1] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 2] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 3] = _mm_unpackhi_pi32(u1, u3);
+
+ /* Top-right 4x4 of x -> bottom-left of y */
+
+ t0 = X[2 * 0 + 1];
+ t1 = X[2 * 1 + 1];
+ t2 = X[2 * 2 + 1];
+ t3 = X[2 * 3 + 1];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 4] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 5] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 6] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 7] = _mm_unpackhi_pi32(u1, u3);
+
+ /* Bottom-left 4x4 of x -> top-right of y */
+
+ t0 = X[2 * 4];
+ t1 = X[2 * 5];
+ t2 = X[2 * 6];
+ t3 = X[2 * 7];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 0 + 1] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 1 + 1] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 2 + 1] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 3 + 1] = _mm_unpackhi_pi32(u1, u3);
+
+ /* Bottom-right 4x4 of x -> bottom-right of y */
+
+ t0 = X[2 * 4 + 1];
+ t1 = X[2 * 5 + 1];
+ t2 = X[2 * 6 + 1];
+ t3 = X[2 * 7 + 1];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 4 + 1] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 5 + 1] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 6 + 1] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 7 + 1] = _mm_unpackhi_pi32(u1, u3);
+}
+
+/* Pre-transpose every dequantization matrix (2 coding modes x 3 planes)
+   from the decoder into the reconstructor's layout used by the MMX iDCT. */
+void QT_UpdateDequantizeMatrix_MMX(
+ FrameDecoder_t* t)
+{
+ FrameReconstructor_SSE2_t* r = t->Reconstructor;
+
+ INT32 k;
+
+ /* Flattened walk over all (mode, plane) matrix pairs. */
+ for (k = 0; k < 2 * 3; k++) {
+ const INT32 mode = k / 3;
+ const INT32 plane = k % 3;
+
+ Transpose_MMX(t->Dequantize.Matrix[mode][plane], r->Matrix[mode][plane]);
+ }
+}
+
+/* */
+
+/* Copy an aligned 8x8 pixel block at (x, y) from reference plane r into
+   plane p.  All eight source rows are fetched before any row is stored,
+   preserving the original load/store schedule. */
+static __inline void Block_CopyPlane8x8_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ Plane_t* r)
+{
+ const UINT8* src = r->Plane + y * r->Pitch + x;
+ UINT8* dst = p->Plane + y * p->Pitch + x;
+
+ __m64 row[8];
+ INT32 i;
+
+ for (i = 0; i < 8; i++) {
+ row[i] = *((const __m64*)src);
+ src += r->Pitch;
+ }
+
+ for (i = 0; i < 8; i++) {
+ *((__m64*)dst) = row[i];
+ dst += p->Pitch;
+ }
+}
+
+/* Copy an aligned 16x16 pixel block at (x, y) from reference plane r into
+   plane p, four rows (two __m64 each) per loop iteration.  Loads are
+   batched ahead of stores within each iteration. */
+static __inline void Block_CopyPlane16x16_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ Plane_t* r)
+{
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ const UINT8* e = s + 16 * r->Pitch;
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ __m64 s00, s10, s20, s30;
+ __m64 s01, s11, s21, s31;
+
+ while (s < e) {
+ s00 = *((const __m64*)(s + 0));
+ s01 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s10 = *((const __m64*)(s + 0));
+ s11 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s20 = *((const __m64*)(s + 0));
+ s21 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s30 = *((const __m64*)(s + 0));
+ s31 = *((const __m64*)(s + 8)); s += r->Pitch;
+
+ *((__m64*)(d + 0)) = s00;
+ *((__m64*)(d + 8)) = s01; d += p->Pitch;
+ *((__m64*)(d + 0)) = s10;
+ *((__m64*)(d + 8)) = s11; d += p->Pitch;
+ *((__m64*)(d + 0)) = s20;
+ *((__m64*)(d + 8)) = s21; d += p->Pitch;
+ *((__m64*)(d + 0)) = s30;
+ *((__m64*)(d + 8)) = s31; d += p->Pitch;
+ }
+}
+
+/* */
+
+/* Intra-prediction bias: 128 is added to each iDCT output sample before
+   packing to unsigned 8-bit (see Block_CopyIntra8x8_MMX). */
+ALIGN(0x10) static const UINT16 IPRED[4] = {
+ 128, 128, 128, 128
+};
+
+/* Write an intra block: add the 128 bias to each of the 64 INT16 residuals
+   in c, saturate-pack to unsigned 8-bit, and store the resulting 8x8 pixel
+   block into plane p at (x, y).  c holds two __m64 (8 coefficients) per
+   row; each packed __m64 carries 4 pixels in its low 32 bits, stored via
+   _mm_cvtsi64_si32.  Fully unrolled, two rows per group. */
+static __inline void Block_CopyIntra8x8_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const INT16* c)
+{
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ const __m64* B = (const __m64*)IPRED;
+ const __m64* C = (const __m64*)c;
+
+ __m64 s00, s01, s10, s11;
+ const __m64 z = _mm_setzero_si64();
+
+ /* Rows 0-1 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 0 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 0 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 1 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 1 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11); d += p->Pitch;
+
+ /* Rows 2-3 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 2 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 2 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 3 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 3 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11); d += p->Pitch;
+
+ /* Rows 4-5 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 4 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 4 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 5 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 5 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11); d += p->Pitch;
+
+ /* Rows 6-7 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 6 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 6 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 7 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 7 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11);
+}
+
+/* Add an 8x8 INT16 residual c to the already motion-compensated pixels in
+   plane p at (x, y), with unsigned-8 saturation on the way back out.
+   Each row: widen 8 pixels to 16-bit, saturating-add the residual, pack. */
+static __inline void Block_ReviseInter8x8_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const INT16* c)
+{
+ UINT8* d = p->Plane + y * p->Pitch + x;
+ UINT8* e = d + 8 * p->Pitch;
+
+ const __m64* C = (const __m64*)c;
+
+ __m64 b0, b1;
+ __m64 s0, s1;
+ const __m64 z = _mm_setzero_si64();
+
+ for (; d < e; d += p->Pitch, C += 2) {
+ /* Load 4+4 pixels and zero-extend to 16-bit lanes. */
+ b0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(d + 0))), z);
+ b1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(d + 4))), z);
+
+ s0 = _mm_packs_pu16(_mm_adds_pi16(C[0], b0), z);
+ s1 = _mm_packs_pu16(_mm_adds_pi16(C[1], b1), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s0);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s1);
+ }
+}
+
+/* */
+
+/* iDCT constants, 4 identical 16-bit lanes per entry.  Entry 0 is the +8
+   rounding bias for the final >>4; entries 1..7 are cosine factors in
+   0.16 unsigned fixed point (value = cos * 65536 - see MUL0/MUL1). */
+ALIGN(0x10) static const UINT16 COS_MMX[8][4] = {
+ { 8, 8, 8, 8 }, /* 0 */
+ { 64277, 64277, 64277, 64277 }, /* 1 */
+ { 60547, 60547, 60547, 60547 }, /* 2 */
+ { 54491, 54491, 54491, 54491 }, /* 3 */
+ { 46341, 46341, 46341, 46341 }, /* 4 */
+ { 36410, 36410, 36410, 36410 }, /* 5 */
+ { 25080, 25080, 25080, 25080 }, /* 6 */
+ { 12785, 12785, 12785, 12785 } /* 7 */
+};
+
+/* _mm_mulhi_pi16 is a signed multiply, so a constant >= 0x8000 acts as
+   (c - 65536); MUL1 adds X back to compensate, yielding X*c/65536.
+   MUL0 is the plain X*c/65536 for constants below 0x8000. */
+#define MUL1(T,X) _mm_add_pi16(_mm_mulhi_pi16(X, C[T]), X)
+#define MUL0(T,X) _mm_mulhi_pi16(X, C[T])
+
+/* */
+
+/* Row pass of the 8-point inverse DCT over a full 8x8 coefficient block.
+   Each loop iteration processes 4 columns (one __m64 lane set); two
+   iterations cover all 8 columns.  X/Y stride is 2 __m64 per row, so
+   X[2*r] addresses row r within the current 4-column slice.  No final
+   descale here - rounding and >>4 happen in the column pass. */
+static __inline void IDCT_R_8_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ const __m64* E = X + 2;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ for (; X < E; X++, Y++) {
+ /* Stage.1: even/odd butterflies scaled by the cosine table */
+
+ s0 = _mm_add_pi16(X[2 * 0], X[2 * 4]);
+ t0 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(X[2 * 0], X[2 * 4]);
+ t1 = MUL1(4, s0);
+
+ t2 = _mm_sub_pi16(MUL0(6, X[2 * 2]), MUL1(2, X[2 * 6]));
+ t3 = _mm_add_pi16(MUL1(2, X[2 * 2]), MUL0(6, X[2 * 6]));
+
+ t4 = _mm_sub_pi16(MUL0(7, X[2 * 1]), MUL1(1, X[2 * 7]));
+ t5 = _mm_sub_pi16(MUL1(3, X[2 * 5]), MUL1(5, X[2 * 3]));
+
+ t6 = _mm_add_pi16(MUL1(5, X[2 * 5]), MUL1(3, X[2 * 3]));
+ t7 = _mm_add_pi16(MUL1(1, X[2 * 1]), MUL0(7, X[2 * 7]));
+
+ /* Stage.2: rotate the odd pair through C4 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3: combine even halves */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4: final butterflies, write all 8 outputs */
+
+ Y[2 * 0] = _mm_add_pi16(t0, t7);
+ Y[2 * 1] = _mm_add_pi16(t1, t6);
+ Y[2 * 2] = _mm_add_pi16(t3, t2);
+ Y[2 * 3] = _mm_add_pi16(s0, t4);
+ Y[2 * 4] = _mm_sub_pi16(s0, t4);
+ Y[2 * 5] = _mm_sub_pi16(t3, t2);
+ Y[2 * 6] = _mm_sub_pi16(t1, t6);
+ Y[2 * 7] = _mm_sub_pi16(t0, t7);
+ }
+}
+
+/* Reduced row pass: same transform as IDCT_R_8_MMX but specialized for
+   blocks whose only nonzero coefficients sit in the top-left 4x4 region,
+   so rows 4-7 and the right column half are treated as zero (terms with
+   X[2*4..2*7] drop out of Stage.1) and only one 4-column slice is run. */
+static __inline void IDCT_R_8_4_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* Stage.1 (zero-input terms removed) */
+
+ t1 = t0 = MUL1(4, X[2 * 0]);
+
+ t2 = MUL0(6, X[2 * 2]);
+ t3 = MUL1(2, X[2 * 2]);
+
+ t4 = MUL0(7, X[2 * 1]);
+ t5 = _mm_sub_pi16(_mm_setzero_si64(), MUL1(5, X[2 * 3]));
+
+ t6 = MUL1(3, X[2 * 3]);
+ t7 = MUL1(1, X[2 * 1]);
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4: all 8 output rows (left halves) are written */
+
+ Y[2 * 0] = _mm_add_pi16(t0, t7);
+ Y[2 * 1] = _mm_add_pi16(t1, t6);
+ Y[2 * 2] = _mm_add_pi16(t3, t2);
+ Y[2 * 3] = _mm_add_pi16(s0, t4);
+ Y[2 * 4] = _mm_sub_pi16(s0, t4);
+ Y[2 * 5] = _mm_sub_pi16(t3, t2);
+ Y[2 * 6] = _mm_sub_pi16(t1, t6);
+ Y[2 * 7] = _mm_sub_pi16(t0, t7);
+}
+
+/* Column pass of the 8-point inverse DCT (input is the transposed row-pass
+   output).  Identical butterfly network to IDCT_R_8_MMX; the difference is
+   the final descale: each output gets the +8 rounding bias (C[0]) and an
+   arithmetic shift right by 4. */
+static __inline void IDCT_C_8_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ const __m64* E = X + 2;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ for (; X < E; X++, Y++) {
+ /* Stage.1 */
+
+ s0 = _mm_add_pi16(X[2 * 0], X[2 * 4]);
+ t0 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(X[2 * 0], X[2 * 4]);
+ t1 = MUL1(4, s0);
+
+ t2 = _mm_sub_pi16(MUL0(6, X[2 * 2]), MUL1(2, X[2 * 6]));
+ t3 = _mm_add_pi16(MUL1(2, X[2 * 2]), MUL0(6, X[2 * 6]));
+
+ t4 = _mm_sub_pi16(MUL0(7, X[2 * 1]), MUL1(1, X[2 * 7]));
+ t5 = _mm_sub_pi16(MUL1(3, X[2 * 5]), MUL1(5, X[2 * 3]));
+
+ t6 = _mm_add_pi16(MUL1(5, X[2 * 5]), MUL1(3, X[2 * 3]));
+ t7 = _mm_add_pi16(MUL1(1, X[2 * 1]), MUL0(7, X[2 * 7]));
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4: butterflies plus (x + 8) >> 4 descale */
+
+ Y[2 * 0] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t0, t7), C[0]), 4);
+ Y[2 * 1] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t1, t6), C[0]), 4);
+ Y[2 * 2] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t3, t2), C[0]), 4);
+ Y[2 * 3] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(s0, t4), C[0]), 4);
+ Y[2 * 4] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(s0, t4), C[0]), 4);
+ Y[2 * 5] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t3, t2), C[0]), 4);
+ Y[2 * 6] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t1, t6), C[0]), 4);
+ Y[2 * 7] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t0, t7), C[0]), 4);
+ }
+}
+
+/* Reduced column pass matching IDCT_R_8_4_MMX: only rows 0-3 of the
+   transposed intermediate are nonzero, so the Stage.1 terms that read
+   X[2*4..2*7] drop out.  Both 4-column slices still run, and the final
+   (x + 8) >> 4 descale is applied as in the full column pass. */
+static __inline void IDCT_C_8_4_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ const __m64* E = X + 2;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ for (; X < E; X++, Y++) {
+ /* Stage.1 (zero-input terms removed) */
+
+ t1 = t0 = MUL1(4, X[2 * 0]);
+
+ t2 = MUL0(6, X[2 * 2]);
+ t3 = MUL1(2, X[2 * 2]);
+
+ t4 = MUL0(7, X[2 * 1]);
+ t5 = _mm_sub_pi16(_mm_setzero_si64(), MUL1(5, X[2 * 3]));
+
+ t6 = MUL1(3, X[2 * 3]);
+ t7 = MUL1(1, X[2 * 1]);
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4 */
+
+ Y[2 * 0] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t0, t7), C[0]), 4);
+ Y[2 * 1] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t1, t6), C[0]), 4);
+ Y[2 * 2] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t3, t2), C[0]), 4);
+ Y[2 * 3] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(s0, t4), C[0]), 4);
+ Y[2 * 4] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(s0, t4), C[0]), 4);
+ Y[2 * 5] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t3, t2), C[0]), 4);
+ Y[2 * 6] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t1, t6), C[0]), 4);
+ Y[2 * 7] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t0, t7), C[0]), 4);
+ }
+}
+
+/* */
+
+/* De-zig-zag table: TZZ[pos] is the index in the zig-zag-ordered coefficient
+   stream that lands at natural (row-major) position pos, i.e.
+   natural[pos] = zigzag[TZZ[pos]]. */
+static const UINT8 TZZ[64] = {
+ 0, 2, 3, 9, 10, 20, 21, 35,
+ 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48,
+ 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57,
+ 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62,
+ 28, 42, 43, 53, 54, 60, 61, 63
+};
+
+/* Full 8x8 reconstruction: de-zig-zag the coefficient block, multiply
+   element-wise by the (pre-transposed) dequantization matrix, then run
+   row iDCT -> transpose -> column iDCT.  Result lands in coeff; c0 is
+   scratch.  The 16-bit _mm_mullo_pi16 dequantize wraps on overflow, which
+   matches the reference 16-bit arithmetic. */
+static __inline void DequantizeIDCT8x8_MMX(
+ const INT16* block,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ ALIGN(0x10) INT16 c0[64];
+
+ { /* Reorder: natural[i] = block[TZZ[i]] */
+ const UINT8* t = TZZ;
+
+ INT16* c = c0;
+ INT16* e = c + 64;
+ for (; c < e; c += 8, t += 8) {
+ c[0] = block[t[0]];
+ c[1] = block[t[1]];
+ c[2] = block[t[2]];
+ c[3] = block[t[3]];
+ c[4] = block[t[4]];
+ c[5] = block[t[5]];
+ c[6] = block[t[6]];
+ c[7] = block[t[7]];
+ }
+ }
+
+ { /* Dequantize: 64 element-wise 16-bit multiplies */
+ __m64* d = (__m64*) c0;
+ __m64* e = (__m64*)(c0 + 64);
+ const __m64* m = (const __m64*)matrix;
+
+ for (; d < e; d += 4, m += 4) {
+ d[0] = _mm_mullo_pi16(d[0], m[0]);
+ d[1] = _mm_mullo_pi16(d[1], m[1]);
+ d[2] = _mm_mullo_pi16(d[2], m[2]);
+ d[3] = _mm_mullo_pi16(d[3], m[3]);
+ }
+ }
+
+ /* iDCT Row */
+ IDCT_R_8_MMX(c0, coeff);
+
+ /* Transpose */
+ Transpose_MMX(coeff, c0);
+
+ /* iDCT Colum */
+ IDCT_C_8_MMX(c0, coeff);
+}
+
+/* */
+
+/* Sparse-block reconstruction for at most the first 10 zig-zag
+   coefficients: only the 4/3/2/1 triangle in the top-left 4x4 can be
+   nonzero, so just rows 0-3 of c0 are zeroed and scattered into, the
+   dequantize touches only the left half of those rows, and the reduced
+   iDCT variants are used. */
+static __inline void DequantizeIDCT8x8_16_MMX(
+ const INT16* block,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ ALIGN(0x10) INT16 c0[64];
+
+ const __m64 z = _mm_setzero_si64();
+
+ /* Clear rows 0-3 (the only rows written below). */
+ *((__m64*)(c0 + 0x00)) = z;
+ *((__m64*)(c0 + 0x08)) = z;
+ *((__m64*)(c0 + 0x10)) = z;
+ *((__m64*)(c0 + 0x18)) = z;
+
+ /* Reorder: de-zig-zag the 10-coefficient triangle */
+ c0[ 0 + 0] = block[TZZ[ 0 + 0]];
+ c0[ 0 + 1] = block[TZZ[ 0 + 1]];
+ c0[ 0 + 2] = block[TZZ[ 0 + 2]];
+ c0[ 0 + 3] = block[TZZ[ 0 + 3]];
+
+ c0[ 8 + 0] = block[TZZ[ 8 + 0]];
+ c0[ 8 + 1] = block[TZZ[ 8 + 1]];
+ c0[ 8 + 2] = block[TZZ[ 8 + 2]];
+
+ c0[16 + 0] = block[TZZ[16 + 0]];
+ c0[16 + 1] = block[TZZ[16 + 1]];
+
+ c0[24 + 0] = block[TZZ[24 + 0]];
+
+ { /* Dequantize: left 4 coefficients of rows 0-3 only */
+ const __m64* m = (const __m64*)matrix;
+ __m64* d = (__m64*)c0;
+
+ d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
+ d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
+ d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
+ d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
+ }
+
+ /* iDCT Row (reduced) */
+ IDCT_R_8_4_MMX(c0, coeff);
+
+ /* Transpose */
+ Transpose_MMX(coeff, c0);
+
+ /* iDCT Colum (reduced) */
+ IDCT_C_8_4_MMX(c0, coeff);
+}
+
+/* */
+
+/* DC-only shortcut: when just the DC coefficient is coded, the whole iDCT
+   collapses to a flat block - every one of the 64 outputs equals the
+   dequantized, descaled DC term ((dc * q0) + 15) >> 5. */
+static __inline void DequantizeIDCT8x8_0_MMX(
+ INT16 dc,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ const __m64 v = _mm_set1_pi16(((dc * matrix[0]) + 15) >> 5);
+
+ __m64* d = (__m64*)coeff;
+ __m64* e = d + 16;
+
+ while (d < e) {
+ d[0] = v;
+ d[1] = v;
+ d[2] = v;
+ d[3] = v;
+ d += 4;
+ }
+}
+
+/* */
+
+/* Per-coefficient-position decode cursor: the run/coefficient token stream
+   for one zig-zag position, plus the pending end-of-block run count. */
+struct DecodeCoefficientsLeaf {
+
+ INT32 EOB_Run; /* blocks still to terminate immediately at this position */
+
+ INT8* Run; /* zero-run lengths; a negative run marks an EOB token */
+ INT16* Coeff; /* coefficient values (or EOB-run count for EOB tokens) */
+
+}; /* DecodeCoefficientsLeaf */
+
+typedef struct DecodeCoefficientsLeaf DecodeCoefficientsLeaf_t;
+
+/* One cursor per coefficient position (0..63) of the current plane. */
+struct DecodeCoefficientsContext {
+
+ DecodeCoefficientsLeaf_t Leaf[64];
+
+}; /* DecodeCoefficientsContext */
+
+typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
+
+/* Fill one 64-entry coefficient block (zig-zag order) from the per-position
+   token streams in ctx.  The block is zeroed first, then run/value pairs
+   are consumed from the leaf matching the current position; a negative run
+   is an EOB token whose coeff is the EOB-run count.  A pending EOB_Run
+   terminates the block immediately.  Returns the number of positions
+   consumed (index just past the last written coefficient) - used by the
+   callers to pick a full, reduced, or DC-only iDCT. */
+static INT32 DecodeCoefficients_MMX(
+ FrameDecoder_t* t,
+ DecodeCoefficientsContext_t* ctx,
+ INT16* block)
+{
+ INT16* b = block;
+ INT16* e = b + 64;
+
+ DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
+
+ const __m64 z = _mm_setzero_si64();
+
+ for (; b < e; b += 16) {
+ *((__m64*)(b + 0)) = z;
+ *((__m64*)(b + 4)) = z;
+ *((__m64*)(b + 8)) = z;
+ *((__m64*)(b + 12)) = z;
+ }
+
+ b = block;
+
+ while (b < e) {
+ if (leaf->EOB_Run > 0) {
+ /* This block ends here; consume one unit of the EOB run. */
+ leaf->EOB_Run -= 1;
+ break;
+
+ } else {
+ INT32 run = *((leaf->Run )++);
+ INT32 coeff = *((leaf->Coeff)++);
+
+ if (run < 0) {
+ /* EOB token: coeff carries the run of end-of-blocks. */
+ leaf->EOB_Run = coeff;
+
+ } else {
+ b += run;
+ if (b >= e) {
+ break;
+ }
+
+ *(b++) = coeff;
+
+ /* Continue with the stream for the new position. */
+ leaf = ctx->Leaf + (b - block);
+ }
+ }
+ }
+
+ return b - block;
+}
+
+/* */
+
+/* Reconstruct one intra 8x8 block of plane p at (x, y).
+   NOT_CODED blocks are copied straight from the reference frame r.
+   Otherwise the coefficients are decoded and the iDCT variant is chosen
+   by how many positions were consumed: > 10 full, > 1 reduced (4x4
+   triangle), else DC-only; the result is biased by 128 and stored. */
+static void Reconstruct_IntraBlock(
+ FrameDecoder_t* t,
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ INT16 dc,
+ INT32 plane,
+ Plane_t* r,
+ DecodeCoefficientsContext_t* ctx)
+{
+ ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 coeff[64];
+
+ const INT16 (*mat)[64] = t->Reconstructor->Matrix[0]; /* intra matrices */
+
+ INT32 cs;
+
+ if (dc == NOT_CODED) {
+ Block_CopyPlane8x8_MMX(p, x, y, r);
+ return;
+ }
+
+ cs = DecodeCoefficients_MMX(t, ctx, block);
+
+ if (cs > 10) {
+ block[0] = dc;
+ DequantizeIDCT8x8_MMX(block, mat[plane], coeff);
+
+ } else if (cs > 1) {
+ block[0] = dc;
+ DequantizeIDCT8x8_16_MMX(block, mat[plane], coeff);
+
+ } else {
+ DequantizeIDCT8x8_0_MMX(dc, mat[plane], coeff);
+ }
+
+ Block_CopyIntra8x8_MMX(p, x, y, coeff);
+}
+
+/* */
+
+/* Reconstruct one inter 8x8 block: add the decoded residual on top of the
+   motion-compensated pixels already present in plane p.  NOT_CODED blocks
+   are copied from r when a reference is supplied (r == NULL when the
+   caller already wrote the prediction, e.g. the whole-MB copy paths).
+   iDCT variant selection matches Reconstruct_IntraBlock. */
+static void Reconstruct_InterBlock(
+ FrameDecoder_t* t,
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ INT16 dc,
+ INT32 plane,
+ Plane_t* r,
+ DecodeCoefficientsContext_t* ctx)
+{
+ ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 coeff[64];
+
+ const INT16 (*mat)[64] = t->Reconstructor->Matrix[1]; /* inter matrices */
+
+ INT32 cs;
+
+ if (dc == NOT_CODED) {
+ if (r != NULL) {
+ Block_CopyPlane8x8_MMX(p, x, y, r);
+ }
+ return;
+ }
+
+ cs = DecodeCoefficients_MMX(t, ctx, block);
+
+ if (cs > 10) {
+ block[0] = dc;
+ DequantizeIDCT8x8_MMX(block, mat[plane], coeff);
+
+ } else if (cs > 1) {
+ block[0] = dc;
+ DequantizeIDCT8x8_16_MMX(block, mat[plane], coeff);
+
+ } else {
+ DequantizeIDCT8x8_0_MMX(dc, mat[plane], coeff);
+ }
+
+ Block_ReviseInter8x8_MMX(p, x, y, coeff);
+}
+
+/* */
+
+/* Pixel offsets of the 16 8x8 blocks of a 32x32 superblock, in coding
+   order (presumably the VP3-style Hilbert traversal - verify against the
+   bitstream spec). */
+static const INT8 S_PX[16] = {
+ 0*8, 1*8, 1*8, 0*8,
+ 0*8, 0*8, 1*8, 1*8,
+ 2*8, 2*8, 3*8, 3*8,
+ 3*8, 2*8, 2*8, 3*8
+};
+
+static const INT8 S_PY[16] = {
+ 0*8, 0*8, 1*8, 1*8,
+ 2*8, 3*8, 3*8, 2*8,
+ 2*8, 3*8, 3*8, 2*8,
+ 1*8, 1*8, 0*8, 0*8
+};
+
+/* Pixel offsets of the 4 16x16 macroblocks within a 32x32 superblock. */
+static const INT8 M_PX[4] = {
+ 0*16, 0*16,
+ 1*16, 1*16
+};
+
+static const INT8 M_PY[4] = {
+ 0*16, 1*16,
+ 1*16, 0*16
+};
+
+/* */
+
+/* Reconstruct the luma plane.  Walks 32x32 superblocks, then the 4
+   macroblocks of each superblock (M_PX/M_PY order), dispatching on the
+   macroblock coding mode.  g = golden frame, p = current frame,
+   r = last frame.  bi/mm/mv advance only for in-bounds macroblocks -
+   their streams contain no entries for clipped positions. */
+static void Reconstruct_YPlane_MMX(
+ FrameDecoder_t* t)
+{
+ INT32 x, y;
+
+ INT32 sx = t->Index->SX[0] * 32; /* plane extent in superblock pixels */
+ INT32 sy = t->Index->SY[0] * 32;
+
+ INT32 mx = t->Index->MX * 16; /* plane extent in macroblock pixels */
+ INT32 my = t->Index->MY * 16;
+
+ INT32 bx = t->Index->BX[0]; /* blocks per row (for DC indexing) */
+
+ const UINT16* bi = t->Index->BIndex[0];
+
+ Plane_t* g = t->Frame[0];
+ Plane_t* p = t->Frame[1];
+ Plane_t* r = t->Frame[2];
+
+ const UINT8* mm = t->MBMode;
+ const MotionVector_t* mv = t->MV;
+
+ ALIGN(0x10) DecodeCoefficientsContext_t ctx = { 0 };
+
+ /* Attach the luma run/coefficient streams to the decode context. */
+ INT32 i;
+ for (i = 0; i < 64; i++) {
+ ctx.Leaf[i].Run = t->BRun [0][i];
+ ctx.Leaf[i].Coeff = t->BCoeff[0][i];
+ }
+
+ for (y = 0; y < sy; y += 32) {
+ for (x = 0; x < sx; x += 32) {
+ INT32 i = 0; /* block-order index within the superblock */
+
+ INT32 m;
+ for (m = 0; m < 4; m++, i += 4) {
+ INT32 x0 = x + M_PX[m];
+ INT32 y0 = y + M_PY[m];
+ if (x0 < mx && y0 < my) {
+ switch (*mm) {
+ case 0: /* INTER_NOMV: prediction is a straight copy of last frame */
+ Block_CopyPlane16x16_MMX(p, x0, y0, r);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, NULL, &ctx);
+ break;
+
+ case 1: /* INTRA */
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 2: /* INTER_MV */
+ case 3: /* INTER_MV_LAST */
+ case 4: /* INTER_MV_LAST2: one vector for the whole macroblock */
+ MotionComp_Block16x16_MMX(p, x0, y0, r, mv);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 5: /* INTER_GOLDEN_NOMV: copy from the golden frame */
+ Block_CopyPlane16x16_MMX(p, x0, y0, g);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 6: /* INTER_GOLDEN_MV */
+ MotionComp_Block16x16_MMX(p, x0, y0, g, mv);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 7: /* INTER_MV_FOUR: one vector per coded 8x8 sub-block */
+ {
+ const MotionVector_t* v = mv;
+
+ const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx;
+
+ /* Vectors are stored only for coded sub-blocks; v advances
+ with each one consumed. */
+ if (dc[0] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 0, y0 + 0, r, v++);
+ }
+
+ if (dc[1] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 8, y0 + 0, r, v++);
+ }
+
+ if (dc[0 + bx] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 0, y0 + 8, r, v++);
+ }
+
+ if (dc[1 + bx] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 8, y0 + 8, r, v++);
+ }
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+ }
+
+ } /* switch */
+
+ /* Advance per-macroblock streams (only for in-bounds MBs). */
+ bi += 4;
+ mm += 1;
+ mv += 4;
+ }
+ }
+ }
+ }
+}
+
+/* */
+
+/* Reconstruct both chroma planes (p+1 = Cb, p+2 = Cr, and likewise for the
+   golden/last frame plane arrays).  Chroma blocks are visited in the same
+   superblock/block order as luma but each 8x8 block reads the macroblock
+   mode from the block-granular mode map m[idx].  Separate coefficient
+   contexts are kept per chroma plane. */
+static void Reconstruct_CPlane_MMX(
+ FrameDecoder_t* t)
+{
+ INT32 x, y;
+
+ INT32 sx = t->Index->SX[1] * 32;
+ INT32 sy = t->Index->SY[1] * 32;
+
+ INT32 mx = t->Index->MX * 8; /* chroma is half luma resolution */
+ INT32 my = t->Index->MY * 8;
+
+ INT32 bx = t->Index->BX[1];
+
+ const UINT16* bi = t->Index->BIndex[1];
+
+ Plane_t* g = t->Frame[0];
+ Plane_t* p = t->Frame[1];
+ Plane_t* r = t->Frame[2];
+
+ /* DC arrays for the two chroma planes follow the luma DCs. */
+ const INT16* DC0 = t->DC + t->Index->BC[0];
+ const INT16* DC1 = DC0 + t->Index->BC[1];
+
+ const UINT8* m = t->BMode + t->Index->BC[0];
+
+ ALIGN(0x10) DecodeCoefficientsContext_t ctx[2] = { 0 };
+
+ INT32 i;
+ for (i = 0; i < 64; i++) {
+ ctx[0].Leaf[i].Run = t->BRun [1][i];
+ ctx[0].Leaf[i].Coeff = t->BCoeff[1][i];
+
+ ctx[1].Leaf[i].Run = t->BRun [2][i];
+ ctx[1].Leaf[i].Coeff = t->BCoeff[2][i];
+ }
+
+ for (y = 0; y < sy; y += 32) {
+ for (x = 0; x < sx; x += 32) {
+ INT32 i;
+ for (i = 0; i < 16; i++) {
+ INT32 xx = x + S_PX[i];
+ INT32 yy = y + S_PY[i];
+
+ if (xx < mx && yy < my) {
+ INT32 idx = (xx >> 3) + (yy >> 3) * bx;
+
+ switch (m[idx]) {
+ case 0: /* INTER_NOMV */
+ Block_CopyPlane8x8_MMX(p + 1, xx, yy, r + 1);
+ Block_CopyPlane8x8_MMX(p + 2, xx, yy, r + 2);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, NULL, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, NULL, ctx + 1);
+ break;
+
+ case 1: /* INTRA */
+ Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 2: /* INTER_MV */
+ case 3: /* INTER_MV_LAST */
+ case 4: /* INTER_MV_LAST2 */
+ MotionComp_Block8x8C_MMX(p + 1, xx, yy, r + 1, t->MVC + idx);
+ MotionComp_Block8x8C_MMX(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 5: /* INTER_GOLDEN_NOMV */
+ Block_CopyPlane8x8_MMX(p + 1, xx, yy, g + 1);
+ Block_CopyPlane8x8_MMX(p + 2, xx, yy, g + 2);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 6: /* INTER_GOLDEN_MV */
+ MotionComp_Block8x8C_MMX(p + 1, xx, yy, g + 1, t->MVC + idx);
+ MotionComp_Block8x8C_MMX(p + 2, xx, yy, g + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 7: /* INTER_MV_FOUR: chroma uses the per-block averaged vector MVC */
+ MotionComp_Block8x8C_MMX(p + 1, xx, yy, r + 1, t->MVC + idx);
+ MotionComp_Block8x8C_MMX(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ } /* switch */
+ }
+ }
+ }
+ }
+}
+
+/* */
+
+/* Public entry point: reconstruct the luma then chroma planes and run the
+   in-loop filter when enabled.
+   Warning C4799 ("no EMMS") is disabled file-wide, so the MMX state must
+   be cleared explicitly before returning - otherwise any x87 FPU code in
+   the caller operates on a corrupted register stack. */
+void QT_ReconstructFrame_MMX(
+ FrameDecoder_t* t)
+{
+ Reconstruct_YPlane_MMX(t);
+
+ Reconstruct_CPlane_MMX(t);
+
+ if (t->Filter.Limit > 0) {
+ QT_FrameLoopFilter(t);
+ }
+
+ /* Leave the FPU usable for callers (EMMS; addresses the 4799 hazard). */
+ _mm_empty();
+}
+
+/* */
+
--- /dev/null
+/* MotionComp_MMX.c */
+/* 2009/07/09 */
+
+#include "StdAfx.h"
+
+#include "MotionComp_MMX.h"
+
+/* */
+
+#pragma warning(disable : 4799)
+
+/* */
+
+/* Extract an 8x8 block at (x, y) from plane, replicating edge pixels when
+   the block hangs over any plane boundary (out-of-bounds motion
+   references).  Strategy: build a padded picture in the 16x16 scratch r
+   (pitch 16) - corner fill, edge-column fill, edge-row fill, then the
+   clamped 8x8 source - and finally copy the correctly shifted 8x8 window
+   of r into block. */
+void Block_Extract8x8_MMX(
+ const Plane_t* plane,
+ INT32 x,
+ INT32 y,
+ UINT8* block,
+ INT32 pitch)
+{
+ ALIGN(0x10) UINT8 r[64 * 4];
+
+ /* Clamp the source origin inside the plane. */
+ INT32 xx = (x < 0) ? 0 : ((x + 8 >= plane->CX) ? plane->CX - 8 : x);
+ INT32 yy = (y < 0) ? 0 : ((y + 8 >= plane->CY) ? plane->CY - 8 : y);
+
+ const UINT8* ss = plane->Plane + yy * plane->Pitch + xx;
+
+ /* Overhang flags: 2 = off the low edge, 1 = off the high edge. */
+ INT32 xf = ((x < 0) << 1) | (x + 8 >= plane->CX);
+ INT32 yf = ((y < 0) << 1) | (y + 8 >= plane->CY);
+
+ INT32 xy = (xf << 2) | yf;
+
+ UINT8* rr = r;
+
+ { /* Corner fill: replicate the nearest corner pixel into the
+ diagonal 8x8 quadrant of r. */
+ const UINT8* s = NULL;
+ UINT8* d = NULL;
+
+ switch (xy) {
+ case 10: /* 10 10: top-left overhang */
+ s = ss;
+ d = r;
+ break;
+
+ case 6: /* 01 10: top-right overhang */
+ s = ss + 7;
+ d = r + 8;
+ break;
+
+ case 9: /* 10 01: bottom-left overhang */
+ s = ss + 7 * plane->Pitch;
+ d = r + 8 * 16;
+ break;
+
+ case 5: /* 01 01: bottom-right overhang */
+ s = ss + 7 * plane->Pitch + 7;
+ d = r + 8 * 16 + 8;
+ break;
+ }
+
+ if (d != NULL) {
+ __m64 pix = _mm_set1_pi8(s[0]);
+ *((__m64*)(d + 16 * 0)) = pix;
+ *((__m64*)(d + 16 * 1)) = pix;
+ *((__m64*)(d + 16 * 2)) = pix;
+ *((__m64*)(d + 16 * 3)) = pix;
+ *((__m64*)(d + 16 * 4)) = pix;
+ *((__m64*)(d + 16 * 5)) = pix;
+ *((__m64*)(d + 16 * 6)) = pix;
+ *((__m64*)(d + 16 * 7)) = pix;
+ }
+ }
+
+ { /* Edge fills: replicate the boundary column/row into the
+ neighboring quadrant; rr tracks where the real pixels go. */
+ const UINT8* sx = NULL;
+ UINT8* dx = r;
+
+ const UINT8* sy = NULL;
+ UINT8* dy = r;
+
+ if (xf == 2) { /* off the left edge */
+ sx = ss;
+ dy += 8;
+ rr += 8;
+ } else if (xf == 1) { /* off the right edge */
+ sx = ss + 7;
+ dx += 8;
+ }
+
+ if (yf == 2) { /* off the top edge */
+ sy = ss;
+ dx += 64 * 2;
+ rr += 64 * 2;
+ } else if (yf == 1) { /* off the bottom edge */
+ sy = ss + 7 * plane->Pitch;
+ dy += 64 * 2;
+ }
+
+ if (sx != NULL) { /* replicate a column across 8 rows */
+ *((__m64*)(dx + 16 * 0)) = _mm_set1_pi8(sx[0 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 1)) = _mm_set1_pi8(sx[1 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 2)) = _mm_set1_pi8(sx[2 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 3)) = _mm_set1_pi8(sx[3 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 4)) = _mm_set1_pi8(sx[4 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 5)) = _mm_set1_pi8(sx[5 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 6)) = _mm_set1_pi8(sx[6 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 7)) = _mm_set1_pi8(sx[7 * plane->Pitch]);
+ }
+
+ if (sy != NULL) { /* replicate a row down 8 rows */
+ __m64 pix = *((const __m64*)sy);
+ *((__m64*)(dy + 16 * 0)) = pix;
+ *((__m64*)(dy + 16 * 1)) = pix;
+ *((__m64*)(dy + 16 * 2)) = pix;
+ *((__m64*)(dy + 16 * 3)) = pix;
+ *((__m64*)(dy + 16 * 4)) = pix;
+ *((__m64*)(dy + 16 * 5)) = pix;
+ *((__m64*)(dy + 16 * 6)) = pix;
+ *((__m64*)(dy + 16 * 7)) = pix;
+ }
+ }
+
+ /* Paste the clamped real 8x8 source into its quadrant of r. */
+ *((__m64*)(rr + 16 * 0)) = *((const __m64*)(ss + 0 * plane->Pitch));
+ *((__m64*)(rr + 16 * 1)) = *((const __m64*)(ss + 1 * plane->Pitch));
+ *((__m64*)(rr + 16 * 2)) = *((const __m64*)(ss + 2 * plane->Pitch));
+ *((__m64*)(rr + 16 * 3)) = *((const __m64*)(ss + 3 * plane->Pitch));
+ *((__m64*)(rr + 16 * 4)) = *((const __m64*)(ss + 4 * plane->Pitch));
+ *((__m64*)(rr + 16 * 5)) = *((const __m64*)(ss + 5 * plane->Pitch));
+ *((__m64*)(rr + 16 * 6)) = *((const __m64*)(ss + 6 * plane->Pitch));
+ *((__m64*)(rr + 16 * 7)) = *((const __m64*)(ss + 7 * plane->Pitch));
+
+ /* Shift the output window by the (clamped) overhang amount. */
+ if (x < 0) {
+ if (x <= -8) x = -8;
+ rr += x;
+ } else if (x > plane->CX - 8) {
+ x -= plane->CX - 8;
+ if (x >= 8) x = 8;
+ rr += x;
+ }
+
+ if (y < 0) {
+ if (y <= -8) y = -8;
+ rr += y * 16;
+ } else if (y > plane->CY - 8) {
+ y -= plane->CY - 8;
+ if (y >= 8) y = 8;
+ rr += y * 16;
+ }
+
+ /* Copy the selected 8x8 window of r to the destination block. */
+ *((__m64*)(block + 0 * pitch)) = *((const __m64*)(rr + 16 * 0));
+ *((__m64*)(block + 1 * pitch)) = *((const __m64*)(rr + 16 * 1));
+ *((__m64*)(block + 2 * pitch)) = *((const __m64*)(rr + 16 * 2));
+ *((__m64*)(block + 3 * pitch)) = *((const __m64*)(rr + 16 * 3));
+ *((__m64*)(block + 4 * pitch)) = *((const __m64*)(rr + 16 * 4));
+ *((__m64*)(block + 5 * pitch)) = *((const __m64*)(rr + 16 * 5));
+ *((__m64*)(block + 6 * pitch)) = *((const __m64*)(rr + 16 * 6));
+ *((__m64*)(block + 7 * pitch)) = *((const __m64*)(rr + 16 * 7));
+}
+
+/* */
+
+/* Forward declarations: whole-pel (plain copy) and half-pel (two-tap
+   average of the (x0,y0)/(x1,y1) references) compensation kernels. */
+static void MotionComp_Compensate16x16_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y);
+
+static void MotionComp_Compensate8x8_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y);
+
+static void MotionComp_Compensate16x16H_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x0,
+ INT32 y0,
+ INT32 x1,
+ INT32 y1);
+
+static void MotionComp_Compensate8x8H_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x0,
+ INT32 y0,
+ INT32 x1,
+ INT32 y1);
+
+/* */
+
+/* Whole-pel 16x16 compensation: fast copy when the source rectangle is
+   fully inside the reference plane; otherwise fall back to four 8x8
+   compensations, which handle edge replication individually. */
+void MotionComp_Compensate16x16_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y)
+{
+ if (x >= 0 && x + 16 < r->CX &&
+ y >= 0 && y + 16 < r->CY) {
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ const UINT8* e = s + 16 * r->Pitch;
+ UINT8* d = p;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ /* Four rows per iteration; loads batched ahead of stores. */
+ while (s < e) {
+ s0 = *((const __m64*)(s + 0));
+ s1 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s2 = *((const __m64*)(s + 0));
+ s3 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s4 = *((const __m64*)(s + 0));
+ s5 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s6 = *((const __m64*)(s + 0));
+ s7 = *((const __m64*)(s + 8)); s += r->Pitch;
+
+ *((__m64*)(d + 0)) = s0;
+ *((__m64*)(d + 8)) = s1; d += pitch;
+ *((__m64*)(d + 0)) = s2;
+ *((__m64*)(d + 8)) = s3; d += pitch;
+ *((__m64*)(d + 0)) = s4;
+ *((__m64*)(d + 8)) = s5; d += pitch;
+ *((__m64*)(d + 0)) = s6;
+ *((__m64*)(d + 8)) = s7; d += pitch;
+ }
+
+ } else {
+ MotionComp_Compensate8x8_MMX(p, pitch, r, x, y );
+ MotionComp_Compensate8x8_MMX(p + 8, pitch, r, x + 8, y );
+ MotionComp_Compensate8x8_MMX(p + 8 * pitch, pitch, r, x, y + 8);
+ MotionComp_Compensate8x8_MMX(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
+ }
+}
+
+/* Whole-pel 8x8 compensation: edge-replicating extraction when the source
+   block crosses a plane boundary, otherwise a straight load-all/store-all
+   copy. */
+void MotionComp_Compensate8x8_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y)
+{
+ if (x < 0 || x + 8 >= r->CX ||
+ y < 0 || y + 8 >= r->CY) {
+ Block_Extract8x8_MMX(r, x, y, p, pitch);
+
+ } else {
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ INT32 p0 = r->Pitch;
+ UINT8* d = p;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ s0 = *((const __m64*)s); s += p0;
+ s1 = *((const __m64*)s); s += p0;
+ s2 = *((const __m64*)s); s += p0;
+ s3 = *((const __m64*)s); s += p0;
+ s4 = *((const __m64*)s); s += p0;
+ s5 = *((const __m64*)s); s += p0;
+ s6 = *((const __m64*)s); s += p0;
+ s7 = *((const __m64*)s);
+
+ *((__m64*)d) = s0; d += pitch;
+ *((__m64*)d) = s1; d += pitch;
+ *((__m64*)d) = s2; d += pitch;
+ *((__m64*)d) = s3; d += pitch;
+ *((__m64*)d) = s4; d += pitch;
+ *((__m64*)d) = s5; d += pitch;
+ *((__m64*)d) = s6; d += pitch;
+ *((__m64*)d) = s7;
+ }
+}
+
+/* */
+
+ALIGN(0x10) static const UINT8 MASK_FE[8] = { 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe };
+
+void MotionComp_Compensate16x16H_MMX(
+    UINT8* p,
+    INT32 pitch,
+    const Plane_t* r,
+    INT32 x0,
+    INT32 y0,
+    INT32 x1,
+    INT32 y1)
+{
+    /* Half-pel 16x16 compensation: each destination byte is the
+       rounded-down average of the two full-pel predictors at (x0, y0)
+       and (x1, y1):  avg = (a & b) + ((a ^ b) >> 1), where the
+       per-byte shift is emulated by masking low bits (MASK_FE) before
+       a 64-bit shift.  If either predictor is not strictly interior,
+       the work is deferred to the edge-aware 8x8 version. */
+    if (x0 < 0 || x0 + 16 >= r->CX ||
+        y0 < 0 || y0 + 16 >= r->CY ||
+        x1 < 0 || x1 + 16 >= r->CX ||
+        y1 < 0 || y1 + 16 >= r->CY) {
+        MotionComp_Compensate8x8H_MMX(p,                 pitch, r, x0,     y0,     x1,     y1    );
+        MotionComp_Compensate8x8H_MMX(p + 8,             pitch, r, x0 + 8, y0,     x1 + 8, y1    );
+        MotionComp_Compensate8x8H_MMX(p + 8 * pitch,     pitch, r, x0,     y0 + 8, x1,     y1 + 8);
+        MotionComp_Compensate8x8H_MMX(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
+
+    } else {
+        const UINT8* a = r->Plane + y0 * r->Pitch + x0;
+        const UINT8* b = r->Plane + y1 * r->Pitch + x1;
+        UINT8* dst = p;
+        INT32 row;
+
+        const __m64 F = *((const __m64*)MASK_FE);
+
+        for (row = 0; row < 16; row++) {
+            __m64 A0 = *((const __m64*)(a + 0));
+            __m64 B0 = *((const __m64*)(b + 0));
+            __m64 A1 = *((const __m64*)(a + 8));
+            __m64 B1 = *((const __m64*)(b + 8));
+
+            *((__m64*)(dst + 0)) =
+                _mm_add_pi8(_mm_and_si64(A0, B0),
+                            _mm_srli_si64(_mm_and_si64(_mm_xor_si64(A0, B0), F), 1));
+            *((__m64*)(dst + 8)) =
+                _mm_add_pi8(_mm_and_si64(A1, B1),
+                            _mm_srli_si64(_mm_and_si64(_mm_xor_si64(A1, B1), F), 1));
+
+            a += r->Pitch;
+            b += r->Pitch;
+            dst += pitch;
+        }
+    }
+}
+
+void MotionComp_Compensate8x8H_MMX(
+    UINT8* p,
+    INT32 pitch,
+    const Plane_t* r,
+    INT32 x0,
+    INT32 y0,
+    INT32 x1,
+    INT32 y1)
+{
+    /* Half-pel 8x8 compensation: averages the full-pel predictors at
+       (x0, y0) and (x1, y1) into p, rounding down.  If either
+       predictor is not strictly interior, both are first materialized
+       into local aligned 8x8 buffers by Block_Extract8x8_MMX and
+       averaged from there. */
+    ALIGN(0x10) UINT8 buf0[64], buf1[64];
+
+    const UINT8* a = r->Plane + y0 * r->Pitch + x0;
+    INT32 ap = r->Pitch;
+
+    const UINT8* b = r->Plane + y1 * r->Pitch + x1;
+    INT32 bp = r->Pitch;
+
+    UINT8* dst = p;
+    INT32 row;
+
+    const __m64 F = *((const __m64*)MASK_FE);
+
+    if (x0 < 0 || x0 + 8 >= r->CX ||
+        y0 < 0 || y0 + 8 >= r->CY ||
+        x1 < 0 || x1 + 8 >= r->CX ||
+        y1 < 0 || y1 + 8 >= r->CY) {
+        /* Border case: extract edge-clamped copies and average those. */
+        Block_Extract8x8_MMX(r, x0, y0, buf0, 8);
+        Block_Extract8x8_MMX(r, x1, y1, buf1, 8);
+
+        a = buf0;
+        ap = 8;
+
+        b = buf1;
+        bp = 8;
+    }
+
+    for (row = 0; row < 8; row++) {
+        __m64 A = *((const __m64*)a);
+        __m64 B = *((const __m64*)b);
+
+        /* (A & B) + (((A ^ B) & 0xfe..fe) >> 1): per-byte floor average. */
+        *((__m64*)dst) =
+            _mm_add_pi8(_mm_and_si64(A, B),
+                        _mm_srli_si64(_mm_and_si64(_mm_xor_si64(A, B), F), 1));
+
+        a += ap;
+        b += bp;
+        dst += pitch;
+    }
+}
+
+/* */
+
+void MotionComp_Block16x16_MMX(
+    Plane_t* p,
+    INT32 x,
+    INT32 y,
+    const Plane_t* r,
+    const MotionVector_t* mv)
+{
+    /* Apply the luma motion vector mv (half-pel units) to the 16x16
+       block of p anchored at (x, y), predicting from reference r.
+       Full-pel vectors take the plain copy path; half-pel vectors
+       average two full-pel predictors straddling the true position. */
+    INT32 fx = mv->X >> 1;   /* full-pel component (relies on arithmetic
+                                shift for negative vectors, as before) */
+    INT32 fy = mv->Y >> 1;
+    INT32 hx = mv->X & 1;    /* half-pel flags */
+    INT32 hy = mv->Y & 1;
+
+    UINT8* d = p->Plane + y * p->Pitch + x;
+
+    if (hx | hy) {
+        /* Second predictor steps one full pel toward the vector's sign
+           on each half-pel axis (same as vx[mv->X >= 0] += dx). */
+        INT32 gx = fx;
+        INT32 gy = fy;
+
+        if (hx) { if (mv->X >= 0) gx += 1; else fx += 1; }
+        if (hy) { if (mv->Y >= 0) gy += 1; else fy += 1; }
+
+        MotionComp_Compensate16x16H_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy,
+            x + gx,
+            y + gy);
+
+    } else {
+        MotionComp_Compensate16x16_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy);
+    }
+}
+
+void MotionComp_Block8x8Y_MMX(
+    Plane_t* p,
+    INT32 x,
+    INT32 y,
+    const Plane_t* r,
+    const MotionVector_t* mv)
+{
+    /* Apply the luma motion vector mv (half-pel units) to the 8x8
+       block of p anchored at (x, y), predicting from reference r.
+       Same scheme as the 16x16 version, on an 8x8 block. */
+    INT32 fx = mv->X >> 1;   /* full-pel component */
+    INT32 fy = mv->Y >> 1;
+    INT32 hx = mv->X & 1;    /* half-pel flags */
+    INT32 hy = mv->Y & 1;
+
+    UINT8* d = p->Plane + y * p->Pitch + x;
+
+    if (hx | hy) {
+        /* Second predictor steps one full pel toward the vector's sign
+           on each half-pel axis. */
+        INT32 gx = fx;
+        INT32 gy = fy;
+
+        if (hx) { if (mv->X >= 0) gx += 1; else fx += 1; }
+        if (hy) { if (mv->Y >= 0) gy += 1; else fy += 1; }
+
+        MotionComp_Compensate8x8H_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy,
+            x + gx,
+            y + gy);
+
+    } else {
+        MotionComp_Compensate8x8_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy);
+    }
+}
+
+void MotionComp_Block8x8C_MMX(
+    Plane_t* p,
+    INT32 x,
+    INT32 y,
+    const Plane_t* r,
+    const MotionVector_t* mv0)
+{
+    /* Chroma 8x8 compensation: the luma vector mv0 is first halved to
+       chroma resolution, OR-ing the dropped bit back in so half-pel
+       precision survives the scaling, then applied exactly like the
+       luma 8x8 case. */
+    INT32 cx = (mv0->X >> 1) | (mv0->X & 1);   /* chroma-scaled vector */
+    INT32 cy = (mv0->Y >> 1) | (mv0->Y & 1);
+
+    INT32 fx = cx >> 1;   /* full-pel component */
+    INT32 fy = cy >> 1;
+    INT32 hx = cx & 1;    /* half-pel flags */
+    INT32 hy = cy & 1;
+
+    UINT8* d = p->Plane + y * p->Pitch + x;
+
+    if (hx | hy) {
+        /* Second predictor steps one full pel toward the vector's sign
+           on each half-pel axis. */
+        INT32 gx = fx;
+        INT32 gy = fy;
+
+        if (hx) { if (cx >= 0) gx += 1; else fx += 1; }
+        if (hy) { if (cy >= 0) gy += 1; else fy += 1; }
+
+        MotionComp_Compensate8x8H_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy,
+            x + gx,
+            y + gy);
+
+    } else {
+        MotionComp_Compensate8x8_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy);
+    }
+}
+
+/* */
+