OSDN Git Service

add FrameReconstructor_SSE2.
author     Noumi Akira <noumiakira@users.sourceforge.jp>
Thu, 2 Jul 2009 08:34:43 +0000 (17:34 +0900)
committer  Noumi Akira <noumiakira@users.sourceforge.jp>
Thu, 2 Jul 2009 08:34:43 +0000 (17:34 +0900)
Lib/QTheoraEx/FrameDecoder.c
Lib/QTheoraEx/FrameDecoder.h
Lib/QTheoraEx/FrameDecoder_Impl.h
Lib/QTheoraEx/FrameReconstructor.h
Lib/QTheoraEx/FrameReconstructor_SSE2.c [new file with mode: 0644]
Lib/QTheoraEx/MotionComp_SSE2.c [new file with mode: 0644]
Lib/QTheoraEx/MotionComp_SSE2.h [new file with mode: 0644]
Lib/QTheoraEx/QTheoraEx.vcproj

diff --git a/Lib/QTheoraEx/FrameDecoder.c b/Lib/QTheoraEx/FrameDecoder.c
index 08b7afd..a7d4647 100644
@@ -630,6 +630,15 @@ BOOL QT_FrameDecoder_Setup(
 
        /* */
 
+       if (g_QT_Enable_SSE2) {
+               t->Reconstructor = (FrameReconstructor_SSE2_t*)QT_MemoryPool_Allocate(pool, sizeof(FrameReconstructor_SSE2_t));
+               if (t->Reconstructor == NULL) {
+                       return FALSE;
+               }
+       }
+
+       /* */
+
        t->QIndex = -1;
 
        /* */
@@ -691,6 +700,20 @@ BOOL QT_FrameDecoder_Setup(
 
        /* */
 
+       if (g_QT_Enable_SSE2) {
+               t->UpdateDequantizeMatrix = QT_UpdateDequantizeMatrix_SSE2;
+       } else {
+               t->UpdateDequantizeMatrix = NULL;
+       }
+
+       if (g_QT_Enable_SSE2) {
+               t->Reconstruct = QT_ReconstructFrame_SSE2;
+       } else {
+               t->Reconstruct = QT_ReconstructFrame;
+       }
+
+       /* */
+
        return TRUE;
 }
 
diff --git a/Lib/QTheoraEx/FrameDecoder.h b/Lib/QTheoraEx/FrameDecoder.h
index 9e58844..0255c1f 100644
@@ -75,6 +75,15 @@ struct MotionVector {
 
 typedef struct MotionVector MotionVector_t;
 
+/* FrameReconstructor_SSE2 */
+struct FrameReconstructor_SSE2 {
+
+       INT16 Matrix[2][3][64];
+
+}; /* FrameReconstructor_SSE2 */
+
+typedef struct FrameReconstructor_SSE2 FrameReconstructor_SSE2_t;
+
 /* QT_FrameDecoder */
 
 struct QT_FrameDecoder;
@@ -94,6 +103,8 @@ struct QT_FrameDecoder {
 
        DequantizeMatrix_t Dequantize;
 
+       FrameReconstructor_SSE2_t* Reconstructor;
+
        LoopFilter_t Filter;
 
        INT32 QIndex;
@@ -126,6 +137,10 @@ struct QT_FrameDecoder {
 
        BOOL (*Decode)(FrameDecoder_t*, const VOID*, SIZE_T);
 
+       VOID (*UpdateDequantizeMatrix)(FrameDecoder_t*);
+
+       VOID (*Reconstruct)(FrameDecoder_t*);
+
 }; /* QT_FrameDecoder */
 
 /* */
diff --git a/Lib/QTheoraEx/FrameDecoder_Impl.h b/Lib/QTheoraEx/FrameDecoder_Impl.h
index 9bcb6be..a7e6dcf 100644
@@ -610,6 +610,10 @@ static BOOL FrameDecoder_Decode(
                        &(t->Filter),
                        &(t->Setup->Filter),
                        t->QIndex);
+
+               if (t->UpdateDequantizeMatrix != NULL) {
+                       t->UpdateDequantizeMatrix(t);
+               }
        }
 
        /* */
@@ -717,7 +721,7 @@ static BOOL FrameDecoder_Decode(
 
        /* */
 
-       QT_ReconstructFrame(t);
+       t->Reconstruct(t);
 
        /* */
 
diff --git a/Lib/QTheoraEx/FrameReconstructor.h b/Lib/QTheoraEx/FrameReconstructor.h
index 68f9ea8..754dc1c 100644
@@ -5,9 +5,21 @@
 
 #include "FrameDecoder.h"
 
+/* */
+
 void QT_FrameLoopFilter(
        FrameDecoder_t* t);
 
 void QT_ReconstructFrame(
        FrameDecoder_t* t);
 
+/* */
+
+void QT_UpdateDequantizeMatrix_SSE2(
+       FrameDecoder_t* t);
+
+void QT_ReconstructFrame_SSE2(
+       FrameDecoder_t* t);
+
+/* */
+
diff --git a/Lib/QTheoraEx/FrameReconstructor_SSE2.c b/Lib/QTheoraEx/FrameReconstructor_SSE2.c
new file mode 100644
index 0000000..8b5e765
--- /dev/null
@@ -0,0 +1,872 @@
+/* FrameReconstructor_SSE2.c */
+/* 2009/07/02                */
+
+#include "StdAfx.h"
+
+#include "FrameReconstructor.h"
+
+#include "MotionComp_SSE2.h"
+
+/* */
+
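+/* C4799 ("function has no EMMS instruction"): the 8x8 copy paths below
+   use MMX (__m64) registers without an explicit _mm_empty().            */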
+#pragma warning(disable : 4799)
+
+/* */
+
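+/* Transpose an 8x8 block of 16-bit values using the unpacklo/unpackhi
+   16 -> 32 -> 64 ladder.                                               */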
+static void Transpose_SSE2(
+       const INT16* x,
+       INT16*       y)
+{
+       const __m128i* X = (const __m128i*)x;
+       __m128i*       Y = (__m128i*)y;
+
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+       __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+
+       t0 = _mm_loadu_si128(X + 0);
+       t1 = _mm_loadu_si128(X + 1);
+       t2 = _mm_loadu_si128(X + 2);
+       t3 = _mm_loadu_si128(X + 3);
+       t4 = _mm_loadu_si128(X + 4);
+       t5 = _mm_loadu_si128(X + 5);
+       t6 = _mm_loadu_si128(X + 6);
+       t7 = _mm_loadu_si128(X + 7);
+
+       u0 = _mm_unpacklo_epi16(t0, t1);
+       u1 = _mm_unpackhi_epi16(t0, t1);
+       u2 = _mm_unpacklo_epi16(t2, t3);
+       u3 = _mm_unpackhi_epi16(t2, t3);
+       u4 = _mm_unpacklo_epi16(t4, t5);
+       u5 = _mm_unpackhi_epi16(t4, t5);
+       u6 = _mm_unpacklo_epi16(t6, t7);
+       u7 = _mm_unpackhi_epi16(t6, t7);
+
+       t0 = _mm_unpacklo_epi32(u0, u2);
+       t1 = _mm_unpacklo_epi32(u1, u3);
+       t2 = _mm_unpackhi_epi32(u0, u2);
+       t3 = _mm_unpackhi_epi32(u1, u3);
+       t4 = _mm_unpacklo_epi32(u4, u6);
+       t5 = _mm_unpacklo_epi32(u5, u7);
+       t6 = _mm_unpackhi_epi32(u4, u6);
+       t7 = _mm_unpackhi_epi32(u5, u7);
+
+       Y[0] = _mm_unpacklo_epi64(t0, t4);
+       Y[1] = _mm_unpackhi_epi64(t0, t4);
+       Y[2] = _mm_unpacklo_epi64(t2, t6);
+       Y[3] = _mm_unpackhi_epi64(t2, t6);
+       Y[4] = _mm_unpacklo_epi64(t1, t5);
+       Y[5] = _mm_unpackhi_epi64(t1, t5);
+       Y[6] = _mm_unpacklo_epi64(t3, t7);
+       Y[7] = _mm_unpackhi_epi64(t3, t7);
+}
+
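+/* Pre-transpose the dequantization matrices (intra/inter x plane) so the
+   elementwise multiply in DequantizeIDCT8x8_SSE2 matches the transposed
+   coefficient layout produced by the TZZ reorder.                        */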
+void QT_UpdateDequantizeMatrix_SSE2(
+       FrameDecoder_t* t)
+{
+       FrameReconstructor_SSE2_t* r = t->Reconstructor;
+
+       INT32 i, p;
+
+       for (i = 0; i < 2; i++) {
+               for (p = 0; p < 3; p++) {
+                       const INT16* x = t->Dequantize.Matrix[i][p];
+                       INT16*       y = r->Matrix[i][p];
+                       Transpose_SSE2(x, y);
+               }
+       }
+}
+
+/* */
+
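+/* Straight copies of co-located 8x8 / 16x16 blocks from a reference plane
+   (MMX moves for 8x8, aligned SSE2 loads/stores for 16x16).               */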
+static __inline void Block_CopyPlane8x8_SSE2(
+       Plane_t* p,
+       INT32    x,
+       INT32    y,
+       Plane_t* r)
+{
+       const UINT8* s = r->Plane + y * r->Pitch + x;
+       UINT8*       d = p->Plane + y * p->Pitch + x;
+
+       __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+       s0 = *((const __m64*)s); s += r->Pitch;
+       s1 = *((const __m64*)s); s += r->Pitch;
+       s2 = *((const __m64*)s); s += r->Pitch;
+       s3 = *((const __m64*)s); s += r->Pitch;
+       s4 = *((const __m64*)s); s += r->Pitch;
+       s5 = *((const __m64*)s); s += r->Pitch;
+       s6 = *((const __m64*)s); s += r->Pitch;
+       s7 = *((const __m64*)s);
+
+       *((__m64*)d) = s0; d += p->Pitch;
+       *((__m64*)d) = s1; d += p->Pitch;
+       *((__m64*)d) = s2; d += p->Pitch;
+       *((__m64*)d) = s3; d += p->Pitch;
+       *((__m64*)d) = s4; d += p->Pitch;
+       *((__m64*)d) = s5; d += p->Pitch;
+       *((__m64*)d) = s6; d += p->Pitch;
+       *((__m64*)d) = s7;
+}
+
+static __inline void Block_CopyPlane16x16_SSE2(
+       Plane_t* p,
+       INT32    x,
+       INT32    y,
+       Plane_t* r)
+{
+       const UINT8* s = r->Plane + y * r->Pitch + x;
+       UINT8*       d = p->Plane + y * p->Pitch + x;
+
+       __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+       s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s7 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+
+       _mm_store_si128((__m128i*)d, s0); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s1); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s2); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s3); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s4); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s5); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s6); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s7); d += p->Pitch;
+
+       s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+       s7 = _mm_load_si128((const __m128i*)s);
+
+       _mm_store_si128((__m128i*)d, s0); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s1); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s2); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s3); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s4); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s5); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s6); d += p->Pitch;
+       _mm_store_si128((__m128i*)d, s7);
+}
+
+/* */
+
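+/* +128 DC offset added to intra coefficients before packing to unsigned 8-bit */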
+ALIGN(0x10) static const UINT16 IPRED[8] = {
+       128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static __inline void Block_CopyIntra8x8_SSE2(
+       Plane_t*     p,
+       INT32        x,
+       INT32        y,
+       const INT16* c)
+{
+       UINT8* d = p->Plane + y * p->Pitch + x;
+
+       const __m128i* B = (const __m128i*)IPRED;
+       const __m128i* C = (const __m128i*)c;
+
+       __m128i s0, s1, s2, s3;
+       const __m128i z = _mm_setzero_si128();
+
+       s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], B[0]), z);
+       s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], B[0]), z);
+       s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], B[0]), z);
+       s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], B[0]), z);
+
+       _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch;
+
+       s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], B[0]), z);
+       s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], B[0]), z);
+       s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], B[0]), z);
+       s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], B[0]), z);
+
+       _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s3);
+}
+
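+/* Add the residual coefficients to the existing prediction in place, with
+   signed saturation, then repack to 8-bit.                                 */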
+static __inline void Block_ReviseInter8x8_SSE2(
+       Plane_t*     p,
+       INT32        x,
+       INT32        y,
+       const INT16* c)
+{
+       UINT8*       d = p->Plane + y * p->Pitch + x;
+       const UINT8* s = d;
+
+       const __m128i* C = (const __m128i*)c;
+
+       __m128i b0, b1, b2, b3;
+       __m128i s0, s1, s2, s3;
+       const __m128i z = _mm_setzero_si128();
+
+       b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+       b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+       b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+       b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+
+       s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], b0), z);
+       s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], b1), z);
+       s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], b2), z);
+       s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], b3), z);
+
+       _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch;
+
+       b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+       b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+       b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+       b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z);
+
+       s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], b0), z);
+       s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], b1), z);
+       s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], b2), z);
+       s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], b3), z);
+
+       _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+       _mm_storel_epi64((__m128i*)d, s3);
+}
+
+/* */
+
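+/* Rows 1..7: cos(k*pi/16) scaled by 65536; row 0 is the rounding bias (8)
+   added before the final >>4 in the column pass.                          */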
+ALIGN(0x10) static const UINT16 COS[8][8] = {
+       {     8,     8,     8,     8,     8,     8,     8,     8 }, /* 0 */
+       { 64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277 }, /* 1 */
+       { 60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547 }, /* 2 */
+       { 54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491 }, /* 3 */
+       { 46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341 }, /* 4 */
+       { 36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410 }, /* 5 */
+       { 25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080 }, /* 6 */
+       { 12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785 }, /* 7 */
+};
+
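+/* MUL0(T,X) = (X * COS[T]) >> 16 via signed mulhi, valid while COS[T] < 0x8000;
+   MUL1 adds X back because larger constants wrap to COS[T] - 0x10000 in the
+   signed 16-bit multiply.                                                       */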
+#define MUL1(T,X) _mm_add_epi16(_mm_mulhi_epi16(X, C[T]), X)
+#define MUL0(T,X) _mm_mulhi_epi16(X, C[T])
+
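+/* 1-D 8-point IDCT over eight 16-bit lanes in parallel; IDCT_C_8_SSE2 below is
+   the same butterfly plus the final rounding bias and >>4 descale.             */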
+static __inline void IDCT_R_8_SSE2(
+       const INT16* x,
+       INT16*       y)
+{
+       const __m128i* C = (const __m128i*)COS[0];
+       const __m128i* X = (const __m128i*)x;
+       __m128i*       Y = (__m128i*)y;
+
+       __m128i s0;
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+       /* Stage.1 */
+
+       s0 = _mm_add_epi16(X[0], X[4]);
+       t0 = MUL1(4, s0);
+
+       s0 = _mm_sub_epi16(X[0], X[4]);
+       t1 = MUL1(4, s0);
+
+       t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6]));
+       t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6]));
+
+       t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7]));
+       t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3]));
+
+       t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3]));
+       t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7]));
+
+       /* Stage.2 */
+
+       s0 = _mm_sub_epi16(t4, t5);
+       t4 = _mm_add_epi16(t4, t5);
+       t5 = MUL1(4, s0);
+
+       s0 = _mm_sub_epi16(t7, t6);
+       t7 = _mm_add_epi16(t7, t6);
+       t6 = MUL1(4, s0);
+
+       /* Stage.3 */
+
+       s0 = _mm_sub_epi16(t0, t3);
+       t0 = _mm_add_epi16(t0, t3);
+
+       t3 = _mm_sub_epi16(t1, t2);
+       t1 = _mm_add_epi16(t1, t2);
+
+       t2 = _mm_sub_epi16(t6, t5);
+       t6 = _mm_add_epi16(t6, t5);
+
+       /* Stage.4 */
+
+       Y[0] = _mm_add_epi16(t0, t7);
+       Y[1] = _mm_add_epi16(t1, t6);
+       Y[2] = _mm_add_epi16(t3, t2);
+       Y[3] = _mm_add_epi16(s0, t4);
+       Y[4] = _mm_sub_epi16(s0, t4);
+       Y[5] = _mm_sub_epi16(t3, t2);
+       Y[6] = _mm_sub_epi16(t1, t6);
+       Y[7] = _mm_sub_epi16(t0, t7);
+}
+
+static __inline void IDCT_C_8_SSE2(
+       const INT16* x,
+       INT16*       y)
+{
+       const __m128i* C = (const __m128i*)COS[0];
+       const __m128i* X = (const __m128i*)x;
+       __m128i*       Y = (__m128i*)y;
+
+       __m128i s0;
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+       /* Stage.1 */
+
+       s0 = _mm_add_epi16(X[0], X[4]);
+       t0 = MUL1(4, s0);
+
+       s0 = _mm_sub_epi16(X[0], X[4]);
+       t1 = MUL1(4, s0);
+
+       t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6]));
+       t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6]));
+
+       t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7]));
+       t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3]));
+
+       t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3]));
+       t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7]));
+
+       /* Stage.2 */
+
+       s0 = _mm_sub_epi16(t4, t5);
+       t4 = _mm_add_epi16(t4, t5);
+       t5 = MUL1(4, s0);
+
+       s0 = _mm_sub_epi16(t7, t6);
+       t7 = _mm_add_epi16(t7, t6);
+       t6 = MUL1(4, s0);
+
+       /* Stage.3 */
+
+       s0 = _mm_sub_epi16(t0, t3);
+       t0 = _mm_add_epi16(t0, t3);
+
+       t3 = _mm_sub_epi16(t1, t2);
+       t1 = _mm_add_epi16(t1, t2);
+
+       t2 = _mm_sub_epi16(t6, t5);
+       t6 = _mm_add_epi16(t6, t5);
+
+       /* Stage.4 */
+
+       Y[0] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t7), C[0]), 4);
+       Y[1] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t6), C[0]), 4);
+       Y[2] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t3, t2), C[0]), 4);
+       Y[3] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, t4), C[0]), 4);
+       Y[4] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(s0, t4), C[0]), 4);
+       Y[5] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t3, t2), C[0]), 4);
+       Y[6] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t1, t6), C[0]), 4);
+       Y[7] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t0, t7), C[0]), 4);
+}
+
+static __inline void Transpose_U_SSE2(
+       const INT16* x,
+       INT16*       y)
+{
+       const __m128i* X = (const __m128i*)x;
+       __m128i*       Y = (__m128i*)y;
+
+       __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+       u0 = _mm_unpacklo_epi16(X[0], X[1]);
+       u1 = _mm_unpackhi_epi16(X[0], X[1]);
+       u2 = _mm_unpacklo_epi16(X[2], X[3]);
+       u3 = _mm_unpackhi_epi16(X[2], X[3]);
+       u4 = _mm_unpacklo_epi16(X[4], X[5]);
+       u5 = _mm_unpackhi_epi16(X[4], X[5]);
+       u6 = _mm_unpacklo_epi16(X[6], X[7]);
+       u7 = _mm_unpackhi_epi16(X[6], X[7]);
+
+       t0 = _mm_unpacklo_epi32(u0, u2);
+       t1 = _mm_unpacklo_epi32(u1, u3);
+       t2 = _mm_unpackhi_epi32(u0, u2);
+       t3 = _mm_unpackhi_epi32(u1, u3);
+       t4 = _mm_unpacklo_epi32(u4, u6);
+       t5 = _mm_unpacklo_epi32(u5, u7);
+       t6 = _mm_unpackhi_epi32(u4, u6);
+       t7 = _mm_unpackhi_epi32(u5, u7);
+
+       Y[0] = _mm_unpacklo_epi64(t0, t4);
+       Y[1] = _mm_unpackhi_epi64(t0, t4);
+       Y[2] = _mm_unpacklo_epi64(t2, t6);
+       Y[3] = _mm_unpackhi_epi64(t2, t6);
+       Y[4] = _mm_unpacklo_epi64(t1, t5);
+       Y[5] = _mm_unpackhi_epi64(t1, t5);
+       Y[6] = _mm_unpacklo_epi64(t3, t7);
+       Y[7] = _mm_unpackhi_epi64(t3, t7);
+}
+
+/* */
+
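+/* Zig-zag scan indices, stored transposed, so the reordered block comes out
+   transposed and the "row" IDCT pass below effectively starts with columns. */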
+static const UINT8 TZZ[64] = {
+        0,  2,  3,  9, 10, 20, 21, 35,
+        1,  4,  8, 11, 19, 22, 34, 36,
+        5,  7, 12, 18, 23, 33, 37, 48,
+        6, 13, 17, 24, 32, 38, 47, 49,
+       14, 16, 25, 31, 39, 46, 50, 57,
+       15, 26, 30, 40, 45, 51, 56, 58,
+       27, 29, 41, 44, 52, 55, 59, 62,
+       28, 42, 43, 53, 54, 60, 61, 63
+};
+
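+/* One 8x8 block: de-zig-zag (transposed), dequantize elementwise, row IDCT,
+   transpose, column IDCT with rounding and >>4.                             */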
+static __inline void DequantizeIDCT8x8_SSE2(
+       const INT16* block,
+       const INT16* matrix,
+       INT16*       coeff)
+{
+       ALIGN(0x10) INT16 c0[64];
+
+       { /* Reorder */
+               const UINT8* t = TZZ;
+
+               INT16* c = c0;
+               INT16* e = c + 64;
+               for (; c < e; c += 4, t += 4) {
+                       c[0] = block[t[0]];
+                       c[1] = block[t[1]];
+                       c[2] = block[t[2]];
+                       c[3] = block[t[3]];
+               }
+       }
+
+       { /* Dequantize */
+               const __m128i* m = (const __m128i*)matrix;
+               __m128i*       d = (__m128i*)c0;
+
+               d[0] = _mm_mullo_epi16(d[0], m[0]);
+               d[1] = _mm_mullo_epi16(d[1], m[1]);
+               d[2] = _mm_mullo_epi16(d[2], m[2]);
+               d[3] = _mm_mullo_epi16(d[3], m[3]);
+               d[4] = _mm_mullo_epi16(d[4], m[4]);
+               d[5] = _mm_mullo_epi16(d[5], m[5]);
+               d[6] = _mm_mullo_epi16(d[6], m[6]);
+               d[7] = _mm_mullo_epi16(d[7], m[7]);
+       }
+
+       /* iDCT Row */
+       IDCT_R_8_SSE2(c0, coeff);
+
+       /* Transpose */
+       Transpose_U_SSE2(coeff, c0);
+
+       /* iDCT Column */
+       IDCT_C_8_SSE2(c0, coeff);
+}
+
+/* */
+
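+/* Token cursors for reconstruction: one zero-run/value stream per coefficient
+   index, plus the outstanding end-of-block run count for that index.          */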
+struct DecodeCoefficientsContext {
+
+       INT32 EOB_Run[64];
+
+       INT8*  Run  [64];
+       INT16* Coeff[64];
+
+}; /* DecodeCoefficientsContext */
+
+typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
+
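+/* Expand one block from the token streams: a non-negative run inserts that
+   many zeros before the value, a negative run loads an end-of-block counter
+   for the current coefficient index; remaining coefficients are zeroed.     */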
+static INT32 DecodeCoefficients(
+       FrameDecoder_t*              t,
+       DecodeCoefficientsContext_t* ctx,
+       INT16*                       block)
+{
+       INT16* b = block;
+       INT16* e = b + 64;
+
+       INT32 i = 0;
+
+       while (b < e) {
+               if (ctx->EOB_Run[i] > 0) {
+                       ctx->EOB_Run[i] -= 1;
+                       break;
+
+               } else {
+                       INT32 run   = *((ctx->Run  [i])++);
+                       INT16 coeff = *((ctx->Coeff[i])++);
+
+                       if (run < 0) {
+                               ctx->EOB_Run[i] = coeff;
+
+                       } else {
+                               INT16* p = b + run;
+                               if (p >= e) {
+                                       break;
+                               }
+
+                               while (b < p) {
+                                       *(b++) = 0;
+                               }
+
+                               *(b++) = coeff;
+
+                               i = b - block;
+                       }
+               }
+       }
+
+       while (b < e) {
+               *(b++) = 0;
+       }
+
+       return i;
+}
+
+/* */
+
+static void Reconstruct_IntraBlock(
+       FrameDecoder_t*              t,
+       Plane_t*                     p,
+       INT32                        x,
+       INT32                        y,
+       INT16                        dc,
+       INT32                        plane,
+       Plane_t*                     r,
+       DecodeCoefficientsContext_t* ctx)
+{
+       ALIGN(0x10) INT16 block[64];
+       ALIGN(0x10) INT16 coeff[64];
+
+       const INT16 (*mat)[64] = t->Reconstructor->Matrix[0];
+
+       if (dc == NOT_CODED) {
+               Block_CopyPlane8x8_SSE2(p, x, y, r);
+               return;
+       }
+
+       DecodeCoefficients(t, ctx, block);
+
+       block[0] = dc;
+
+       DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
+
+       Block_CopyIntra8x8_SSE2(p, x, y, coeff);
+}
+
+static void Reconstruct_InterBlock(
+       FrameDecoder_t*              t,
+       Plane_t*                     p,
+       INT32                        x,
+       INT32                        y,
+       INT16                        dc,
+       INT32                        plane,
+       Plane_t*                     r,
+       DecodeCoefficientsContext_t* ctx)
+{
+       ALIGN(0x10) INT16 block[64];
+       ALIGN(0x10) INT16 coeff[64];
+
+       const INT16 (*mat)[64] = t->Reconstructor->Matrix[1];
+
+       if (dc == NOT_CODED) {
+               if (r != NULL) {
+                       Block_CopyPlane8x8_SSE2(p, x, y, r);
+               }
+               return;
+       }
+
+       DecodeCoefficients(t, ctx, block);
+
+       block[0] = dc;
+
+       DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
+
+       Block_ReviseInter8x8_SSE2(p, x, y, coeff);
+}
+
+/* */
+
+/* */
+
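+/* Pixel offsets of the sixteen 8x8 blocks (S_*) and four 16x16 macroblocks
+   (M_*) within a 32x32 superblock, in coded (Hilbert) order.               */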
+static const INT8 S_PX[16] = {
+       0*8, 1*8, 1*8, 0*8,
+       0*8, 0*8, 1*8, 1*8,
+       2*8, 2*8, 3*8, 3*8,
+       3*8, 2*8, 2*8, 3*8
+};
+
+static const INT8 S_PY[16] = {
+       0*8, 0*8, 1*8, 1*8,
+       2*8, 3*8, 3*8, 2*8,
+       2*8, 3*8, 3*8, 2*8,
+       1*8, 1*8, 0*8, 0*8
+};
+
+static const INT8 M_PX[4] = {
+       0*16, 0*16,
+       1*16, 1*16
+};
+
+static const INT8 M_PY[4] = {
+       0*16, 1*16,
+       1*16, 0*16
+};
+
+/* */
+
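+/* Luma reconstruction: walk superblocks, dispatch on each macroblock's coding
+   mode, build the prediction (copy or motion compensation), then decode and
+   apply the four 8x8 residual blocks.                                         */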
+static void Reconstruct_YPlane_SSE2(
+       FrameDecoder_t* t)
+{
+       INT32 x, y;
+
+       INT32 sx = t->Index->SX[0] * 32;
+       INT32 sy = t->Index->SY[0] * 32;
+
+       INT32 mx = t->Index->MX * 16;
+       INT32 my = t->Index->MY * 16;
+
+       INT32 bx = t->Index->BX[0];
+
+       const UINT16* bi = t->Index->BIndex[0];
+
+       Plane_t* g = t->Frame[0];
+       Plane_t* p = t->Frame[1];
+       Plane_t* r = t->Frame[2];
+
+       const UINT8*          mm = t->MBMode;
+       const MotionVector_t* mv = t->MV;
+
+       DecodeCoefficientsContext_t ctx = { 0 };
+
+       INT32 i;
+       for (i = 0; i < 64; i++) {
+               ctx.Run  [i] = t->BRun  [0][i];
+               ctx.Coeff[i] = t->BCoeff[0][i];
+       }
+
+       for (y = 0; y < sy; y += 32) {
+               for (x = 0; x < sx; x += 32) {
+                       INT32 i = 0;
+
+                       INT32 m;
+                       for (m = 0; m < 4; m++, i += 4) {
+                               INT32 x0 = x + M_PX[m];
+                               INT32 y0 = y + M_PY[m];
+                               if (x0 < mx && y0 < my) {
+                                       switch (*mm) {
+                                       case 0: /* INTER_NOMV */
+                                               Block_CopyPlane16x16_SSE2(p, x0, y0, r);
+
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, NULL, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, NULL, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, NULL, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, NULL, &ctx);
+                                               break;
+
+                                       case 1: /* INTRA */
+                                               Reconstruct_IntraBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+                                               Reconstruct_IntraBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+                                               Reconstruct_IntraBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+                                               Reconstruct_IntraBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+                                               break;
+
+                                       case 2: /* INTER_MV */
+                                       case 3: /* INTER_MV_LAST */
+                                       case 4: /* INTER_MV_LAST2 */
+                                               MotionComp_Block16x16_SSE2(p, x0, y0, r, mv);
+
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+                                               break;
+
+                                       case 5: /* INTER_GOLDEN_NOMV */
+                                               Block_CopyPlane16x16_SSE2(p, x0, y0, g);
+
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+                                               break;
+
+                                       case 6: /* INTER_GOLDEN_MV */
+                                               MotionComp_Block16x16_SSE2(p, x0, y0, g, mv);
+
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+                                               break;
+
+                                       case 7: /* INTER_MV_FOUR */
+                                       {
+                                               const MotionVector_t* v = mv;
+
+                                               const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx;
+
+                                               if (dc[0] != NOT_CODED) {
+                                                       MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 0, r, v++);
+                                               }
+
+                                               if (dc[1] != NOT_CODED) {
+                                                       MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 0, r, v++);
+                                               }
+
+                                               if (dc[0 + bx] != NOT_CODED) {
+                                                       MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 8, r, v++);
+                                               }
+
+                                               if (dc[1 + bx] != NOT_CODED) {
+                                                       MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 8, r, v++);
+                                               }
+
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+                                               Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+                                               break;
+                                       }
+
+                                       } /* switch */
+
+                                       bi += 4;
+                                       mm += 1;
+                                       mv += 4;
+                               }
+                       }
+               }
+       }
+}
+
+/* */
+
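+/* Chroma reconstruction: both chroma planes per block, with per-block modes
+   read from t->BMode past the luma section and vectors from t->MVC.         */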
+static void Reconstruct_CPlane_SSE2(
+       FrameDecoder_t* t)
+{
+       INT32 x, y;
+
+       INT32 sx = t->Index->SX[1] * 32;
+       INT32 sy = t->Index->SY[1] * 32;
+
+       INT32 mx = t->Index->MX * 8;
+       INT32 my = t->Index->MY * 8;
+
+       INT32 bx = t->Index->BX[1];
+
+       const UINT16* bi = t->Index->BIndex[1];
+
+       Plane_t* g = t->Frame[0];
+       Plane_t* p = t->Frame[1];
+       Plane_t* r = t->Frame[2];
+
+       const INT16* DC0 = t->DC + t->Index->BC[0];
+       const INT16* DC1 = DC0   + t->Index->BC[1];
+
+       const UINT8* m = t->BMode + t->Index->BC[0];
+
+       DecodeCoefficientsContext_t ctx[2] = { 0 };
+
+       INT32 i;
+       for (i = 0; i < 64; i++) {
+               ctx[0].Run  [i] = t->BRun  [1][i];
+               ctx[0].Coeff[i] = t->BCoeff[1][i];
+
+               ctx[1].Run  [i] = t->BRun  [2][i];
+               ctx[1].Coeff[i] = t->BCoeff[2][i];
+       }
+
+       for (y = 0; y < sy; y += 32) {
+               for (x = 0; x < sx; x += 32) {
+                       INT32 i;
+                       for (i = 0; i < 16; i++) {
+                               INT32 xx = x + S_PX[i];
+                               INT32 yy = y + S_PY[i];
+
+                               if (xx < mx && yy < my) {
+                                       INT32 idx = (xx >> 3) + (yy >> 3) * bx;
+
+                                       switch (m[idx]) {
+                                       case 0: /* INTER_NOMV */
+                                               Block_CopyPlane8x8_SSE2(p + 1, xx, yy, r + 1);
+                                               Block_CopyPlane8x8_SSE2(p + 2, xx, yy, r + 2);
+
+                                               Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, NULL, ctx + 0);
+                                               Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, NULL, ctx + 1);
+                                               break;
+
+                                       case 1: /* INTRA */
+                                               Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+                                               Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+                                               break;
+
+                                       case 2: /* INTER_MV */
+                                       case 3: /* INTER_MV_LAST */
+                                       case 4: /* INTER_MV_LAST2 */
+                                               MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx);
+                                               MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+                                               Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+                                               Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+                                               break;
+
+                                       case 5: /* INTER_GOLDEN_NOMV */
+                                               Block_CopyPlane8x8_SSE2(p + 1, xx, yy, g + 1);
+                                               Block_CopyPlane8x8_SSE2(p + 2, xx, yy, g + 2);
+
+                                               Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+                                               Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+                                               break;
+
+                                       case 6: /* INTER_GOLDEN_MV */
+                                               MotionComp_Block8x8C_SSE2(p + 1, xx, yy, g + 1, t->MVC + idx);
+                                               MotionComp_Block8x8C_SSE2(p + 2, xx, yy, g + 2, t->MVC + idx);
+
+                                               Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+                                               Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+                                               break;
+
+                                       case 7: /* INTER_MV_FOUR */
+                                               MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx);
+                                               MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+                                               Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+                                               Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+                                               break;
+
+                                       } /* switch */
+                               }
+                       }
+               }
+       }
+}
+
+/* */
+
+void QT_ReconstructFrame_SSE2(
+       FrameDecoder_t* t)
+{
+       Reconstruct_YPlane_SSE2(t);
+
+       Reconstruct_CPlane_SSE2(t);
+
+       if (t->Filter.Limit > 0) {
+               QT_FrameLoopFilter(t);
+       }
+}
+
+/* */
+
diff --git a/Lib/QTheoraEx/MotionComp_SSE2.c b/Lib/QTheoraEx/MotionComp_SSE2.c
new file mode 100644
index 0000000..078145f
--- /dev/null
@@ -0,0 +1,466 @@
+/* MotionComp_SSE2.c */
+/* 2009/07/02        */
+
+#include "StdAfx.h"
+
+#include "MotionComp_SSE2.h"
+
+/* */
+
+#pragma warning(disable : 4799)
+
+/* */
+
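+/* Fetch an 8x8 block with edge clamping, for references that extend past the
+   plane borders.                                                             */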
+static void Block_Extract8x8(
+       const Plane_t* plane,
+       INT32          x,
+       INT32          y,
+       UINT8*         block)
+{
+       INT32 i, j;
+
+       for (i = 0; i < 8; i++) {
+               for (j = 0; j < 8; j++) {
+                       INT32 xx = x + j;
+                       INT32 yy = y + i;
+
+                       if (xx < 0) {
+                               xx = 0;
+                       } else if (xx >= plane->CX) {
+                               xx = plane->CX - 1;
+                       }
+
+                       if (yy < 0) {
+                               yy = 0;
+                       } else if (yy >= plane->CY) {
+                               yy = plane->CY - 1;
+                       }
+
+                       block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
+               }
+       }
+}
+
+/* */
+
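+/* Full-pel 16x16 copy from the reference; if the source crosses the plane
+   border, fall back to four 8x8 copies that clamp via Block_Extract8x8.   */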
+void MotionComp_Compensate16x16_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x,
+       INT32          y)
+{
+       if (x >= 0 && x + 16 < r->CX &&
+               y >= 0 && y + 16 < r->CY) {
+               const UINT8* s = r->Plane + y * r->Pitch + x;
+               UINT8*       d = p;
+
+               __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+               s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+
+               _mm_store_si128((__m128i*)d, s0); d += pitch;
+               _mm_store_si128((__m128i*)d, s1); d += pitch;
+               _mm_store_si128((__m128i*)d, s2); d += pitch;
+               _mm_store_si128((__m128i*)d, s3); d += pitch;
+               _mm_store_si128((__m128i*)d, s4); d += pitch;
+               _mm_store_si128((__m128i*)d, s5); d += pitch;
+               _mm_store_si128((__m128i*)d, s6); d += pitch;
+               _mm_store_si128((__m128i*)d, s7); d += pitch;
+
+               s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+               s7 = _mm_loadu_si128((const __m128i*)s);
+
+               _mm_store_si128((__m128i*)d, s0); d += pitch;
+               _mm_store_si128((__m128i*)d, s1); d += pitch;
+               _mm_store_si128((__m128i*)d, s2); d += pitch;
+               _mm_store_si128((__m128i*)d, s3); d += pitch;
+               _mm_store_si128((__m128i*)d, s4); d += pitch;
+               _mm_store_si128((__m128i*)d, s5); d += pitch;
+               _mm_store_si128((__m128i*)d, s6); d += pitch;
+               _mm_store_si128((__m128i*)d, s7);
+
+       } else {
+               MotionComp_Compensate8x8_SSE2(p,                 pitch, r, x,     y    );
+               MotionComp_Compensate8x8_SSE2(p + 8,             pitch, r, x + 8, y    );
+               MotionComp_Compensate8x8_SSE2(p     + 8 * pitch, pitch, r, x,     y + 8);
+               MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
+       }
+}
+
+void MotionComp_Compensate8x8_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x,
+       INT32          y)
+{
+       ALIGN(0x10) UINT8 b[64];
+
+       const UINT8* s  = r->Plane + y * r->Pitch + x;
+       INT32        p0 = r->Pitch;
+       UINT8*       d  = p;
+
+       __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+       if (x < 0 || x + 8 >= r->CX ||
+               y < 0 || y + 8 >= r->CY) {
+               s  = b;
+               p0 = 8;
+               Block_Extract8x8(r, x, y, b);
+       }
+
+       s0 = *((const __m64*)s); s += p0;
+       s1 = *((const __m64*)s); s += p0;
+       s2 = *((const __m64*)s); s += p0;
+       s3 = *((const __m64*)s); s += p0;
+       s4 = *((const __m64*)s); s += p0;
+       s5 = *((const __m64*)s); s += p0;
+       s6 = *((const __m64*)s); s += p0;
+       s7 = *((const __m64*)s);
+
+       *((__m64*)d) = s0; d += pitch;
+       *((__m64*)d) = s1; d += pitch;
+       *((__m64*)d) = s2; d += pitch;
+       *((__m64*)d) = s3; d += pitch;
+       *((__m64*)d) = s4; d += pitch;
+       *((__m64*)d) = s5; d += pitch;
+       *((__m64*)d) = s6; d += pitch;
+       *((__m64*)d) = s7;
+}
+
+/* */
+
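+/* _mm_avg_epu8/_mm_avg_pu8 round up; subtracting (S0 ^ S1) & 1 yields the
+   truncating (a + b) >> 1 average used for the half-pel predictors.       */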
+ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+
+void MotionComp_Compensate16x16H_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x0,
+       INT32          y0,
+       INT32          x1,
+       INT32          y1)
+{
+       if (x0 >= 0 && x0 + 16 < r->CX &&
+               y0 >= 0 && y0 + 16 < r->CY &&
+               x1 >= 0 && x1 + 16 < r->CX &&
+               y1 >= 0 && y1 + 16 < r->CY) {
+               const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
+               const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
+
+               UINT8* d = p;
+
+               __m128i S0, S1, D;
+               const __m128i M = *((const __m128i*)MASK_1);
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+               S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D); d += pitch;
+
+               S0 = _mm_loadu_si128((const __m128i*)s0);
+               S1 = _mm_loadu_si128((const __m128i*)s1);
+               D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+               _mm_store_si128((__m128i*)d, D);
+
+       } else {
+               MotionComp_Compensate8x8H_SSE2(p,                 pitch, r, x0,     y0    , x1,     y1    );
+               MotionComp_Compensate8x8H_SSE2(p + 8,             pitch, r, x0 + 8, y0    , x1 + 8, y1    );
+               MotionComp_Compensate8x8H_SSE2(p     + 8 * pitch, pitch, r, x0,     y0 + 8, x1,     y1 + 8);
+               MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
+       }
+}
+
+void MotionComp_Compensate8x8H_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x0,
+       INT32          y0,
+       INT32          x1,
+       INT32          y1)
+{
+       ALIGN(0x10) UINT8 b0[64], b1[64];
+
+       const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
+       INT32        p0 = r->Pitch;
+
+       const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
+       INT32        p1 = r->Pitch;
+
+       UINT8* d = p;
+
+       __m64 S0, S1, D;
+       const __m64 M = *((const __m64*)MASK_1);
+
+       if (x0 < 0 || x0 + 8 >= r->CX ||
+               y0 < 0 || y0 + 8 >= r->CY ||
+               x1 < 0 || x1 + 8 >= r->CX ||
+               y1 < 0 || y1 + 8 >= r->CY) {
+               s0 = b0;
+               p0 = 8;
+
+               s1 = b1;
+               p1 = 8;
+
+               Block_Extract8x8(r, x0, y0, b0);
+               Block_Extract8x8(r, x1, y1, b1);
+       }
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0); s0 += p0;
+       S1 = *((const __m64*)s1); s1 += p1;
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D; d += pitch;
+
+       S0 = *((const __m64*)s0);
+       S1 = *((const __m64*)s1);
+       D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+       *((__m64*)d) = D;
+}
+
+/* */
+
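+/* Split the half-pel motion vector into full-pel predictors: with a half bit
+   set, two predictors bracketing the true position are averaged by the *H
+   variants (the sign-indexed += keeps the pair right for negative vectors).  */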
+void MotionComp_Block16x16_SSE2(
+       Plane_t*              p,
+       INT32                 x,
+       INT32                 y,
+       const Plane_t*        r,
+       const MotionVector_t* mv)
+{
+       INT32 dx = ((mv->X & 1) != 0);
+       INT32 dy = ((mv->Y & 1) != 0);
+
+       INT32 vx[2] = { mv->X >> 1 };
+       INT32 vy[2] = { mv->Y >> 1 };
+
+       UINT8* d = p->Plane + y * p->Pitch + x;
+
+       if (dx == 0 && dy == 0) {
+               MotionComp_Compensate16x16_SSE2(
+                       d,
+                       p->Pitch,
+                       r,
+                       x + vx[0],
+                       y + vy[0]);
+
+       } else {
+               vx[1] = vx[0];
+               vy[1] = vy[0];
+
+               vx[mv->X >= 0] += dx;
+               vy[mv->Y >= 0] += dy;
+
+               MotionComp_Compensate16x16H_SSE2(
+                       d,
+                       p->Pitch,
+                       r,
+                       x + vx[0],
+                       y + vy[0],
+                       x + vx[1],
+                       y + vy[1]);
+       }
+}
+
+void MotionComp_Block8x8Y_SSE2(
+       Plane_t*              p,
+       INT32                 x,
+       INT32                 y,
+       const Plane_t*        r,
+       const MotionVector_t* mv)
+{
+       INT32 dx = ((mv->X & 1) != 0);
+       INT32 dy = ((mv->Y & 1) != 0);
+
+       INT32 vx[2] = { mv->X >> 1 };
+       INT32 vy[2] = { mv->Y >> 1 };
+
+       UINT8* d = p->Plane + y * p->Pitch + x;
+
+       if (dx == 0 && dy == 0) {
+               MotionComp_Compensate8x8_SSE2(
+                       d,
+                       p->Pitch,
+                       r,
+                       x + vx[0],
+                       y + vy[0]);
+
+       } else {
+               vx[1] = vx[0];
+               vy[1] = vy[0];
+
+               vx[mv->X >= 0] += dx;
+               vy[mv->Y >= 0] += dy;
+
+               MotionComp_Compensate8x8H_SSE2(
+                       d,
+                       p->Pitch,
+                       r,
+                       x + vx[0],
+                       y + vy[0],
+                       x + vx[1],
+                       y + vy[1]);
+       }
+}
+
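+/* Chroma block: (v >> 1) | (v & 1) halves the incoming vector while keeping a
+   half-pel bit for the subsampled plane; the rest mirrors the 8x8 luma path.  */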
+void MotionComp_Block8x8C_SSE2(
+       Plane_t*              p,
+       INT32                 x,
+       INT32                 y,
+       const Plane_t*        r,
+       const MotionVector_t* mv0)
+{
+       MotionVector_t mv = {
+               (mv0->X >> 1) | (mv0->X & 1),
+               (mv0->Y >> 1) | (mv0->Y & 1)
+       };
+
+       INT32 dx = ((mv.X & 1) != 0);
+       INT32 dy = ((mv.Y & 1) != 0);
+
+       INT32 vx[2] = { mv.X >> 1 };
+       INT32 vy[2] = { mv.Y >> 1 };
+
+       UINT8* d = p->Plane + y * p->Pitch + x;
+
+       if (dx == 0 && dy == 0) {
+               MotionComp_Compensate8x8_SSE2(
+                       d,
+                       p->Pitch,
+                       r,
+                       x + vx[0],
+                       y + vy[0]);
+
+       } else {
+               vx[1] = vx[0];
+               vy[1] = vy[0];
+
+               vx[mv.X >= 0] += dx;
+               vy[mv.Y >= 0] += dy;
+
+               MotionComp_Compensate8x8H_SSE2(
+                       d,
+                       p->Pitch,
+                       r,
+                       x + vx[0],
+                       y + vy[0],
+                       x + vx[1],
+                       y + vy[1]);
+       }
+}
+
+/* */
+
diff --git a/Lib/QTheoraEx/MotionComp_SSE2.h b/Lib/QTheoraEx/MotionComp_SSE2.h
new file mode 100644
index 0000000..59020a0
--- /dev/null
@@ -0,0 +1,66 @@
+/* MotionComp_SSE2.h */
+/* 2009/07/02        */
+
+#pragma once
+
+#include "FrameDecoder.h"
+
+/* */
+
+void MotionComp_Compensate16x16_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x,
+       INT32          y);
+
+void MotionComp_Compensate8x8_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x,
+       INT32          y);
+
+void MotionComp_Compensate16x16H_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x0,
+       INT32          y0,
+       INT32          x1,
+       INT32          y1);
+
+void MotionComp_Compensate8x8H_SSE2(
+       UINT8*         p,
+       INT32          pitch,
+       const Plane_t* r,
+       INT32          x0,
+       INT32          y0,
+       INT32          x1,
+       INT32          y1);
+
+/* */
+
+void MotionComp_Block16x16_SSE2(
+       Plane_t*              p,
+       INT32                 x,
+       INT32                 y,
+       const Plane_t*        r,
+       const MotionVector_t* mv);
+
+void MotionComp_Block8x8Y_SSE2(
+       Plane_t*              p,
+       INT32                 x,
+       INT32                 y,
+       const Plane_t*        r,
+       const MotionVector_t* mv);
+
+void MotionComp_Block8x8C_SSE2(
+       Plane_t*              p,
+       INT32                 x,
+       INT32                 y,
+       const Plane_t*        r,
+       const MotionVector_t* mv);
+
+/* */
+
diff --git a/Lib/QTheoraEx/QTheoraEx.vcproj b/Lib/QTheoraEx/QTheoraEx.vcproj
index 275e3f8..fc350d8 100644
                                >
                        </File>
                        <File
+                               RelativePath=".\FrameReconstructor_SSE2.c"
+                               >
+                       </File>
+                       <File
                                RelativePath=".\MemoryPool.c"
                                >
                        </File>
                        <File
+                               RelativePath=".\MotionComp_SSE2.c"
+                               >
+                       </File>
+                       <File
                                RelativePath=".\SetupDecoder.c"
                                >
                                <FileConfiguration
                                >
                        </File>
                        <File
+                               RelativePath=".\MotionComp_SSE2.h"
+                               >
+                       </File>
+                       <File
                                RelativePath=".\QTheoraArch.h"
                                >
                        </File>