--- /dev/null
+/* FrameReconstructor_MMX.c */
+/* 2009/07/09 */
+
+#include "StdAfx.h"
+
+#include "FrameReconstructor.h"
+
+#include "MotionComp_MMX.h"
+
+/* */
+
+#pragma warning(disable : 4799)
+
+/* */
+
+/* Transpose an 8x8 INT16 matrix from x into y.
+   A row is 8 INT16 = two __m64 words: X[2*r] is the left half of row r,
+   X[2*r + 1] the right half.  The transpose is performed as four
+   independent 4x4 quadrant transposes (16-bit unpack, then 32-bit unpack).
+   NOTE(review): y must not alias x - the stores for the bottom-left
+   quadrant overwrite rows the next quadrant still reads. */
+static __inline void Transpose_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* X = (const __m64*)x;
+ __m64* Y = (__m64*)y;
+
+ __m64 t0, t1, t2, t3;
+ __m64 u0, u1, u2, u3;
+
+ /* Top-left 4x4 of x -> top-left of y */
+
+ t0 = X[2 * 0];
+ t1 = X[2 * 1];
+ t2 = X[2 * 2];
+ t3 = X[2 * 3];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 0] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 1] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 2] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 3] = _mm_unpackhi_pi32(u1, u3);
+
+ /* Top-right 4x4 of x -> bottom-left of y */
+
+ t0 = X[2 * 0 + 1];
+ t1 = X[2 * 1 + 1];
+ t2 = X[2 * 2 + 1];
+ t3 = X[2 * 3 + 1];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 4] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 5] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 6] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 7] = _mm_unpackhi_pi32(u1, u3);
+
+ /* Bottom-left 4x4 of x -> top-right of y */
+
+ t0 = X[2 * 4];
+ t1 = X[2 * 5];
+ t2 = X[2 * 6];
+ t3 = X[2 * 7];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 0 + 1] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 1 + 1] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 2 + 1] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 3 + 1] = _mm_unpackhi_pi32(u1, u3);
+
+ /* Bottom-right 4x4 of x -> bottom-right of y */
+
+ t0 = X[2 * 4 + 1];
+ t1 = X[2 * 5 + 1];
+ t2 = X[2 * 6 + 1];
+ t3 = X[2 * 7 + 1];
+
+ u0 = _mm_unpacklo_pi16(t0, t1);
+ u1 = _mm_unpackhi_pi16(t0, t1);
+ u2 = _mm_unpacklo_pi16(t2, t3);
+ u3 = _mm_unpackhi_pi16(t2, t3);
+
+ Y[2 * 4 + 1] = _mm_unpacklo_pi32(u0, u2);
+ Y[2 * 5 + 1] = _mm_unpackhi_pi32(u0, u2);
+ Y[2 * 6 + 1] = _mm_unpacklo_pi32(u1, u3);
+ Y[2 * 7 + 1] = _mm_unpackhi_pi32(u1, u3);
+}
+
+/* Pre-transpose every dequantization matrix (2 coding modes x 3 planes)
+   from the decoder into the reconstructor's layout used by the MMX iDCT. */
+void QT_UpdateDequantizeMatrix_MMX(
+ FrameDecoder_t* t)
+{
+ FrameReconstructor_SSE2_t* r = t->Reconstructor;
+
+ INT32 k;
+
+ /* Flattened walk over all (mode, plane) matrix pairs. */
+ for (k = 0; k < 2 * 3; k++) {
+ const INT32 mode = k / 3;
+ const INT32 plane = k % 3;
+
+ Transpose_MMX(t->Dequantize.Matrix[mode][plane], r->Matrix[mode][plane]);
+ }
+}
+
+/* */
+
+/* Copy an aligned 8x8 pixel block at (x, y) from reference plane r into
+   plane p.  All eight source rows are fetched before any row is stored,
+   preserving the original load/store schedule. */
+static __inline void Block_CopyPlane8x8_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ Plane_t* r)
+{
+ const UINT8* src = r->Plane + y * r->Pitch + x;
+ UINT8* dst = p->Plane + y * p->Pitch + x;
+
+ __m64 row[8];
+ INT32 i;
+
+ for (i = 0; i < 8; i++) {
+ row[i] = *((const __m64*)src);
+ src += r->Pitch;
+ }
+
+ for (i = 0; i < 8; i++) {
+ *((__m64*)dst) = row[i];
+ dst += p->Pitch;
+ }
+}
+
+/* Copy an aligned 16x16 pixel block at (x, y) from reference plane r into
+   plane p, four rows (two __m64 each) per loop iteration.  Loads are
+   batched ahead of stores within each iteration. */
+static __inline void Block_CopyPlane16x16_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ Plane_t* r)
+{
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ const UINT8* e = s + 16 * r->Pitch;
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ __m64 s00, s10, s20, s30;
+ __m64 s01, s11, s21, s31;
+
+ while (s < e) {
+ s00 = *((const __m64*)(s + 0));
+ s01 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s10 = *((const __m64*)(s + 0));
+ s11 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s20 = *((const __m64*)(s + 0));
+ s21 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s30 = *((const __m64*)(s + 0));
+ s31 = *((const __m64*)(s + 8)); s += r->Pitch;
+
+ *((__m64*)(d + 0)) = s00;
+ *((__m64*)(d + 8)) = s01; d += p->Pitch;
+ *((__m64*)(d + 0)) = s10;
+ *((__m64*)(d + 8)) = s11; d += p->Pitch;
+ *((__m64*)(d + 0)) = s20;
+ *((__m64*)(d + 8)) = s21; d += p->Pitch;
+ *((__m64*)(d + 0)) = s30;
+ *((__m64*)(d + 8)) = s31; d += p->Pitch;
+ }
+}
+
+/* */
+
+/* Intra-prediction bias: 128 is added to each iDCT output sample before
+   packing to unsigned 8-bit (see Block_CopyIntra8x8_MMX). */
+ALIGN(0x10) static const UINT16 IPRED[4] = {
+ 128, 128, 128, 128
+};
+
+/* Write an intra block: add the 128 bias to each of the 64 INT16 residuals
+   in c, saturate-pack to unsigned 8-bit, and store the resulting 8x8 pixel
+   block into plane p at (x, y).  c holds two __m64 (8 coefficients) per
+   row; each packed __m64 carries 4 pixels in its low 32 bits, stored via
+   _mm_cvtsi64_si32.  Fully unrolled, two rows per group. */
+static __inline void Block_CopyIntra8x8_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const INT16* c)
+{
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ const __m64* B = (const __m64*)IPRED;
+ const __m64* C = (const __m64*)c;
+
+ __m64 s00, s01, s10, s11;
+ const __m64 z = _mm_setzero_si64();
+
+ /* Rows 0-1 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 0 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 0 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 1 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 1 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11); d += p->Pitch;
+
+ /* Rows 2-3 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 2 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 2 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 3 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 3 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11); d += p->Pitch;
+
+ /* Rows 4-5 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 4 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 4 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 5 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 5 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11); d += p->Pitch;
+
+ /* Rows 6-7 */
+ s00 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 6 + 0], B[0]), z);
+ s01 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 6 + 1], B[0]), z);
+ s10 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 7 + 0], B[0]), z);
+ s11 = _mm_packs_pu16(_mm_adds_pi16(C[2 * 7 + 1], B[0]), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s00);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s01); d += p->Pitch;
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s10);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s11);
+}
+
+/* Add an 8x8 INT16 residual c to the already motion-compensated pixels in
+   plane p at (x, y), with unsigned-8 saturation on the way back out.
+   Each row: widen 8 pixels to 16-bit, saturating-add the residual, pack. */
+static __inline void Block_ReviseInter8x8_MMX(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const INT16* c)
+{
+ UINT8* d = p->Plane + y * p->Pitch + x;
+ UINT8* e = d + 8 * p->Pitch;
+
+ const __m64* C = (const __m64*)c;
+
+ __m64 b0, b1;
+ __m64 s0, s1;
+ const __m64 z = _mm_setzero_si64();
+
+ for (; d < e; d += p->Pitch, C += 2) {
+ /* Load 4+4 pixels and zero-extend to 16-bit lanes. */
+ b0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(d + 0))), z);
+ b1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(d + 4))), z);
+
+ s0 = _mm_packs_pu16(_mm_adds_pi16(C[0], b0), z);
+ s1 = _mm_packs_pu16(_mm_adds_pi16(C[1], b1), z);
+
+ *((UINT32*)(d + 0)) = _mm_cvtsi64_si32(s0);
+ *((UINT32*)(d + 4)) = _mm_cvtsi64_si32(s1);
+ }
+}
+
+/* */
+
+/* iDCT constants, 4 identical 16-bit lanes per entry.  Entry 0 is the +8
+   rounding bias for the final >>4; entries 1..7 are cosine factors in
+   0.16 unsigned fixed point (value = cos * 65536 - see MUL0/MUL1). */
+ALIGN(0x10) static const UINT16 COS_MMX[8][4] = {
+ { 8, 8, 8, 8 }, /* 0 */
+ { 64277, 64277, 64277, 64277 }, /* 1 */
+ { 60547, 60547, 60547, 60547 }, /* 2 */
+ { 54491, 54491, 54491, 54491 }, /* 3 */
+ { 46341, 46341, 46341, 46341 }, /* 4 */
+ { 36410, 36410, 36410, 36410 }, /* 5 */
+ { 25080, 25080, 25080, 25080 }, /* 6 */
+ { 12785, 12785, 12785, 12785 } /* 7 */
+};
+
+/* _mm_mulhi_pi16 is a signed multiply, so a constant >= 0x8000 acts as
+   (c - 65536); MUL1 adds X back to compensate, yielding X*c/65536.
+   MUL0 is the plain X*c/65536 for constants below 0x8000. */
+#define MUL1(T,X) _mm_add_pi16(_mm_mulhi_pi16(X, C[T]), X)
+#define MUL0(T,X) _mm_mulhi_pi16(X, C[T])
+
+/* */
+
+/* Row pass of the 8-point inverse DCT over a full 8x8 coefficient block.
+   Each loop iteration processes 4 columns (one __m64 lane set); two
+   iterations cover all 8 columns.  X/Y stride is 2 __m64 per row, so
+   X[2*r] addresses row r within the current 4-column slice.  No final
+   descale here - rounding and >>4 happen in the column pass. */
+static __inline void IDCT_R_8_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ const __m64* E = X + 2;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ for (; X < E; X++, Y++) {
+ /* Stage.1: even/odd butterflies scaled by the cosine table */
+
+ s0 = _mm_add_pi16(X[2 * 0], X[2 * 4]);
+ t0 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(X[2 * 0], X[2 * 4]);
+ t1 = MUL1(4, s0);
+
+ t2 = _mm_sub_pi16(MUL0(6, X[2 * 2]), MUL1(2, X[2 * 6]));
+ t3 = _mm_add_pi16(MUL1(2, X[2 * 2]), MUL0(6, X[2 * 6]));
+
+ t4 = _mm_sub_pi16(MUL0(7, X[2 * 1]), MUL1(1, X[2 * 7]));
+ t5 = _mm_sub_pi16(MUL1(3, X[2 * 5]), MUL1(5, X[2 * 3]));
+
+ t6 = _mm_add_pi16(MUL1(5, X[2 * 5]), MUL1(3, X[2 * 3]));
+ t7 = _mm_add_pi16(MUL1(1, X[2 * 1]), MUL0(7, X[2 * 7]));
+
+ /* Stage.2: rotate the odd pair through C4 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3: combine even halves */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4: final butterflies, write all 8 outputs */
+
+ Y[2 * 0] = _mm_add_pi16(t0, t7);
+ Y[2 * 1] = _mm_add_pi16(t1, t6);
+ Y[2 * 2] = _mm_add_pi16(t3, t2);
+ Y[2 * 3] = _mm_add_pi16(s0, t4);
+ Y[2 * 4] = _mm_sub_pi16(s0, t4);
+ Y[2 * 5] = _mm_sub_pi16(t3, t2);
+ Y[2 * 6] = _mm_sub_pi16(t1, t6);
+ Y[2 * 7] = _mm_sub_pi16(t0, t7);
+ }
+}
+
+/* Reduced row pass: same transform as IDCT_R_8_MMX but specialized for
+   blocks whose only nonzero coefficients sit in the top-left 4x4 region,
+   so rows 4-7 and the right column half are treated as zero (terms with
+   X[2*4..2*7] drop out of Stage.1) and only one 4-column slice is run. */
+static __inline void IDCT_R_8_4_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* Stage.1 (zero-input terms removed) */
+
+ t1 = t0 = MUL1(4, X[2 * 0]);
+
+ t2 = MUL0(6, X[2 * 2]);
+ t3 = MUL1(2, X[2 * 2]);
+
+ t4 = MUL0(7, X[2 * 1]);
+ t5 = _mm_sub_pi16(_mm_setzero_si64(), MUL1(5, X[2 * 3]));
+
+ t6 = MUL1(3, X[2 * 3]);
+ t7 = MUL1(1, X[2 * 1]);
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4: all 8 output rows (left halves) are written */
+
+ Y[2 * 0] = _mm_add_pi16(t0, t7);
+ Y[2 * 1] = _mm_add_pi16(t1, t6);
+ Y[2 * 2] = _mm_add_pi16(t3, t2);
+ Y[2 * 3] = _mm_add_pi16(s0, t4);
+ Y[2 * 4] = _mm_sub_pi16(s0, t4);
+ Y[2 * 5] = _mm_sub_pi16(t3, t2);
+ Y[2 * 6] = _mm_sub_pi16(t1, t6);
+ Y[2 * 7] = _mm_sub_pi16(t0, t7);
+}
+
+/* Column pass of the 8-point inverse DCT (input is the transposed row-pass
+   output).  Identical butterfly network to IDCT_R_8_MMX; the difference is
+   the final descale: each output gets the +8 rounding bias (C[0]) and an
+   arithmetic shift right by 4. */
+static __inline void IDCT_C_8_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ const __m64* E = X + 2;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ for (; X < E; X++, Y++) {
+ /* Stage.1 */
+
+ s0 = _mm_add_pi16(X[2 * 0], X[2 * 4]);
+ t0 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(X[2 * 0], X[2 * 4]);
+ t1 = MUL1(4, s0);
+
+ t2 = _mm_sub_pi16(MUL0(6, X[2 * 2]), MUL1(2, X[2 * 6]));
+ t3 = _mm_add_pi16(MUL1(2, X[2 * 2]), MUL0(6, X[2 * 6]));
+
+ t4 = _mm_sub_pi16(MUL0(7, X[2 * 1]), MUL1(1, X[2 * 7]));
+ t5 = _mm_sub_pi16(MUL1(3, X[2 * 5]), MUL1(5, X[2 * 3]));
+
+ t6 = _mm_add_pi16(MUL1(5, X[2 * 5]), MUL1(3, X[2 * 3]));
+ t7 = _mm_add_pi16(MUL1(1, X[2 * 1]), MUL0(7, X[2 * 7]));
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4: butterflies plus (x + 8) >> 4 descale */
+
+ Y[2 * 0] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t0, t7), C[0]), 4);
+ Y[2 * 1] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t1, t6), C[0]), 4);
+ Y[2 * 2] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t3, t2), C[0]), 4);
+ Y[2 * 3] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(s0, t4), C[0]), 4);
+ Y[2 * 4] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(s0, t4), C[0]), 4);
+ Y[2 * 5] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t3, t2), C[0]), 4);
+ Y[2 * 6] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t1, t6), C[0]), 4);
+ Y[2 * 7] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t0, t7), C[0]), 4);
+ }
+}
+
+/* Reduced column pass matching IDCT_R_8_4_MMX: only rows 0-3 of the
+   transposed intermediate are nonzero, so the Stage.1 terms that read
+   X[2*4..2*7] drop out.  Both 4-column slices still run, and the final
+   (x + 8) >> 4 descale is applied as in the full column pass. */
+static __inline void IDCT_C_8_4_MMX(
+ const INT16* x,
+ INT16* y)
+{
+ const __m64* C = (const __m64*)COS_MMX[0];
+ const __m64* X = (const __m64*)x;
+ const __m64* E = X + 2;
+ __m64* Y = (__m64*)y;
+
+ __m64 s0;
+ __m64 t0, t1, t2, t3, t4, t5, t6, t7;
+
+ for (; X < E; X++, Y++) {
+ /* Stage.1 (zero-input terms removed) */
+
+ t1 = t0 = MUL1(4, X[2 * 0]);
+
+ t2 = MUL0(6, X[2 * 2]);
+ t3 = MUL1(2, X[2 * 2]);
+
+ t4 = MUL0(7, X[2 * 1]);
+ t5 = _mm_sub_pi16(_mm_setzero_si64(), MUL1(5, X[2 * 3]));
+
+ t6 = MUL1(3, X[2 * 3]);
+ t7 = MUL1(1, X[2 * 1]);
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_pi16(t4, t5);
+ t4 = _mm_add_pi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_pi16(t7, t6);
+ t7 = _mm_add_pi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_pi16(t0, t3);
+ t0 = _mm_add_pi16(t0, t3);
+
+ t3 = _mm_sub_pi16(t1, t2);
+ t1 = _mm_add_pi16(t1, t2);
+
+ t2 = _mm_sub_pi16(t6, t5);
+ t6 = _mm_add_pi16(t6, t5);
+
+ /* Stage.4 */
+
+ Y[2 * 0] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t0, t7), C[0]), 4);
+ Y[2 * 1] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t1, t6), C[0]), 4);
+ Y[2 * 2] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t3, t2), C[0]), 4);
+ Y[2 * 3] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(s0, t4), C[0]), 4);
+ Y[2 * 4] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(s0, t4), C[0]), 4);
+ Y[2 * 5] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t3, t2), C[0]), 4);
+ Y[2 * 6] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t1, t6), C[0]), 4);
+ Y[2 * 7] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t0, t7), C[0]), 4);
+ }
+}
+
+/* */
+
+/* De-zig-zag table: TZZ[pos] is the index in the zig-zag-ordered coefficient
+   stream that lands at natural (row-major) position pos, i.e.
+   natural[pos] = zigzag[TZZ[pos]]. */
+static const UINT8 TZZ[64] = {
+ 0, 2, 3, 9, 10, 20, 21, 35,
+ 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48,
+ 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57,
+ 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62,
+ 28, 42, 43, 53, 54, 60, 61, 63
+};
+
+/* Full 8x8 reconstruction: de-zig-zag the coefficient block, multiply
+   element-wise by the (pre-transposed) dequantization matrix, then run
+   row iDCT -> transpose -> column iDCT.  Result lands in coeff; c0 is
+   scratch.  The 16-bit _mm_mullo_pi16 dequantize wraps on overflow, which
+   matches the reference 16-bit arithmetic. */
+static __inline void DequantizeIDCT8x8_MMX(
+ const INT16* block,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ ALIGN(0x10) INT16 c0[64];
+
+ { /* Reorder: natural[i] = block[TZZ[i]] */
+ const UINT8* t = TZZ;
+
+ INT16* c = c0;
+ INT16* e = c + 64;
+ for (; c < e; c += 8, t += 8) {
+ c[0] = block[t[0]];
+ c[1] = block[t[1]];
+ c[2] = block[t[2]];
+ c[3] = block[t[3]];
+ c[4] = block[t[4]];
+ c[5] = block[t[5]];
+ c[6] = block[t[6]];
+ c[7] = block[t[7]];
+ }
+ }
+
+ { /* Dequantize: 64 element-wise 16-bit multiplies */
+ __m64* d = (__m64*) c0;
+ __m64* e = (__m64*)(c0 + 64);
+ const __m64* m = (const __m64*)matrix;
+
+ for (; d < e; d += 4, m += 4) {
+ d[0] = _mm_mullo_pi16(d[0], m[0]);
+ d[1] = _mm_mullo_pi16(d[1], m[1]);
+ d[2] = _mm_mullo_pi16(d[2], m[2]);
+ d[3] = _mm_mullo_pi16(d[3], m[3]);
+ }
+ }
+
+ /* iDCT Row */
+ IDCT_R_8_MMX(c0, coeff);
+
+ /* Transpose */
+ Transpose_MMX(coeff, c0);
+
+ /* iDCT Colum */
+ IDCT_C_8_MMX(c0, coeff);
+}
+
+/* */
+
+/* Sparse-block reconstruction for at most the first 10 zig-zag
+   coefficients: only the 4/3/2/1 triangle in the top-left 4x4 can be
+   nonzero, so just rows 0-3 of c0 are zeroed and scattered into, the
+   dequantize touches only the left half of those rows, and the reduced
+   iDCT variants are used. */
+static __inline void DequantizeIDCT8x8_16_MMX(
+ const INT16* block,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ ALIGN(0x10) INT16 c0[64];
+
+ const __m64 z = _mm_setzero_si64();
+
+ /* Clear rows 0-3 (the only rows written below). */
+ *((__m64*)(c0 + 0x00)) = z;
+ *((__m64*)(c0 + 0x08)) = z;
+ *((__m64*)(c0 + 0x10)) = z;
+ *((__m64*)(c0 + 0x18)) = z;
+
+ /* Reorder: de-zig-zag the 10-coefficient triangle */
+ c0[ 0 + 0] = block[TZZ[ 0 + 0]];
+ c0[ 0 + 1] = block[TZZ[ 0 + 1]];
+ c0[ 0 + 2] = block[TZZ[ 0 + 2]];
+ c0[ 0 + 3] = block[TZZ[ 0 + 3]];
+
+ c0[ 8 + 0] = block[TZZ[ 8 + 0]];
+ c0[ 8 + 1] = block[TZZ[ 8 + 1]];
+ c0[ 8 + 2] = block[TZZ[ 8 + 2]];
+
+ c0[16 + 0] = block[TZZ[16 + 0]];
+ c0[16 + 1] = block[TZZ[16 + 1]];
+
+ c0[24 + 0] = block[TZZ[24 + 0]];
+
+ { /* Dequantize: left 4 coefficients of rows 0-3 only */
+ const __m64* m = (const __m64*)matrix;
+ __m64* d = (__m64*)c0;
+
+ d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
+ d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
+ d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
+ d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
+ }
+
+ /* iDCT Row (reduced) */
+ IDCT_R_8_4_MMX(c0, coeff);
+
+ /* Transpose */
+ Transpose_MMX(coeff, c0);
+
+ /* iDCT Colum (reduced) */
+ IDCT_C_8_4_MMX(c0, coeff);
+}
+
+/* */
+
+/* DC-only shortcut: when just the DC coefficient is coded, the whole iDCT
+   collapses to a flat block - every one of the 64 outputs equals the
+   dequantized, descaled DC term ((dc * q0) + 15) >> 5. */
+static __inline void DequantizeIDCT8x8_0_MMX(
+ INT16 dc,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ const __m64 v = _mm_set1_pi16(((dc * matrix[0]) + 15) >> 5);
+
+ __m64* d = (__m64*)coeff;
+ __m64* e = d + 16;
+
+ while (d < e) {
+ d[0] = v;
+ d[1] = v;
+ d[2] = v;
+ d[3] = v;
+ d += 4;
+ }
+}
+
+/* */
+
+/* Per-coefficient-position decode cursor: the run/coefficient token stream
+   for one zig-zag position, plus the pending end-of-block run count. */
+struct DecodeCoefficientsLeaf {
+
+ INT32 EOB_Run; /* blocks still to terminate immediately at this position */
+
+ INT8* Run; /* zero-run lengths; a negative run marks an EOB token */
+ INT16* Coeff; /* coefficient values (or EOB-run count for EOB tokens) */
+
+}; /* DecodeCoefficientsLeaf */
+
+typedef struct DecodeCoefficientsLeaf DecodeCoefficientsLeaf_t;
+
+/* One cursor per coefficient position (0..63) of the current plane. */
+struct DecodeCoefficientsContext {
+
+ DecodeCoefficientsLeaf_t Leaf[64];
+
+}; /* DecodeCoefficientsContext */
+
+typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
+
+/* Fill one 64-entry coefficient block (zig-zag order) from the per-position
+   token streams in ctx.  The block is zeroed first, then run/value pairs
+   are consumed from the leaf matching the current position; a negative run
+   is an EOB token whose coeff is the EOB-run count.  A pending EOB_Run
+   terminates the block immediately.  Returns the number of positions
+   consumed (index just past the last written coefficient) - used by the
+   callers to pick a full, reduced, or DC-only iDCT. */
+static INT32 DecodeCoefficients_MMX(
+ FrameDecoder_t* t,
+ DecodeCoefficientsContext_t* ctx,
+ INT16* block)
+{
+ INT16* b = block;
+ INT16* e = b + 64;
+
+ DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
+
+ const __m64 z = _mm_setzero_si64();
+
+ for (; b < e; b += 16) {
+ *((__m64*)(b + 0)) = z;
+ *((__m64*)(b + 4)) = z;
+ *((__m64*)(b + 8)) = z;
+ *((__m64*)(b + 12)) = z;
+ }
+
+ b = block;
+
+ while (b < e) {
+ if (leaf->EOB_Run > 0) {
+ /* This block ends here; consume one unit of the EOB run. */
+ leaf->EOB_Run -= 1;
+ break;
+
+ } else {
+ INT32 run = *((leaf->Run )++);
+ INT32 coeff = *((leaf->Coeff)++);
+
+ if (run < 0) {
+ /* EOB token: coeff carries the run of end-of-blocks. */
+ leaf->EOB_Run = coeff;
+
+ } else {
+ b += run;
+ if (b >= e) {
+ break;
+ }
+
+ *(b++) = coeff;
+
+ /* Continue with the stream for the new position. */
+ leaf = ctx->Leaf + (b - block);
+ }
+ }
+ }
+
+ return b - block;
+}
+
+/* */
+
+/* Reconstruct one intra 8x8 block of plane p at (x, y).
+   NOT_CODED blocks are copied straight from the reference frame r.
+   Otherwise the coefficients are decoded and the iDCT variant is chosen
+   by how many positions were consumed: > 10 full, > 1 reduced (4x4
+   triangle), else DC-only; the result is biased by 128 and stored. */
+static void Reconstruct_IntraBlock(
+ FrameDecoder_t* t,
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ INT16 dc,
+ INT32 plane,
+ Plane_t* r,
+ DecodeCoefficientsContext_t* ctx)
+{
+ ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 coeff[64];
+
+ const INT16 (*mat)[64] = t->Reconstructor->Matrix[0]; /* intra matrices */
+
+ INT32 cs;
+
+ if (dc == NOT_CODED) {
+ Block_CopyPlane8x8_MMX(p, x, y, r);
+ return;
+ }
+
+ cs = DecodeCoefficients_MMX(t, ctx, block);
+
+ if (cs > 10) {
+ block[0] = dc;
+ DequantizeIDCT8x8_MMX(block, mat[plane], coeff);
+
+ } else if (cs > 1) {
+ block[0] = dc;
+ DequantizeIDCT8x8_16_MMX(block, mat[plane], coeff);
+
+ } else {
+ DequantizeIDCT8x8_0_MMX(dc, mat[plane], coeff);
+ }
+
+ Block_CopyIntra8x8_MMX(p, x, y, coeff);
+}
+
+/* */
+
+/* Reconstruct one inter 8x8 block: add the decoded residual on top of the
+   motion-compensated pixels already present in plane p.  NOT_CODED blocks
+   are copied from r when a reference is supplied (r == NULL when the
+   caller already wrote the prediction, e.g. the whole-MB copy paths).
+   iDCT variant selection matches Reconstruct_IntraBlock. */
+static void Reconstruct_InterBlock(
+ FrameDecoder_t* t,
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ INT16 dc,
+ INT32 plane,
+ Plane_t* r,
+ DecodeCoefficientsContext_t* ctx)
+{
+ ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 coeff[64];
+
+ const INT16 (*mat)[64] = t->Reconstructor->Matrix[1]; /* inter matrices */
+
+ INT32 cs;
+
+ if (dc == NOT_CODED) {
+ if (r != NULL) {
+ Block_CopyPlane8x8_MMX(p, x, y, r);
+ }
+ return;
+ }
+
+ cs = DecodeCoefficients_MMX(t, ctx, block);
+
+ if (cs > 10) {
+ block[0] = dc;
+ DequantizeIDCT8x8_MMX(block, mat[plane], coeff);
+
+ } else if (cs > 1) {
+ block[0] = dc;
+ DequantizeIDCT8x8_16_MMX(block, mat[plane], coeff);
+
+ } else {
+ DequantizeIDCT8x8_0_MMX(dc, mat[plane], coeff);
+ }
+
+ Block_ReviseInter8x8_MMX(p, x, y, coeff);
+}
+
+/* */
+
+/* Pixel offsets of the 16 8x8 blocks of a 32x32 superblock, in coding
+   order (presumably the VP3-style Hilbert traversal - verify against the
+   bitstream spec). */
+static const INT8 S_PX[16] = {
+ 0*8, 1*8, 1*8, 0*8,
+ 0*8, 0*8, 1*8, 1*8,
+ 2*8, 2*8, 3*8, 3*8,
+ 3*8, 2*8, 2*8, 3*8
+};
+
+static const INT8 S_PY[16] = {
+ 0*8, 0*8, 1*8, 1*8,
+ 2*8, 3*8, 3*8, 2*8,
+ 2*8, 3*8, 3*8, 2*8,
+ 1*8, 1*8, 0*8, 0*8
+};
+
+/* Pixel offsets of the 4 16x16 macroblocks within a 32x32 superblock. */
+static const INT8 M_PX[4] = {
+ 0*16, 0*16,
+ 1*16, 1*16
+};
+
+static const INT8 M_PY[4] = {
+ 0*16, 1*16,
+ 1*16, 0*16
+};
+
+/* */
+
+/* Reconstruct the luma plane.  Walks 32x32 superblocks, then the 4
+   macroblocks of each superblock (M_PX/M_PY order), dispatching on the
+   macroblock coding mode.  g = golden frame, p = current frame,
+   r = last frame.  bi/mm/mv advance only for in-bounds macroblocks -
+   their streams contain no entries for clipped positions. */
+static void Reconstruct_YPlane_MMX(
+ FrameDecoder_t* t)
+{
+ INT32 x, y;
+
+ INT32 sx = t->Index->SX[0] * 32; /* plane extent in superblock pixels */
+ INT32 sy = t->Index->SY[0] * 32;
+
+ INT32 mx = t->Index->MX * 16; /* plane extent in macroblock pixels */
+ INT32 my = t->Index->MY * 16;
+
+ INT32 bx = t->Index->BX[0]; /* blocks per row (for DC indexing) */
+
+ const UINT16* bi = t->Index->BIndex[0];
+
+ Plane_t* g = t->Frame[0];
+ Plane_t* p = t->Frame[1];
+ Plane_t* r = t->Frame[2];
+
+ const UINT8* mm = t->MBMode;
+ const MotionVector_t* mv = t->MV;
+
+ ALIGN(0x10) DecodeCoefficientsContext_t ctx = { 0 };
+
+ /* Attach the luma run/coefficient streams to the decode context. */
+ INT32 i;
+ for (i = 0; i < 64; i++) {
+ ctx.Leaf[i].Run = t->BRun [0][i];
+ ctx.Leaf[i].Coeff = t->BCoeff[0][i];
+ }
+
+ for (y = 0; y < sy; y += 32) {
+ for (x = 0; x < sx; x += 32) {
+ INT32 i = 0; /* block-order index within the superblock */
+
+ INT32 m;
+ for (m = 0; m < 4; m++, i += 4) {
+ INT32 x0 = x + M_PX[m];
+ INT32 y0 = y + M_PY[m];
+ if (x0 < mx && y0 < my) {
+ switch (*mm) {
+ case 0: /* INTER_NOMV: prediction is a straight copy of last frame */
+ Block_CopyPlane16x16_MMX(p, x0, y0, r);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, NULL, &ctx);
+ break;
+
+ case 1: /* INTRA */
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 2: /* INTER_MV */
+ case 3: /* INTER_MV_LAST */
+ case 4: /* INTER_MV_LAST2: one vector for the whole macroblock */
+ MotionComp_Block16x16_MMX(p, x0, y0, r, mv);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 5: /* INTER_GOLDEN_NOMV: copy from the golden frame */
+ Block_CopyPlane16x16_MMX(p, x0, y0, g);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 6: /* INTER_GOLDEN_MV */
+ MotionComp_Block16x16_MMX(p, x0, y0, g, mv);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 7: /* INTER_MV_FOUR: one vector per coded 8x8 sub-block */
+ {
+ const MotionVector_t* v = mv;
+
+ const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx;
+
+ /* Vectors are stored only for coded sub-blocks; v advances
+ with each one consumed. */
+ if (dc[0] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 0, y0 + 0, r, v++);
+ }
+
+ if (dc[1] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 8, y0 + 0, r, v++);
+ }
+
+ if (dc[0 + bx] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 0, y0 + 8, r, v++);
+ }
+
+ if (dc[1 + bx] != NOT_CODED) {
+ MotionComp_Block8x8Y_MMX(p, x0 + 8, y0 + 8, r, v++);
+ }
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+ }
+
+ } /* switch */
+
+ /* Advance per-macroblock streams (only for in-bounds MBs). */
+ bi += 4;
+ mm += 1;
+ mv += 4;
+ }
+ }
+ }
+ }
+}
+
+/* */
+
+/* Reconstruct both chroma planes (p+1 = Cb, p+2 = Cr, and likewise for the
+   golden/last frame plane arrays).  Chroma blocks are visited in the same
+   superblock/block order as luma but each 8x8 block reads the macroblock
+   mode from the block-granular mode map m[idx].  Separate coefficient
+   contexts are kept per chroma plane. */
+static void Reconstruct_CPlane_MMX(
+ FrameDecoder_t* t)
+{
+ INT32 x, y;
+
+ INT32 sx = t->Index->SX[1] * 32;
+ INT32 sy = t->Index->SY[1] * 32;
+
+ INT32 mx = t->Index->MX * 8; /* chroma is half luma resolution */
+ INT32 my = t->Index->MY * 8;
+
+ INT32 bx = t->Index->BX[1];
+
+ const UINT16* bi = t->Index->BIndex[1];
+
+ Plane_t* g = t->Frame[0];
+ Plane_t* p = t->Frame[1];
+ Plane_t* r = t->Frame[2];
+
+ /* DC arrays for the two chroma planes follow the luma DCs. */
+ const INT16* DC0 = t->DC + t->Index->BC[0];
+ const INT16* DC1 = DC0 + t->Index->BC[1];
+
+ const UINT8* m = t->BMode + t->Index->BC[0];
+
+ ALIGN(0x10) DecodeCoefficientsContext_t ctx[2] = { 0 };
+
+ INT32 i;
+ for (i = 0; i < 64; i++) {
+ ctx[0].Leaf[i].Run = t->BRun [1][i];
+ ctx[0].Leaf[i].Coeff = t->BCoeff[1][i];
+
+ ctx[1].Leaf[i].Run = t->BRun [2][i];
+ ctx[1].Leaf[i].Coeff = t->BCoeff[2][i];
+ }
+
+ for (y = 0; y < sy; y += 32) {
+ for (x = 0; x < sx; x += 32) {
+ INT32 i;
+ for (i = 0; i < 16; i++) {
+ INT32 xx = x + S_PX[i];
+ INT32 yy = y + S_PY[i];
+
+ if (xx < mx && yy < my) {
+ INT32 idx = (xx >> 3) + (yy >> 3) * bx;
+
+ switch (m[idx]) {
+ case 0: /* INTER_NOMV */
+ Block_CopyPlane8x8_MMX(p + 1, xx, yy, r + 1);
+ Block_CopyPlane8x8_MMX(p + 2, xx, yy, r + 2);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, NULL, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, NULL, ctx + 1);
+ break;
+
+ case 1: /* INTRA */
+ Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 2: /* INTER_MV */
+ case 3: /* INTER_MV_LAST */
+ case 4: /* INTER_MV_LAST2 */
+ MotionComp_Block8x8C_MMX(p + 1, xx, yy, r + 1, t->MVC + idx);
+ MotionComp_Block8x8C_MMX(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 5: /* INTER_GOLDEN_NOMV */
+ Block_CopyPlane8x8_MMX(p + 1, xx, yy, g + 1);
+ Block_CopyPlane8x8_MMX(p + 2, xx, yy, g + 2);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 6: /* INTER_GOLDEN_MV */
+ MotionComp_Block8x8C_MMX(p + 1, xx, yy, g + 1, t->MVC + idx);
+ MotionComp_Block8x8C_MMX(p + 2, xx, yy, g + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 7: /* INTER_MV_FOUR: chroma uses the per-block averaged vector MVC */
+ MotionComp_Block8x8C_MMX(p + 1, xx, yy, r + 1, t->MVC + idx);
+ MotionComp_Block8x8C_MMX(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ } /* switch */
+ }
+ }
+ }
+ }
+}
+
+/* */
+
+/* Public entry point: reconstruct the luma then chroma planes and run the
+   in-loop filter when enabled.
+   Warning C4799 ("no EMMS") is disabled file-wide, so the MMX state must
+   be cleared explicitly before returning - otherwise any x87 FPU code in
+   the caller operates on a corrupted register stack. */
+void QT_ReconstructFrame_MMX(
+ FrameDecoder_t* t)
+{
+ Reconstruct_YPlane_MMX(t);
+
+ Reconstruct_CPlane_MMX(t);
+
+ if (t->Filter.Limit > 0) {
+ QT_FrameLoopFilter(t);
+ }
+
+ /* Leave the FPU usable for callers (EMMS; addresses the 4799 hazard). */
+ _mm_empty();
+}
+
+/* */
+
--- /dev/null
+/* MotionComp_MMX.c */
+/* 2009/07/09 */
+
+#include "StdAfx.h"
+
+#include "MotionComp_MMX.h"
+
+/* */
+
+#pragma warning(disable : 4799)
+
+/* */
+
+/* Extract an 8x8 block at (x, y) from plane, replicating edge pixels when
+   the block hangs over any plane boundary (out-of-bounds motion
+   references).  Strategy: build a padded picture in the 16x16 scratch r
+   (pitch 16) - corner fill, edge-column fill, edge-row fill, then the
+   clamped 8x8 source - and finally copy the correctly shifted 8x8 window
+   of r into block. */
+void Block_Extract8x8_MMX(
+ const Plane_t* plane,
+ INT32 x,
+ INT32 y,
+ UINT8* block,
+ INT32 pitch)
+{
+ ALIGN(0x10) UINT8 r[64 * 4];
+
+ /* Clamp the source origin inside the plane. */
+ INT32 xx = (x < 0) ? 0 : ((x + 8 >= plane->CX) ? plane->CX - 8 : x);
+ INT32 yy = (y < 0) ? 0 : ((y + 8 >= plane->CY) ? plane->CY - 8 : y);
+
+ const UINT8* ss = plane->Plane + yy * plane->Pitch + xx;
+
+ /* Overhang flags: 2 = off the low edge, 1 = off the high edge. */
+ INT32 xf = ((x < 0) << 1) | (x + 8 >= plane->CX);
+ INT32 yf = ((y < 0) << 1) | (y + 8 >= plane->CY);
+
+ INT32 xy = (xf << 2) | yf;
+
+ UINT8* rr = r;
+
+ { /* Corner fill: replicate the nearest corner pixel into the
+ diagonal 8x8 quadrant of r. */
+ const UINT8* s = NULL;
+ UINT8* d = NULL;
+
+ switch (xy) {
+ case 10: /* 10 10: top-left overhang */
+ s = ss;
+ d = r;
+ break;
+
+ case 6: /* 01 10: top-right overhang */
+ s = ss + 7;
+ d = r + 8;
+ break;
+
+ case 9: /* 10 01: bottom-left overhang */
+ s = ss + 7 * plane->Pitch;
+ d = r + 8 * 16;
+ break;
+
+ case 5: /* 01 01: bottom-right overhang */
+ s = ss + 7 * plane->Pitch + 7;
+ d = r + 8 * 16 + 8;
+ break;
+ }
+
+ if (d != NULL) {
+ __m64 pix = _mm_set1_pi8(s[0]);
+ *((__m64*)(d + 16 * 0)) = pix;
+ *((__m64*)(d + 16 * 1)) = pix;
+ *((__m64*)(d + 16 * 2)) = pix;
+ *((__m64*)(d + 16 * 3)) = pix;
+ *((__m64*)(d + 16 * 4)) = pix;
+ *((__m64*)(d + 16 * 5)) = pix;
+ *((__m64*)(d + 16 * 6)) = pix;
+ *((__m64*)(d + 16 * 7)) = pix;
+ }
+ }
+
+ { /* Edge fills: replicate the boundary column/row into the
+ neighboring quadrant; rr tracks where the real pixels go. */
+ const UINT8* sx = NULL;
+ UINT8* dx = r;
+
+ const UINT8* sy = NULL;
+ UINT8* dy = r;
+
+ if (xf == 2) { /* off the left edge */
+ sx = ss;
+ dy += 8;
+ rr += 8;
+ } else if (xf == 1) { /* off the right edge */
+ sx = ss + 7;
+ dx += 8;
+ }
+
+ if (yf == 2) { /* off the top edge */
+ sy = ss;
+ dx += 64 * 2;
+ rr += 64 * 2;
+ } else if (yf == 1) { /* off the bottom edge */
+ sy = ss + 7 * plane->Pitch;
+ dy += 64 * 2;
+ }
+
+ if (sx != NULL) { /* replicate a column across 8 rows */
+ *((__m64*)(dx + 16 * 0)) = _mm_set1_pi8(sx[0 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 1)) = _mm_set1_pi8(sx[1 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 2)) = _mm_set1_pi8(sx[2 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 3)) = _mm_set1_pi8(sx[3 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 4)) = _mm_set1_pi8(sx[4 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 5)) = _mm_set1_pi8(sx[5 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 6)) = _mm_set1_pi8(sx[6 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 7)) = _mm_set1_pi8(sx[7 * plane->Pitch]);
+ }
+
+ if (sy != NULL) { /* replicate a row down 8 rows */
+ __m64 pix = *((const __m64*)sy);
+ *((__m64*)(dy + 16 * 0)) = pix;
+ *((__m64*)(dy + 16 * 1)) = pix;
+ *((__m64*)(dy + 16 * 2)) = pix;
+ *((__m64*)(dy + 16 * 3)) = pix;
+ *((__m64*)(dy + 16 * 4)) = pix;
+ *((__m64*)(dy + 16 * 5)) = pix;
+ *((__m64*)(dy + 16 * 6)) = pix;
+ *((__m64*)(dy + 16 * 7)) = pix;
+ }
+ }
+
+ /* Paste the clamped real 8x8 source into its quadrant of r. */
+ *((__m64*)(rr + 16 * 0)) = *((const __m64*)(ss + 0 * plane->Pitch));
+ *((__m64*)(rr + 16 * 1)) = *((const __m64*)(ss + 1 * plane->Pitch));
+ *((__m64*)(rr + 16 * 2)) = *((const __m64*)(ss + 2 * plane->Pitch));
+ *((__m64*)(rr + 16 * 3)) = *((const __m64*)(ss + 3 * plane->Pitch));
+ *((__m64*)(rr + 16 * 4)) = *((const __m64*)(ss + 4 * plane->Pitch));
+ *((__m64*)(rr + 16 * 5)) = *((const __m64*)(ss + 5 * plane->Pitch));
+ *((__m64*)(rr + 16 * 6)) = *((const __m64*)(ss + 6 * plane->Pitch));
+ *((__m64*)(rr + 16 * 7)) = *((const __m64*)(ss + 7 * plane->Pitch));
+
+ /* Shift the output window by the (clamped) overhang amount. */
+ if (x < 0) {
+ if (x <= -8) x = -8;
+ rr += x;
+ } else if (x > plane->CX - 8) {
+ x -= plane->CX - 8;
+ if (x >= 8) x = 8;
+ rr += x;
+ }
+
+ if (y < 0) {
+ if (y <= -8) y = -8;
+ rr += y * 16;
+ } else if (y > plane->CY - 8) {
+ y -= plane->CY - 8;
+ if (y >= 8) y = 8;
+ rr += y * 16;
+ }
+
+ /* Copy the selected 8x8 window of r to the destination block. */
+ *((__m64*)(block + 0 * pitch)) = *((const __m64*)(rr + 16 * 0));
+ *((__m64*)(block + 1 * pitch)) = *((const __m64*)(rr + 16 * 1));
+ *((__m64*)(block + 2 * pitch)) = *((const __m64*)(rr + 16 * 2));
+ *((__m64*)(block + 3 * pitch)) = *((const __m64*)(rr + 16 * 3));
+ *((__m64*)(block + 4 * pitch)) = *((const __m64*)(rr + 16 * 4));
+ *((__m64*)(block + 5 * pitch)) = *((const __m64*)(rr + 16 * 5));
+ *((__m64*)(block + 6 * pitch)) = *((const __m64*)(rr + 16 * 6));
+ *((__m64*)(block + 7 * pitch)) = *((const __m64*)(rr + 16 * 7));
+}
+
+/* */
+
+/* Forward declarations: whole-pel (plain copy) and half-pel (two-tap
+   average of the (x0,y0)/(x1,y1) references) compensation kernels. */
+static void MotionComp_Compensate16x16_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y);
+
+static void MotionComp_Compensate8x8_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y);
+
+static void MotionComp_Compensate16x16H_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x0,
+ INT32 y0,
+ INT32 x1,
+ INT32 y1);
+
+static void MotionComp_Compensate8x8H_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x0,
+ INT32 y0,
+ INT32 x1,
+ INT32 y1);
+
+/* */
+
+/* Whole-pel 16x16 compensation: fast copy when the source rectangle is
+   fully inside the reference plane; otherwise fall back to four 8x8
+   compensations, which handle edge replication individually. */
+void MotionComp_Compensate16x16_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y)
+{
+ if (x >= 0 && x + 16 < r->CX &&
+ y >= 0 && y + 16 < r->CY) {
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ const UINT8* e = s + 16 * r->Pitch;
+ UINT8* d = p;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ /* Four rows per iteration; loads batched ahead of stores. */
+ while (s < e) {
+ s0 = *((const __m64*)(s + 0));
+ s1 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s2 = *((const __m64*)(s + 0));
+ s3 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s4 = *((const __m64*)(s + 0));
+ s5 = *((const __m64*)(s + 8)); s += r->Pitch;
+ s6 = *((const __m64*)(s + 0));
+ s7 = *((const __m64*)(s + 8)); s += r->Pitch;
+
+ *((__m64*)(d + 0)) = s0;
+ *((__m64*)(d + 8)) = s1; d += pitch;
+ *((__m64*)(d + 0)) = s2;
+ *((__m64*)(d + 8)) = s3; d += pitch;
+ *((__m64*)(d + 0)) = s4;
+ *((__m64*)(d + 8)) = s5; d += pitch;
+ *((__m64*)(d + 0)) = s6;
+ *((__m64*)(d + 8)) = s7; d += pitch;
+ }
+
+ } else {
+ MotionComp_Compensate8x8_MMX(p, pitch, r, x, y );
+ MotionComp_Compensate8x8_MMX(p + 8, pitch, r, x + 8, y );
+ MotionComp_Compensate8x8_MMX(p + 8 * pitch, pitch, r, x, y + 8);
+ MotionComp_Compensate8x8_MMX(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
+ }
+}
+
+/* Whole-pel 8x8 compensation: edge-replicating extraction when the source
+   block crosses a plane boundary, otherwise a straight load-all/store-all
+   copy. */
+void MotionComp_Compensate8x8_MMX(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y)
+{
+ if (x < 0 || x + 8 >= r->CX ||
+ y < 0 || y + 8 >= r->CY) {
+ Block_Extract8x8_MMX(r, x, y, p, pitch);
+
+ } else {
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ INT32 p0 = r->Pitch;
+ UINT8* d = p;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ s0 = *((const __m64*)s); s += p0;
+ s1 = *((const __m64*)s); s += p0;
+ s2 = *((const __m64*)s); s += p0;
+ s3 = *((const __m64*)s); s += p0;
+ s4 = *((const __m64*)s); s += p0;
+ s5 = *((const __m64*)s); s += p0;
+ s6 = *((const __m64*)s); s += p0;
+ s7 = *((const __m64*)s);
+
+ *((__m64*)d) = s0; d += pitch;
+ *((__m64*)d) = s1; d += pitch;
+ *((__m64*)d) = s2; d += pitch;
+ *((__m64*)d) = s3; d += pitch;
+ *((__m64*)d) = s4; d += pitch;
+ *((__m64*)d) = s5; d += pitch;
+ *((__m64*)d) = s6; d += pitch;
+ *((__m64*)d) = s7;
+ }
+}
+
+/* */
+
+ALIGN(0x10) static const UINT8 MASK_FE[8] = { 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe };
+
+void MotionComp_Compensate16x16H_MMX(
+    UINT8* p,
+    INT32 pitch,
+    const Plane_t* r,
+    INT32 x0,
+    INT32 y0,
+    INT32 x1,
+    INT32 y1)
+{
+    /* Half-pel 16x16 compensation: each destination byte is the
+       rounded-down average of the two full-pel predictors at (x0, y0)
+       and (x1, y1):  avg = (a & b) + ((a ^ b) >> 1), where the
+       per-byte shift is emulated by masking low bits (MASK_FE) before
+       a 64-bit shift.  If either predictor is not strictly interior,
+       the work is deferred to the edge-aware 8x8 version. */
+    if (x0 < 0 || x0 + 16 >= r->CX ||
+        y0 < 0 || y0 + 16 >= r->CY ||
+        x1 < 0 || x1 + 16 >= r->CX ||
+        y1 < 0 || y1 + 16 >= r->CY) {
+        MotionComp_Compensate8x8H_MMX(p,                 pitch, r, x0,     y0,     x1,     y1    );
+        MotionComp_Compensate8x8H_MMX(p + 8,             pitch, r, x0 + 8, y0,     x1 + 8, y1    );
+        MotionComp_Compensate8x8H_MMX(p + 8 * pitch,     pitch, r, x0,     y0 + 8, x1,     y1 + 8);
+        MotionComp_Compensate8x8H_MMX(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
+
+    } else {
+        const UINT8* a = r->Plane + y0 * r->Pitch + x0;
+        const UINT8* b = r->Plane + y1 * r->Pitch + x1;
+        UINT8* dst = p;
+        INT32 row;
+
+        const __m64 F = *((const __m64*)MASK_FE);
+
+        for (row = 0; row < 16; row++) {
+            __m64 A0 = *((const __m64*)(a + 0));
+            __m64 B0 = *((const __m64*)(b + 0));
+            __m64 A1 = *((const __m64*)(a + 8));
+            __m64 B1 = *((const __m64*)(b + 8));
+
+            *((__m64*)(dst + 0)) =
+                _mm_add_pi8(_mm_and_si64(A0, B0),
+                            _mm_srli_si64(_mm_and_si64(_mm_xor_si64(A0, B0), F), 1));
+            *((__m64*)(dst + 8)) =
+                _mm_add_pi8(_mm_and_si64(A1, B1),
+                            _mm_srli_si64(_mm_and_si64(_mm_xor_si64(A1, B1), F), 1));
+
+            a += r->Pitch;
+            b += r->Pitch;
+            dst += pitch;
+        }
+    }
+}
+
+void MotionComp_Compensate8x8H_MMX(
+    UINT8* p,
+    INT32 pitch,
+    const Plane_t* r,
+    INT32 x0,
+    INT32 y0,
+    INT32 x1,
+    INT32 y1)
+{
+    /* Half-pel 8x8 compensation: averages the full-pel predictors at
+       (x0, y0) and (x1, y1) into p, rounding down.  If either
+       predictor is not strictly interior, both are first materialized
+       into local aligned 8x8 buffers by Block_Extract8x8_MMX and
+       averaged from there. */
+    ALIGN(0x10) UINT8 buf0[64], buf1[64];
+
+    const UINT8* a = r->Plane + y0 * r->Pitch + x0;
+    INT32 ap = r->Pitch;
+
+    const UINT8* b = r->Plane + y1 * r->Pitch + x1;
+    INT32 bp = r->Pitch;
+
+    UINT8* dst = p;
+    INT32 row;
+
+    const __m64 F = *((const __m64*)MASK_FE);
+
+    if (x0 < 0 || x0 + 8 >= r->CX ||
+        y0 < 0 || y0 + 8 >= r->CY ||
+        x1 < 0 || x1 + 8 >= r->CX ||
+        y1 < 0 || y1 + 8 >= r->CY) {
+        /* Border case: extract edge-clamped copies and average those. */
+        Block_Extract8x8_MMX(r, x0, y0, buf0, 8);
+        Block_Extract8x8_MMX(r, x1, y1, buf1, 8);
+
+        a = buf0;
+        ap = 8;
+
+        b = buf1;
+        bp = 8;
+    }
+
+    for (row = 0; row < 8; row++) {
+        __m64 A = *((const __m64*)a);
+        __m64 B = *((const __m64*)b);
+
+        /* (A & B) + (((A ^ B) & 0xfe..fe) >> 1): per-byte floor average. */
+        *((__m64*)dst) =
+            _mm_add_pi8(_mm_and_si64(A, B),
+                        _mm_srli_si64(_mm_and_si64(_mm_xor_si64(A, B), F), 1));
+
+        a += ap;
+        b += bp;
+        dst += pitch;
+    }
+}
+
+/* */
+
+void MotionComp_Block16x16_MMX(
+    Plane_t* p,
+    INT32 x,
+    INT32 y,
+    const Plane_t* r,
+    const MotionVector_t* mv)
+{
+    /* Apply the luma motion vector mv (half-pel units) to the 16x16
+       block of p anchored at (x, y), predicting from reference r.
+       Full-pel vectors take the plain copy path; half-pel vectors
+       average two full-pel predictors straddling the true position. */
+    INT32 fx = mv->X >> 1;   /* full-pel component (relies on arithmetic
+                                shift for negative vectors, as before) */
+    INT32 fy = mv->Y >> 1;
+    INT32 hx = mv->X & 1;    /* half-pel flags */
+    INT32 hy = mv->Y & 1;
+
+    UINT8* d = p->Plane + y * p->Pitch + x;
+
+    if (hx | hy) {
+        /* Second predictor steps one full pel toward the vector's sign
+           on each half-pel axis (same as vx[mv->X >= 0] += dx). */
+        INT32 gx = fx;
+        INT32 gy = fy;
+
+        if (hx) { if (mv->X >= 0) gx += 1; else fx += 1; }
+        if (hy) { if (mv->Y >= 0) gy += 1; else fy += 1; }
+
+        MotionComp_Compensate16x16H_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy,
+            x + gx,
+            y + gy);
+
+    } else {
+        MotionComp_Compensate16x16_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy);
+    }
+}
+
+void MotionComp_Block8x8Y_MMX(
+    Plane_t* p,
+    INT32 x,
+    INT32 y,
+    const Plane_t* r,
+    const MotionVector_t* mv)
+{
+    /* Apply the luma motion vector mv (half-pel units) to the 8x8
+       block of p anchored at (x, y), predicting from reference r.
+       Same scheme as the 16x16 version, on an 8x8 block. */
+    INT32 fx = mv->X >> 1;   /* full-pel component */
+    INT32 fy = mv->Y >> 1;
+    INT32 hx = mv->X & 1;    /* half-pel flags */
+    INT32 hy = mv->Y & 1;
+
+    UINT8* d = p->Plane + y * p->Pitch + x;
+
+    if (hx | hy) {
+        /* Second predictor steps one full pel toward the vector's sign
+           on each half-pel axis. */
+        INT32 gx = fx;
+        INT32 gy = fy;
+
+        if (hx) { if (mv->X >= 0) gx += 1; else fx += 1; }
+        if (hy) { if (mv->Y >= 0) gy += 1; else fy += 1; }
+
+        MotionComp_Compensate8x8H_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy,
+            x + gx,
+            y + gy);
+
+    } else {
+        MotionComp_Compensate8x8_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy);
+    }
+}
+
+void MotionComp_Block8x8C_MMX(
+    Plane_t* p,
+    INT32 x,
+    INT32 y,
+    const Plane_t* r,
+    const MotionVector_t* mv0)
+{
+    /* Chroma 8x8 compensation: the luma vector mv0 is first halved to
+       chroma resolution, OR-ing the dropped bit back in so half-pel
+       precision survives the scaling, then applied exactly like the
+       luma 8x8 case. */
+    INT32 cx = (mv0->X >> 1) | (mv0->X & 1);   /* chroma-scaled vector */
+    INT32 cy = (mv0->Y >> 1) | (mv0->Y & 1);
+
+    INT32 fx = cx >> 1;   /* full-pel component */
+    INT32 fy = cy >> 1;
+    INT32 hx = cx & 1;    /* half-pel flags */
+    INT32 hy = cy & 1;
+
+    UINT8* d = p->Plane + y * p->Pitch + x;
+
+    if (hx | hy) {
+        /* Second predictor steps one full pel toward the vector's sign
+           on each half-pel axis. */
+        INT32 gx = fx;
+        INT32 gy = fy;
+
+        if (hx) { if (cx >= 0) gx += 1; else fx += 1; }
+        if (hy) { if (cy >= 0) gy += 1; else fy += 1; }
+
+        MotionComp_Compensate8x8H_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy,
+            x + gx,
+            y + gy);
+
+    } else {
+        MotionComp_Compensate8x8_MMX(
+            d,
+            p->Pitch,
+            r,
+            x + fx,
+            y + fy);
+    }
+}
+
+/* */
+