--- /dev/null
+/* FrameReconstructor_SSE2.c */
+/* 2009/07/02 */
+
+#include "StdAfx.h"
+
+#include "FrameReconstructor.h"
+
+#include "MotionComp_SSE2.h"
+
+/* */
+
+/* C4799: "function has no EMMS instruction". The __m64 (MMX) helpers below do
+   not call _mm_empty() themselves; presumably the caller issues EMMS, so the
+   warning is disabled here. */
+#pragma warning(disable : 4799)
+
+/* */
+
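+/* Transpose an 8x8 matrix of INT16 in three interleave rounds: unpack at
+   16-bit, then 32-bit, then 64-bit granularity. */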
+static void Transpose_SSE2(
+ const INT16* x,
+ INT16* y)
+{
+ const __m128i* X = (const __m128i*)x;
+ __m128i* Y = (__m128i*)y;
+
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+
+ t0 = _mm_loadu_si128(X + 0);
+ t1 = _mm_loadu_si128(X + 1);
+ t2 = _mm_loadu_si128(X + 2);
+ t3 = _mm_loadu_si128(X + 3);
+ t4 = _mm_loadu_si128(X + 4);
+ t5 = _mm_loadu_si128(X + 5);
+ t6 = _mm_loadu_si128(X + 6);
+ t7 = _mm_loadu_si128(X + 7);
+
+ u0 = _mm_unpacklo_epi16(t0, t1);
+ u1 = _mm_unpackhi_epi16(t0, t1);
+ u2 = _mm_unpacklo_epi16(t2, t3);
+ u3 = _mm_unpackhi_epi16(t2, t3);
+ u4 = _mm_unpacklo_epi16(t4, t5);
+ u5 = _mm_unpackhi_epi16(t4, t5);
+ u6 = _mm_unpacklo_epi16(t6, t7);
+ u7 = _mm_unpackhi_epi16(t6, t7);
+
+ t0 = _mm_unpacklo_epi32(u0, u2);
+ t1 = _mm_unpacklo_epi32(u1, u3);
+ t2 = _mm_unpackhi_epi32(u0, u2);
+ t3 = _mm_unpackhi_epi32(u1, u3);
+ t4 = _mm_unpacklo_epi32(u4, u6);
+ t5 = _mm_unpacklo_epi32(u5, u7);
+ t6 = _mm_unpackhi_epi32(u4, u6);
+ t7 = _mm_unpackhi_epi32(u5, u7);
+
+ Y[0] = _mm_unpacklo_epi64(t0, t4);
+ Y[1] = _mm_unpackhi_epi64(t0, t4);
+ Y[2] = _mm_unpacklo_epi64(t2, t6);
+ Y[3] = _mm_unpackhi_epi64(t2, t6);
+ Y[4] = _mm_unpacklo_epi64(t1, t5);
+ Y[5] = _mm_unpackhi_epi64(t1, t5);
+ Y[6] = _mm_unpacklo_epi64(t3, t7);
+ Y[7] = _mm_unpackhi_epi64(t3, t7);
+}
+
+void QT_UpdateDequantizeMatrix_SSE2(
+ FrameDecoder_t* t)
+{
+ FrameReconstructor_SSE2_t* r = t->Reconstructor;
+
+ INT32 i, p;
+
+ for (i = 0; i < 2; i++) {
+ for (p = 0; p < 3; p++) {
+ const INT16* x = t->Dequantize.Matrix[i][p];
+ INT16* y = r->Matrix[i][p];
+ Transpose_SSE2(x, y);
+ }
+ }
+}
+
+/* */
+
+static __inline void Block_CopyPlane8x8_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ Plane_t* r)
+{
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ s0 = *((const __m64*)s); s += r->Pitch;
+ s1 = *((const __m64*)s); s += r->Pitch;
+ s2 = *((const __m64*)s); s += r->Pitch;
+ s3 = *((const __m64*)s); s += r->Pitch;
+ s4 = *((const __m64*)s); s += r->Pitch;
+ s5 = *((const __m64*)s); s += r->Pitch;
+ s6 = *((const __m64*)s); s += r->Pitch;
+ s7 = *((const __m64*)s);
+
+ *((__m64*)d) = s0; d += p->Pitch;
+ *((__m64*)d) = s1; d += p->Pitch;
+ *((__m64*)d) = s2; d += p->Pitch;
+ *((__m64*)d) = s3; d += p->Pitch;
+ *((__m64*)d) = s4; d += p->Pitch;
+ *((__m64*)d) = s5; d += p->Pitch;
+ *((__m64*)d) = s6; d += p->Pitch;
+ *((__m64*)d) = s7;
+}
+
+static __inline void Block_CopyPlane16x16_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ Plane_t* r)
+{
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s7 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+
+ _mm_store_si128((__m128i*)d, s0); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s1); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s2); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s3); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s4); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s5); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s6); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s7); d += p->Pitch;
+
+ s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch;
+ s7 = _mm_load_si128((const __m128i*)s);
+
+ _mm_store_si128((__m128i*)d, s0); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s1); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s2); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s3); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s4); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s5); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s6); d += p->Pitch;
+ _mm_store_si128((__m128i*)d, s7);
+}
+
+/* */
+
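+/* Intra blocks carry no spatial prediction here: reconstruction is
+   residual + 128 (a flat mid-grey predictor), added with signed saturation
+   and packed back to UINT8. */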
+ALIGN(0x10) static const UINT16 IPRED[8] = {
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static __inline void Block_CopyIntra8x8_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const INT16* c)
+{
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ const __m128i* B = (const __m128i*)IPRED;
+ const __m128i* C = (const __m128i*)c;
+
+ __m128i s0, s1, s2, s3;
+ const __m128i z = _mm_setzero_si128();
+
+ s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], B[0]), z);
+ s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], B[0]), z);
+ s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], B[0]), z);
+ s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], B[0]), z);
+
+ _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch;
+
+ s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], B[0]), z);
+ s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], B[0]), z);
+ s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], B[0]), z);
+ s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], B[0]), z);
+
+ _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s3);
+}
+
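+/* Adds the residual to the prediction already written into the plane:
+   load 8 pixels, widen to 16-bit, add with signed saturation, repack. */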
+static __inline void Block_ReviseInter8x8_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const INT16* c)
+{
+ UINT8* d = p->Plane + y * p->Pitch + x;
+ const UINT8* s = d;
+
+ const __m128i* C = (const __m128i*)c;
+
+ __m128i b0, b1, b2, b3;
+ __m128i s0, s1, s2, s3;
+ const __m128i z = _mm_setzero_si128();
+
+ b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+ b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+ b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+ b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+
+ s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], b0), z);
+ s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], b1), z);
+ s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], b2), z);
+ s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], b3), z);
+
+ _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch;
+
+ b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+ b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+ b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch;
+ b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z);
+
+ s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], b0), z);
+ s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], b1), z);
+ s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], b2), z);
+ s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], b3), z);
+
+ _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
+ _mm_storel_epi64((__m128i*)d, s3);
+}
+
+/* */
+
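+/* Fixed-point cosine constants: COS[i] = round(65536 * cos(i*pi/16)) for
+   i = 1..7. COS[0] holds the rounding bias (8) added before the final >> 4.
+   Values >= 0x8000 are reinterpreted as negative by the signed multiply;
+   the MUL1 macro below compensates for that. */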
+ALIGN(0x10) static const UINT16 COS[8][8] = {
+ { 8, 8, 8, 8, 8, 8, 8, 8 }, /* 0 */
+ { 64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277 }, /* 1 */
+ { 60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547 }, /* 2 */
+ { 54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491 }, /* 3 */
+ { 46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341 }, /* 4 */
+ { 36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410 }, /* 5 */
+ { 25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080 }, /* 6 */
+ { 12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785 }, /* 7 */
+};
+
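+/* _mm_mulhi_epi16 computes (x * c) >> 16 with c taken as signed. A constant
+   c >= 0x8000 is effectively stored as c - 0x10000, so MUL1 adds x back once:
+   x + ((c - 65536) * x >> 16) == (c * x) >> 16. MUL0 is the plain high
+   product, for constants below 0x8000. */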
+#define MUL1(T,X) _mm_add_epi16(_mm_mulhi_epi16(X, C[T]), X)
+#define MUL0(T,X) _mm_mulhi_epi16(X, C[T])
+
+static __inline void IDCT_R_8_SSE2(
+ const INT16* x,
+ INT16* y)
+{
+ const __m128i* C = (const __m128i*)COS[0];
+ const __m128i* X = (const __m128i*)x;
+ __m128i* Y = (__m128i*)y;
+
+ __m128i s0;
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* Stage.1 */
+
+ s0 = _mm_add_epi16(X[0], X[4]);
+ t0 = MUL1(4, s0);
+
+ s0 = _mm_sub_epi16(X[0], X[4]);
+ t1 = MUL1(4, s0);
+
+ t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6]));
+ t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6]));
+
+ t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7]));
+ t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3]));
+
+ t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3]));
+ t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7]));
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_epi16(t4, t5);
+ t4 = _mm_add_epi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_epi16(t7, t6);
+ t7 = _mm_add_epi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_epi16(t0, t3);
+ t0 = _mm_add_epi16(t0, t3);
+
+ t3 = _mm_sub_epi16(t1, t2);
+ t1 = _mm_add_epi16(t1, t2);
+
+ t2 = _mm_sub_epi16(t6, t5);
+ t6 = _mm_add_epi16(t6, t5);
+
+ /* Stage.4 */
+
+ Y[0] = _mm_add_epi16(t0, t7);
+ Y[1] = _mm_add_epi16(t1, t6);
+ Y[2] = _mm_add_epi16(t3, t2);
+ Y[3] = _mm_add_epi16(s0, t4);
+ Y[4] = _mm_sub_epi16(s0, t4);
+ Y[5] = _mm_sub_epi16(t3, t2);
+ Y[6] = _mm_sub_epi16(t1, t6);
+ Y[7] = _mm_sub_epi16(t0, t7);
+}
+
+static __inline void IDCT_C_8_SSE2(
+ const INT16* x,
+ INT16* y)
+{
+ const __m128i* C = (const __m128i*)COS[0];
+ const __m128i* X = (const __m128i*)x;
+ __m128i* Y = (__m128i*)y;
+
+ __m128i s0;
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* Stage.1 */
+
+ s0 = _mm_add_epi16(X[0], X[4]);
+ t0 = MUL1(4, s0);
+
+ s0 = _mm_sub_epi16(X[0], X[4]);
+ t1 = MUL1(4, s0);
+
+ t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6]));
+ t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6]));
+
+ t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7]));
+ t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3]));
+
+ t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3]));
+ t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7]));
+
+ /* Stage.2 */
+
+ s0 = _mm_sub_epi16(t4, t5);
+ t4 = _mm_add_epi16(t4, t5);
+ t5 = MUL1(4, s0);
+
+ s0 = _mm_sub_epi16(t7, t6);
+ t7 = _mm_add_epi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+
+ s0 = _mm_sub_epi16(t0, t3);
+ t0 = _mm_add_epi16(t0, t3);
+
+ t3 = _mm_sub_epi16(t1, t2);
+ t1 = _mm_add_epi16(t1, t2);
+
+ t2 = _mm_sub_epi16(t6, t5);
+ t6 = _mm_add_epi16(t6, t5);
+
+ /* Stage.4 */
+
+ Y[0] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t7), C[0]), 4);
+ Y[1] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t6), C[0]), 4);
+ Y[2] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t3, t2), C[0]), 4);
+ Y[3] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, t4), C[0]), 4);
+ Y[4] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(s0, t4), C[0]), 4);
+ Y[5] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t3, t2), C[0]), 4);
+ Y[6] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t1, t6), C[0]), 4);
+ Y[7] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t0, t7), C[0]), 4);
+}
+
+static __inline void Transpose_U_SSE2(
+ const INT16* x,
+ INT16* y)
+{
+ const __m128i* X = (const __m128i*)x;
+ __m128i* Y = (__m128i*)y;
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ u0 = _mm_unpacklo_epi16(X[0], X[1]);
+ u1 = _mm_unpackhi_epi16(X[0], X[1]);
+ u2 = _mm_unpacklo_epi16(X[2], X[3]);
+ u3 = _mm_unpackhi_epi16(X[2], X[3]);
+ u4 = _mm_unpacklo_epi16(X[4], X[5]);
+ u5 = _mm_unpackhi_epi16(X[4], X[5]);
+ u6 = _mm_unpacklo_epi16(X[6], X[7]);
+ u7 = _mm_unpackhi_epi16(X[6], X[7]);
+
+ t0 = _mm_unpacklo_epi32(u0, u2);
+ t1 = _mm_unpacklo_epi32(u1, u3);
+ t2 = _mm_unpackhi_epi32(u0, u2);
+ t3 = _mm_unpackhi_epi32(u1, u3);
+ t4 = _mm_unpacklo_epi32(u4, u6);
+ t5 = _mm_unpacklo_epi32(u5, u7);
+ t6 = _mm_unpackhi_epi32(u4, u6);
+ t7 = _mm_unpackhi_epi32(u5, u7);
+
+ Y[0] = _mm_unpacklo_epi64(t0, t4);
+ Y[1] = _mm_unpackhi_epi64(t0, t4);
+ Y[2] = _mm_unpacklo_epi64(t2, t6);
+ Y[3] = _mm_unpackhi_epi64(t2, t6);
+ Y[4] = _mm_unpacklo_epi64(t1, t5);
+ Y[5] = _mm_unpackhi_epi64(t1, t5);
+ Y[6] = _mm_unpacklo_epi64(t3, t7);
+ Y[7] = _mm_unpackhi_epi64(t3, t7);
+}
+
+/* */
+
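+/* Inverse zig-zag scan: TZZ[i] is the position in the coefficient stream of
+   raster slot i. Note this is the transposed variant of the usual scan,
+   matching the transposed dequantization matrices prepared by
+   QT_UpdateDequantizeMatrix_SSE2. */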
+static const UINT8 TZZ[64] = {
+ 0, 2, 3, 9, 10, 20, 21, 35,
+ 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48,
+ 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57,
+ 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62,
+ 28, 42, 43, 53, 54, 60, 61, 63
+};
+
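+/* Pipeline: reorder from stream order, dequantize (16-bit multiply against
+   the pre-transposed matrix), 1-D iDCT, transpose, second 1-D iDCT with the
+   final rounding shift. Pre-transposing the matrix and the scan appears to
+   save one explicit transpose per block. */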
+static __inline void DequantizeIDCT8x8_SSE2(
+ const INT16* block,
+ const INT16* matrix,
+ INT16* coeff)
+{
+ ALIGN(0x10) INT16 c0[64];
+
+ { /* Reorder */
+ const UINT8* t = TZZ;
+
+ INT16* c = c0;
+ INT16* e = c + 64;
+ for (; c < e; c += 4, t += 4) {
+ c[0] = block[t[0]];
+ c[1] = block[t[1]];
+ c[2] = block[t[2]];
+ c[3] = block[t[3]];
+ }
+ }
+
+ { /* Dequantize */
+ const __m128i* m = (const __m128i*)matrix;
+ __m128i* d = (__m128i*)c0;
+
+ d[0] = _mm_mullo_epi16(d[0], m[0]);
+ d[1] = _mm_mullo_epi16(d[1], m[1]);
+ d[2] = _mm_mullo_epi16(d[2], m[2]);
+ d[3] = _mm_mullo_epi16(d[3], m[3]);
+ d[4] = _mm_mullo_epi16(d[4], m[4]);
+ d[5] = _mm_mullo_epi16(d[5], m[5]);
+ d[6] = _mm_mullo_epi16(d[6], m[6]);
+ d[7] = _mm_mullo_epi16(d[7], m[7]);
+ }
+
+ /* iDCT Row */
+ IDCT_R_8_SSE2(c0, coeff);
+
+ /* Transpose */
+ Transpose_U_SSE2(coeff, c0);
+
+	/* iDCT Column */
+ IDCT_C_8_SSE2(c0, coeff);
+}
+
+/* */
+
+struct DecodeCoefficientsContext {
+
+ INT32 EOB_Run[64];
+
+ INT8* Run [64];
+ INT16* Coeff[64];
+
+}; /* DecodeCoefficientsContext */
+
+typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
+
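+/* Coefficients for each of the 64 positions come from a separate run/value
+   stream (ctx->Run[i] / ctx->Coeff[i]). A negative run starts an end-of-block
+   run: the paired value gives how many blocks end at this position, this one
+   included. Otherwise `run` zeros are written, then the value itself. */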
+static INT32 DecodeCoefficients(
+ FrameDecoder_t* t,
+ DecodeCoefficientsContext_t* ctx,
+ INT16* block)
+{
+ INT16* b = block;
+ INT16* e = b + 64;
+
+ INT32 i = 0;
+
+ while (b < e) {
+ if (ctx->EOB_Run[i] > 0) {
+ ctx->EOB_Run[i] -= 1;
+ break;
+
+ } else {
+ INT32 run = *((ctx->Run [i])++);
+ INT16 coeff = *((ctx->Coeff[i])++);
+
+ if (run < 0) {
+ ctx->EOB_Run[i] = coeff;
+
+ } else {
+ INT16* p = b + run;
+ if (p >= e) {
+ break;
+ }
+
+ while (b < p) {
+ *(b++) = 0;
+ }
+
+ *(b++) = coeff;
+
+ i = b - block;
+ }
+ }
+ }
+
+ while (b < e) {
+ *(b++) = 0;
+ }
+
+ return i;
+}
+
+/* */
+
+static void Reconstruct_IntraBlock(
+ FrameDecoder_t* t,
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ INT16 dc,
+ INT32 plane,
+ Plane_t* r,
+ DecodeCoefficientsContext_t* ctx)
+{
+ ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 coeff[64];
+
+ const INT16 (*mat)[64] = t->Reconstructor->Matrix[0];
+
+ if (dc == NOT_CODED) {
+ Block_CopyPlane8x8_SSE2(p, x, y, r);
+ return;
+ }
+
+ DecodeCoefficients(t, ctx, block);
+
+ block[0] = dc;
+
+ DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
+
+ Block_CopyIntra8x8_SSE2(p, x, y, coeff);
+}
+
+static void Reconstruct_InterBlock(
+ FrameDecoder_t* t,
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ INT16 dc,
+ INT32 plane,
+ Plane_t* r,
+ DecodeCoefficientsContext_t* ctx)
+{
+ ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 coeff[64];
+
+ const INT16 (*mat)[64] = t->Reconstructor->Matrix[1];
+
+ if (dc == NOT_CODED) {
+ if (r != NULL) {
+ Block_CopyPlane8x8_SSE2(p, x, y, r);
+ }
+ return;
+ }
+
+ DecodeCoefficients(t, ctx, block);
+
+ block[0] = dc;
+
+ DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
+
+ Block_ReviseInter8x8_SSE2(p, x, y, coeff);
+}
+
+/* */
+
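+/* Traversal order inside a 32x32 superblock: S_PX/S_PY walk the sixteen 8x8
+   blocks along a Hilbert-style curve (the VP3 coding order); M_PX/M_PY give
+   the four 16x16 macroblock origins in the same fashion. */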
+static const INT8 S_PX[16] = {
+ 0*8, 1*8, 1*8, 0*8,
+ 0*8, 0*8, 1*8, 1*8,
+ 2*8, 2*8, 3*8, 3*8,
+ 3*8, 2*8, 2*8, 3*8
+};
+
+static const INT8 S_PY[16] = {
+ 0*8, 0*8, 1*8, 1*8,
+ 2*8, 3*8, 3*8, 2*8,
+ 2*8, 3*8, 3*8, 2*8,
+ 1*8, 1*8, 0*8, 0*8
+};
+
+static const INT8 M_PX[4] = {
+ 0*16, 0*16,
+ 1*16, 1*16
+};
+
+static const INT8 M_PY[4] = {
+ 0*16, 1*16,
+ 1*16, 0*16
+};
+
+/* */
+
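+/* Walks the luma plane superblock by superblock, dispatching each 16x16
+   macroblock on its coding mode; a per-block DC value of NOT_CODED marks
+   blocks that were skipped by the encoder. */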
+static void Reconstruct_YPlane_SSE2(
+ FrameDecoder_t* t)
+{
+ INT32 x, y;
+
+ INT32 sx = t->Index->SX[0] * 32;
+ INT32 sy = t->Index->SY[0] * 32;
+
+ INT32 mx = t->Index->MX * 16;
+ INT32 my = t->Index->MY * 16;
+
+ INT32 bx = t->Index->BX[0];
+
+ const UINT16* bi = t->Index->BIndex[0];
+
+ Plane_t* g = t->Frame[0];
+ Plane_t* p = t->Frame[1];
+ Plane_t* r = t->Frame[2];
+
+ const UINT8* mm = t->MBMode;
+ const MotionVector_t* mv = t->MV;
+
+ DecodeCoefficientsContext_t ctx = { 0 };
+
+ INT32 i;
+ for (i = 0; i < 64; i++) {
+ ctx.Run [i] = t->BRun [0][i];
+ ctx.Coeff[i] = t->BCoeff[0][i];
+ }
+
+ for (y = 0; y < sy; y += 32) {
+ for (x = 0; x < sx; x += 32) {
+ INT32 i = 0;
+
+ INT32 m;
+ for (m = 0; m < 4; m++, i += 4) {
+ INT32 x0 = x + M_PX[m];
+ INT32 y0 = y + M_PY[m];
+ if (x0 < mx && y0 < my) {
+ switch (*mm) {
+ case 0: /* INTER_NOMV */
+ Block_CopyPlane16x16_SSE2(p, x0, y0, r);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, NULL, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, NULL, &ctx);
+ break;
+
+ case 1: /* INTRA */
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_IntraBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 2: /* INTER_MV */
+ case 3: /* INTER_MV_LAST */
+ case 4: /* INTER_MV_LAST2 */
+ MotionComp_Block16x16_SSE2(p, x0, y0, r, mv);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 5: /* INTER_GOLDEN_NOMV */
+ Block_CopyPlane16x16_SSE2(p, x0, y0, g);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 6: /* INTER_GOLDEN_MV */
+ MotionComp_Block16x16_SSE2(p, x0, y0, g, mv);
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+
+ case 7: /* INTER_MV_FOUR */
+ {
+ const MotionVector_t* v = mv;
+
+ const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx;
+
+ if (dc[0] != NOT_CODED) {
+ MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 0, r, v++);
+ }
+
+ if (dc[1] != NOT_CODED) {
+ MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 0, r, v++);
+ }
+
+ if (dc[0 + bx] != NOT_CODED) {
+ MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 8, r, v++);
+ }
+
+ if (dc[1 + bx] != NOT_CODED) {
+ MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 8, r, v++);
+ }
+
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx);
+ Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx);
+ break;
+ }
+
+ } /* switch */
+
+ bi += 4;
+ mm += 1;
+ mv += 4;
+ }
+ }
+ }
+ }
+}
+
+/* */
+
+static void Reconstruct_CPlane_SSE2(
+ FrameDecoder_t* t)
+{
+ INT32 x, y;
+
+ INT32 sx = t->Index->SX[1] * 32;
+ INT32 sy = t->Index->SY[1] * 32;
+
+ INT32 mx = t->Index->MX * 8;
+ INT32 my = t->Index->MY * 8;
+
+ INT32 bx = t->Index->BX[1];
+
+ const UINT16* bi = t->Index->BIndex[1];
+
+ Plane_t* g = t->Frame[0];
+ Plane_t* p = t->Frame[1];
+ Plane_t* r = t->Frame[2];
+
+ const INT16* DC0 = t->DC + t->Index->BC[0];
+ const INT16* DC1 = DC0 + t->Index->BC[1];
+
+ const UINT8* m = t->BMode + t->Index->BC[0];
+
+ DecodeCoefficientsContext_t ctx[2] = { 0 };
+
+ INT32 i;
+ for (i = 0; i < 64; i++) {
+ ctx[0].Run [i] = t->BRun [1][i];
+ ctx[0].Coeff[i] = t->BCoeff[1][i];
+
+ ctx[1].Run [i] = t->BRun [2][i];
+ ctx[1].Coeff[i] = t->BCoeff[2][i];
+ }
+
+ for (y = 0; y < sy; y += 32) {
+ for (x = 0; x < sx; x += 32) {
+ INT32 i;
+ for (i = 0; i < 16; i++) {
+ INT32 xx = x + S_PX[i];
+ INT32 yy = y + S_PY[i];
+
+ if (xx < mx && yy < my) {
+ INT32 idx = (xx >> 3) + (yy >> 3) * bx;
+
+ switch (m[idx]) {
+ case 0: /* INTER_NOMV */
+ Block_CopyPlane8x8_SSE2(p + 1, xx, yy, r + 1);
+ Block_CopyPlane8x8_SSE2(p + 2, xx, yy, r + 2);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, NULL, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, NULL, ctx + 1);
+ break;
+
+ case 1: /* INTRA */
+ Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 2: /* INTER_MV */
+ case 3: /* INTER_MV_LAST */
+ case 4: /* INTER_MV_LAST2 */
+ MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx);
+ MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 5: /* INTER_GOLDEN_NOMV */
+ Block_CopyPlane8x8_SSE2(p + 1, xx, yy, g + 1);
+ Block_CopyPlane8x8_SSE2(p + 2, xx, yy, g + 2);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 6: /* INTER_GOLDEN_MV */
+ MotionComp_Block8x8C_SSE2(p + 1, xx, yy, g + 1, t->MVC + idx);
+ MotionComp_Block8x8C_SSE2(p + 2, xx, yy, g + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ case 7: /* INTER_MV_FOUR */
+ MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx);
+ MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, t->MVC + idx);
+
+ Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0);
+ Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1);
+ break;
+
+ } /* switch */
+ }
+ }
+ }
+ }
+}
+
+/* */
+
+void QT_ReconstructFrame_SSE2(
+ FrameDecoder_t* t)
+{
+ Reconstruct_YPlane_SSE2(t);
+
+ Reconstruct_CPlane_SSE2(t);
+
+ if (t->Filter.Limit > 0) {
+ QT_FrameLoopFilter(t);
+ }
+}
+
+/* */
+
--- /dev/null
+/* MotionComp_SSE2.c */
+/* 2009/07/02 */
+
+#include "StdAfx.h"
+
+#include "MotionComp_SSE2.h"
+
+/* */
+
+/* C4799: "function has no EMMS instruction"; as in FrameReconstructor_SSE2.c,
+   EMMS is presumably issued by the caller. */
+#pragma warning(disable : 4799)
+
+/* */
+
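+/* Fallback for motion vectors that reach outside the reference plane: fetch
+   an 8x8 block with coordinates clamped to the plane edges (edge-pixel
+   replication). */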
+static void Block_Extract8x8(
+ const Plane_t* plane,
+ INT32 x,
+ INT32 y,
+ UINT8* block)
+{
+ INT32 i, j;
+
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ INT32 xx = x + j;
+ INT32 yy = y + i;
+
+ if (xx < 0) {
+ xx = 0;
+ } else if (xx >= plane->CX) {
+ xx = plane->CX - 1;
+ }
+
+ if (yy < 0) {
+ yy = 0;
+ } else if (yy >= plane->CY) {
+ yy = plane->CY - 1;
+ }
+
+ block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
+ }
+ }
+}
+
+/* */
+
+void MotionComp_Compensate16x16_SSE2(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y)
+{
+ if (x >= 0 && x + 16 < r->CX &&
+ y >= 0 && y + 16 < r->CY) {
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ UINT8* d = p;
+
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+
+ _mm_store_si128((__m128i*)d, s0); d += pitch;
+ _mm_store_si128((__m128i*)d, s1); d += pitch;
+ _mm_store_si128((__m128i*)d, s2); d += pitch;
+ _mm_store_si128((__m128i*)d, s3); d += pitch;
+ _mm_store_si128((__m128i*)d, s4); d += pitch;
+ _mm_store_si128((__m128i*)d, s5); d += pitch;
+ _mm_store_si128((__m128i*)d, s6); d += pitch;
+ _mm_store_si128((__m128i*)d, s7); d += pitch;
+
+ s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
+ s7 = _mm_loadu_si128((const __m128i*)s);
+
+ _mm_store_si128((__m128i*)d, s0); d += pitch;
+ _mm_store_si128((__m128i*)d, s1); d += pitch;
+ _mm_store_si128((__m128i*)d, s2); d += pitch;
+ _mm_store_si128((__m128i*)d, s3); d += pitch;
+ _mm_store_si128((__m128i*)d, s4); d += pitch;
+ _mm_store_si128((__m128i*)d, s5); d += pitch;
+ _mm_store_si128((__m128i*)d, s6); d += pitch;
+ _mm_store_si128((__m128i*)d, s7);
+
+ } else {
+ MotionComp_Compensate8x8_SSE2(p, pitch, r, x, y );
+ MotionComp_Compensate8x8_SSE2(p + 8, pitch, r, x + 8, y );
+ MotionComp_Compensate8x8_SSE2(p + 8 * pitch, pitch, r, x, y + 8);
+ MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
+ }
+}
+
+void MotionComp_Compensate8x8_SSE2(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x,
+ INT32 y)
+{
+ ALIGN(0x10) UINT8 b[64];
+
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ INT32 p0 = r->Pitch;
+ UINT8* d = p;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ if (x < 0 || x + 8 >= r->CX ||
+ y < 0 || y + 8 >= r->CY) {
+ s = b;
+ p0 = 8;
+ Block_Extract8x8(r, x, y, b);
+ }
+
+ s0 = *((const __m64*)s); s += p0;
+ s1 = *((const __m64*)s); s += p0;
+ s2 = *((const __m64*)s); s += p0;
+ s3 = *((const __m64*)s); s += p0;
+ s4 = *((const __m64*)s); s += p0;
+ s5 = *((const __m64*)s); s += p0;
+ s6 = *((const __m64*)s); s += p0;
+ s7 = *((const __m64*)s);
+
+ *((__m64*)d) = s0; d += pitch;
+ *((__m64*)d) = s1; d += pitch;
+ *((__m64*)d) = s2; d += pitch;
+ *((__m64*)d) = s3; d += pitch;
+ *((__m64*)d) = s4; d += pitch;
+ *((__m64*)d) = s5; d += pitch;
+ *((__m64*)d) = s6; d += pitch;
+ *((__m64*)d) = s7;
+}
+
+/* */
+
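+/* Half-pel interpolation averages two predictions. _mm_avg_epu8 rounds up
+   ((a + b + 1) >> 1); subtracting (a ^ b) & 1 corrects this to the truncating
+   average (a + b) >> 1. */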
+ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+
+void MotionComp_Compensate16x16H_SSE2(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x0,
+ INT32 y0,
+ INT32 x1,
+ INT32 y1)
+{
+ if (x0 >= 0 && x0 + 16 < r->CX &&
+ y0 >= 0 && y0 + 16 < r->CY &&
+ x1 >= 0 && x1 + 16 < r->CX &&
+ y1 >= 0 && y1 + 16 < r->CY) {
+ const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
+ const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
+
+ UINT8* d = p;
+
+ __m128i S0, S1, D;
+ const __m128i M = *((const __m128i*)MASK_1);
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
+ S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D); d += pitch;
+
+ S0 = _mm_loadu_si128((const __m128i*)s0);
+ S1 = _mm_loadu_si128((const __m128i*)s1);
+ D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
+ _mm_store_si128((__m128i*)d, D);
+
+ } else {
+ MotionComp_Compensate8x8H_SSE2(p, pitch, r, x0, y0 , x1, y1 );
+ MotionComp_Compensate8x8H_SSE2(p + 8, pitch, r, x0 + 8, y0 , x1 + 8, y1 );
+ MotionComp_Compensate8x8H_SSE2(p + 8 * pitch, pitch, r, x0, y0 + 8, x1, y1 + 8);
+ MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
+ }
+}
+
+void MotionComp_Compensate8x8H_SSE2(
+ UINT8* p,
+ INT32 pitch,
+ const Plane_t* r,
+ INT32 x0,
+ INT32 y0,
+ INT32 x1,
+ INT32 y1)
+{
+ ALIGN(0x10) UINT8 b0[64], b1[64];
+
+ const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
+ INT32 p0 = r->Pitch;
+
+ const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
+ INT32 p1 = r->Pitch;
+
+ UINT8* d = p;
+
+ __m64 S0, S1, D;
+ const __m64 M = *((const __m64*)MASK_1);
+
+ if (x0 < 0 || x0 + 8 >= r->CX ||
+ y0 < 0 || y0 + 8 >= r->CY ||
+ x1 < 0 || x1 + 8 >= r->CX ||
+ y1 < 0 || y1 + 8 >= r->CY) {
+ s0 = b0;
+ p0 = 8;
+
+ s1 = b1;
+ p1 = 8;
+
+ Block_Extract8x8(r, x0, y0, b0);
+ Block_Extract8x8(r, x1, y1, b1);
+ }
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0); s0 += p0;
+ S1 = *((const __m64*)s1); s1 += p1;
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D; d += pitch;
+
+ S0 = *((const __m64*)s0);
+ S1 = *((const __m64*)s1);
+ D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
+ *((__m64*)d) = D;
+}
+
+/* */
+
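+/* Motion vectors are in half-pel units. The integer part is X >> 1; an odd
+   component selects the averaging path, with the one-pel offset applied to
+   the second tap for non-negative components and to the first for negative
+   ones, so the pair straddles the true half-pel position. */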
+void MotionComp_Block16x16_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const Plane_t* r,
+ const MotionVector_t* mv)
+{
+ INT32 dx = ((mv->X & 1) != 0);
+ INT32 dy = ((mv->Y & 1) != 0);
+
+ INT32 vx[2] = { mv->X >> 1 };
+ INT32 vy[2] = { mv->Y >> 1 };
+
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ if (dx == 0 && dy == 0) {
+ MotionComp_Compensate16x16_SSE2(
+ d,
+ p->Pitch,
+ r,
+ x + vx[0],
+ y + vy[0]);
+
+ } else {
+ vx[1] = vx[0];
+ vy[1] = vy[0];
+
+ vx[mv->X >= 0] += dx;
+ vy[mv->Y >= 0] += dy;
+
+ MotionComp_Compensate16x16H_SSE2(
+ d,
+ p->Pitch,
+ r,
+ x + vx[0],
+ y + vy[0],
+ x + vx[1],
+ y + vy[1]);
+ }
+}
+
+void MotionComp_Block8x8Y_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const Plane_t* r,
+ const MotionVector_t* mv)
+{
+ INT32 dx = ((mv->X & 1) != 0);
+ INT32 dy = ((mv->Y & 1) != 0);
+
+ INT32 vx[2] = { mv->X >> 1 };
+ INT32 vy[2] = { mv->Y >> 1 };
+
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ if (dx == 0 && dy == 0) {
+ MotionComp_Compensate8x8_SSE2(
+ d,
+ p->Pitch,
+ r,
+ x + vx[0],
+ y + vy[0]);
+
+ } else {
+ vx[1] = vx[0];
+ vy[1] = vy[0];
+
+ vx[mv->X >= 0] += dx;
+ vy[mv->Y >= 0] += dy;
+
+ MotionComp_Compensate8x8H_SSE2(
+ d,
+ p->Pitch,
+ r,
+ x + vx[0],
+ y + vy[0],
+ x + vx[1],
+ y + vy[1]);
+ }
+}
+
+void MotionComp_Block8x8C_SSE2(
+ Plane_t* p,
+ INT32 x,
+ INT32 y,
+ const Plane_t* r,
+ const MotionVector_t* mv0)
+{
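+	/* A luma vector in half-pel units is halved for chroma; OR-ing back the
+	   low bit keeps a half-pel component whenever the luma vector is odd. */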
+ MotionVector_t mv = {
+ (mv0->X >> 1) | (mv0->X & 1),
+ (mv0->Y >> 1) | (mv0->Y & 1)
+ };
+
+ INT32 dx = ((mv.X & 1) != 0);
+ INT32 dy = ((mv.Y & 1) != 0);
+
+ INT32 vx[2] = { mv.X >> 1 };
+ INT32 vy[2] = { mv.Y >> 1 };
+
+ UINT8* d = p->Plane + y * p->Pitch + x;
+
+ if (dx == 0 && dy == 0) {
+ MotionComp_Compensate8x8_SSE2(
+ d,
+ p->Pitch,
+ r,
+ x + vx[0],
+ y + vy[0]);
+
+ } else {
+ vx[1] = vx[0];
+ vy[1] = vy[0];
+
+ vx[mv.X >= 0] += dx;
+ vy[mv.Y >= 0] += dy;
+
+ MotionComp_Compensate8x8H_SSE2(
+ d,
+ p->Pitch,
+ r,
+ x + vx[0],
+ y + vy[0],
+ x + vx[1],
+ y + vy[1]);
+ }
+}
+
+/* */
+