From: Noumi Akira Date: Thu, 2 Jul 2009 08:34:43 +0000 (+0900) Subject: add FrameReconstructor_SSE2. X-Git-Tag: ex-1-preview-1~3 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=6744cea84bd5cff9946494599b34f82d592c0202;p=qtheora%2Fmain.git add FrameReconstructor_SSE2. --- diff --git a/Lib/QTheoraEx/FrameDecoder.c b/Lib/QTheoraEx/FrameDecoder.c index 08b7afd..a7d4647 100644 --- a/Lib/QTheoraEx/FrameDecoder.c +++ b/Lib/QTheoraEx/FrameDecoder.c @@ -630,6 +630,15 @@ BOOL QT_FrameDecoder_Setup( /* */ + if (g_QT_Enable_SSE2) { + t->Reconstructor = (FrameReconstructor_SSE2_t*)QT_MemoryPool_Allocate(pool, sizeof(FrameReconstructor_SSE2_t)); + if (t->Reconstructor == NULL) { + return FALSE; + } + } + + /* */ + t->QIndex = -1; /* */ @@ -691,6 +700,20 @@ BOOL QT_FrameDecoder_Setup( /* */ + if (g_QT_Enable_SSE2) { + t->UpdateDequantizeMatrix = QT_UpdateDequantizeMatrix_SSE2; + } else { + t->UpdateDequantizeMatrix = NULL; + } + + if (g_QT_Enable_SSE2) { + t->Reconstruct = QT_ReconstructFrame_SSE2; + } else { + t->Reconstruct = QT_ReconstructFrame; + } + + /* */ + return TRUE; } diff --git a/Lib/QTheoraEx/FrameDecoder.h b/Lib/QTheoraEx/FrameDecoder.h index 9e58844..0255c1f 100644 --- a/Lib/QTheoraEx/FrameDecoder.h +++ b/Lib/QTheoraEx/FrameDecoder.h @@ -75,6 +75,15 @@ struct MotionVector { typedef struct MotionVector MotionVector_t; +/* FrameReconstructor_SSE2 */ +struct FrameReconstructor_SSE2 { + + INT16 Matrix[2][3][64]; + +}; /* FrameReconstructor_SSE2 */ + +typedef struct FrameReconstructor_SSE2 FrameReconstructor_SSE2_t; + /* QT_FrameDecoder */ struct QT_FrameDecoder; @@ -94,6 +103,8 @@ struct QT_FrameDecoder { DequantizeMatrix_t Dequantize; + FrameReconstructor_SSE2_t* Reconstructor; + LoopFilter_t Filter; INT32 QIndex; @@ -126,6 +137,10 @@ struct QT_FrameDecoder { BOOL (*Decode)(FrameDecoder_t*, const VOID*, SIZE_T); + VOID (*UpdateDequantizeMatrix)(FrameDecoder_t*); + + VOID (*Reconstruct)(FrameDecoder_t*); + }; /* QT_FrameDecoder */ /* */ diff --git 
a/Lib/QTheoraEx/FrameDecoder_Impl.h b/Lib/QTheoraEx/FrameDecoder_Impl.h index 9bcb6be..a7e6dcf 100644 --- a/Lib/QTheoraEx/FrameDecoder_Impl.h +++ b/Lib/QTheoraEx/FrameDecoder_Impl.h @@ -610,6 +610,10 @@ static BOOL FrameDecoder_Decode( &(t->Filter), &(t->Setup->Filter), t->QIndex); + + if (t->UpdateDequantizeMatrix != NULL) { + t->UpdateDequantizeMatrix(t); + } } /* */ @@ -717,7 +721,7 @@ static BOOL FrameDecoder_Decode( /* */ - QT_ReconstructFrame(t); + t->Reconstruct(t); /* */ diff --git a/Lib/QTheoraEx/FrameReconstructor.h b/Lib/QTheoraEx/FrameReconstructor.h index 68f9ea8..754dc1c 100644 --- a/Lib/QTheoraEx/FrameReconstructor.h +++ b/Lib/QTheoraEx/FrameReconstructor.h @@ -5,9 +5,21 @@ #include "FrameDecoder.h" +/* */ + void QT_FrameLoopFilter( FrameDecoder_t* t); void QT_ReconstructFrame( FrameDecoder_t* t); +/* */ + +void QT_UpdateDequantizeMatrix_SSE2( + FrameDecoder_t* t); + +void QT_ReconstructFrame_SSE2( + FrameDecoder_t* t); + +/* */ + diff --git a/Lib/QTheoraEx/FrameReconstructor_SSE2.c b/Lib/QTheoraEx/FrameReconstructor_SSE2.c new file mode 100644 index 0000000..8b5e765 --- /dev/null +++ b/Lib/QTheoraEx/FrameReconstructor_SSE2.c @@ -0,0 +1,872 @@ +/* FrameReconstructor_SSE2.c */ +/* 2009/07/02 */ + +#include "StdAfx.h" + +#include "FrameReconstructor.h" + +#include "MotionComp_SSE2.h" + +/* */ + +#pragma warning(disable : 4799) + +/* */ + +static void Transpose_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + + t0 = _mm_loadu_si128(X + 0); + t1 = _mm_loadu_si128(X + 1); + t2 = _mm_loadu_si128(X + 2); + t3 = _mm_loadu_si128(X + 3); + t4 = _mm_loadu_si128(X + 4); + t5 = _mm_loadu_si128(X + 5); + t6 = _mm_loadu_si128(X + 6); + t7 = _mm_loadu_si128(X + 7); + + u0 = _mm_unpacklo_epi16(t0, t1); + u1 = _mm_unpackhi_epi16(t0, t1); + u2 = _mm_unpacklo_epi16(t2, t3); + u3 = _mm_unpackhi_epi16(t2, t3); + u4 = 
_mm_unpacklo_epi16(t4, t5); + u5 = _mm_unpackhi_epi16(t4, t5); + u6 = _mm_unpacklo_epi16(t6, t7); + u7 = _mm_unpackhi_epi16(t6, t7); + + t0 = _mm_unpacklo_epi32(u0, u2); + t1 = _mm_unpacklo_epi32(u1, u3); + t2 = _mm_unpackhi_epi32(u0, u2); + t3 = _mm_unpackhi_epi32(u1, u3); + t4 = _mm_unpacklo_epi32(u4, u6); + t5 = _mm_unpacklo_epi32(u5, u7); + t6 = _mm_unpackhi_epi32(u4, u6); + t7 = _mm_unpackhi_epi32(u5, u7); + + Y[0] = _mm_unpacklo_epi64(t0, t4); + Y[1] = _mm_unpackhi_epi64(t0, t4); + Y[2] = _mm_unpacklo_epi64(t2, t6); + Y[3] = _mm_unpackhi_epi64(t2, t6); + Y[4] = _mm_unpacklo_epi64(t1, t5); + Y[5] = _mm_unpackhi_epi64(t1, t5); + Y[6] = _mm_unpacklo_epi64(t3, t7); + Y[7] = _mm_unpackhi_epi64(t3, t7); +} + +void QT_UpdateDequantizeMatrix_SSE2( + FrameDecoder_t* t) +{ + FrameReconstructor_SSE2_t* r = t->Reconstructor; + + INT32 i, p; + + for (i = 0; i < 2; i++) { + for (p = 0; p < 3; p++) { + const INT16* x = t->Dequantize.Matrix[i][p]; + INT16* y = r->Matrix[i][p]; + Transpose_SSE2(x, y); + } + } +} + +/* */ + +static __inline void Block_CopyPlane8x8_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + Plane_t* r) +{ + const UINT8* s = r->Plane + y * r->Pitch + x; + UINT8* d = p->Plane + y * p->Pitch + x; + + __m64 s0, s1, s2, s3, s4, s5, s6, s7; + + s0 = *((const __m64*)s); s += r->Pitch; + s1 = *((const __m64*)s); s += r->Pitch; + s2 = *((const __m64*)s); s += r->Pitch; + s3 = *((const __m64*)s); s += r->Pitch; + s4 = *((const __m64*)s); s += r->Pitch; + s5 = *((const __m64*)s); s += r->Pitch; + s6 = *((const __m64*)s); s += r->Pitch; + s7 = *((const __m64*)s); + + *((__m64*)d) = s0; d += p->Pitch; + *((__m64*)d) = s1; d += p->Pitch; + *((__m64*)d) = s2; d += p->Pitch; + *((__m64*)d) = s3; d += p->Pitch; + *((__m64*)d) = s4; d += p->Pitch; + *((__m64*)d) = s5; d += p->Pitch; + *((__m64*)d) = s6; d += p->Pitch; + *((__m64*)d) = s7; +} + +static __inline void Block_CopyPlane16x16_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + Plane_t* r) +{ + const UINT8* s = r->Plane + y * 
r->Pitch + x; + UINT8* d = p->Plane + y * p->Pitch + x; + + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + + _mm_store_si128((__m128i*)d, s0); d += p->Pitch; + _mm_store_si128((__m128i*)d, s1); d += p->Pitch; + _mm_store_si128((__m128i*)d, s2); d += p->Pitch; + _mm_store_si128((__m128i*)d, s3); d += p->Pitch; + _mm_store_si128((__m128i*)d, s4); d += p->Pitch; + _mm_store_si128((__m128i*)d, s5); d += p->Pitch; + _mm_store_si128((__m128i*)d, s6); d += p->Pitch; + _mm_store_si128((__m128i*)d, s7); d += p->Pitch; + + s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_load_si128((const __m128i*)s); + + _mm_store_si128((__m128i*)d, s0); d += p->Pitch; + _mm_store_si128((__m128i*)d, s1); d += p->Pitch; + _mm_store_si128((__m128i*)d, s2); d += p->Pitch; + _mm_store_si128((__m128i*)d, s3); d += p->Pitch; + _mm_store_si128((__m128i*)d, s4); d += p->Pitch; + _mm_store_si128((__m128i*)d, s5); d += p->Pitch; + _mm_store_si128((__m128i*)d, s6); d += p->Pitch; + _mm_store_si128((__m128i*)d, s7); +} + +/* */ + +ALIGN(0x10) static const UINT16 IPRED[8] = { + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static __inline void Block_CopyIntra8x8_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + 
const INT16* c) +{ + UINT8* d = p->Plane + y * p->Pitch + x; + + const __m128i* B = (const __m128i*)IPRED; + const __m128i* C = (const __m128i*)c; + + __m128i s0, s1, s2, s3; + const __m128i z = _mm_setzero_si128(); + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], B[0]), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], B[0]), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], B[0]), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], B[0]), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch; + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], B[0]), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], B[0]), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], B[0]), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], B[0]), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); +} + +static __inline void Block_ReviseInter8x8_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const INT16* c) +{ + UINT8* d = p->Plane + y * p->Pitch + x; + const UINT8* s = d; + + const __m128i* C = (const __m128i*)c; + + __m128i b0, b1, b2, b3; + __m128i s0, s1, s2, s3; + const __m128i z = _mm_setzero_si128(); + + b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], b0), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], b1), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], b2), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], b3), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + 
_mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch; + + b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], b0), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], b1), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], b2), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], b3), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); +} + +/* */ + +ALIGN(0x10) static const UINT16 COS[8][8] = { + { 8, 8, 8, 8, 8, 8, 8, 8 }, /* 0 */ + { 64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277 }, /* 1 */ + { 60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547 }, /* 2 */ + { 54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491 }, /* 3 */ + { 46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341 }, /* 4 */ + { 36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410 }, /* 5 */ + { 25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080 }, /* 6 */ + { 12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785 }, /* 7 */ +}; + +#define MUL1(T,X) _mm_add_epi16(_mm_mulhi_epi16(X, C[T]), X) +#define MUL0(T,X) _mm_mulhi_epi16(X, C[T]) + +static __inline void IDCT_R_8_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* C = (const __m128i*)COS[0]; + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i s0; + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + + /* Stage.1 */ + + s0 = _mm_add_epi16(X[0], X[4]); + t0 = MUL1(4, s0); + + s0 = _mm_sub_epi16(X[0], X[4]); + t1 = MUL1(4, s0); + + t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, 
X[6])); + t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6])); + + t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7])); + t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3])); + + t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3])); + t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7])); + + /* Stage.2 */ + + s0 = _mm_sub_epi16(t4, t5); + t4 = _mm_add_epi16(t4, t5); + t5 = MUL1(4, s0); + + s0 = _mm_sub_epi16(t7, t6); + t7 = _mm_add_epi16(t7, t6); + t6 = MUL1(4, s0); + + /* Stage.3 */ + + s0 = _mm_sub_epi16(t0, t3); + t0 = _mm_add_epi16(t0, t3); + + t3 = _mm_sub_epi16(t1, t2); + t1 = _mm_add_epi16(t1, t2); + + t2 = _mm_sub_epi16(t6, t5); + t6 = _mm_add_epi16(t6, t5); + + /* Stage.4 */ + + Y[0] = _mm_add_epi16(t0, t7); + Y[1] = _mm_add_epi16(t1, t6); + Y[2] = _mm_add_epi16(t3, t2); + Y[3] = _mm_add_epi16(s0, t4); + Y[4] = _mm_sub_epi16(s0, t4); + Y[5] = _mm_sub_epi16(t3, t2); + Y[6] = _mm_sub_epi16(t1, t6); + Y[7] = _mm_sub_epi16(t0, t7); +} + +static __inline void IDCT_C_8_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* C = (const __m128i*)COS[0]; + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i s0; + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + + /* Stage.1 */ + + s0 = _mm_add_epi16(X[0], X[4]); + t0 = MUL1(4, s0); + + s0 = _mm_sub_epi16(X[0], X[4]); + t1 = MUL1(4, s0); + + t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6])); + t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6])); + + t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7])); + t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3])); + + t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3])); + t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7])); + + /* Stage.2 */ + + s0 = _mm_sub_epi16(t4, t5); + t4 = _mm_add_epi16(t4, t5); + t5 = MUL1(4, s0); + + s0 = _mm_sub_epi16(t7, t6); + t7 = _mm_add_epi16(t7, t6); + t6 = MUL1(4, s0); + + /* Stage.3 */ + + s0 = _mm_sub_epi16(t0, t3); + t0 = _mm_add_epi16(t0, t3); + + t3 = _mm_sub_epi16(t1, t2); + t1 = _mm_add_epi16(t1, t2); + + t2 = _mm_sub_epi16(t6, t5); + t6 = 
_mm_add_epi16(t6, t5); + + /* Stage.4 */ + + Y[0] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t7), C[0]), 4); + Y[1] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t6), C[0]), 4); + Y[2] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t3, t2), C[0]), 4); + Y[3] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, t4), C[0]), 4); + Y[4] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(s0, t4), C[0]), 4); + Y[5] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t3, t2), C[0]), 4); + Y[6] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t1, t6), C[0]), 4); + Y[7] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t0, t7), C[0]), 4); +} + +static __inline void Transpose_U_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + + u0 = _mm_unpacklo_epi16(X[0], X[1]); + u1 = _mm_unpackhi_epi16(X[0], X[1]); + u2 = _mm_unpacklo_epi16(X[2], X[3]); + u3 = _mm_unpackhi_epi16(X[2], X[3]); + u4 = _mm_unpacklo_epi16(X[4], X[5]); + u5 = _mm_unpackhi_epi16(X[4], X[5]); + u6 = _mm_unpacklo_epi16(X[6], X[7]); + u7 = _mm_unpackhi_epi16(X[6], X[7]); + + t0 = _mm_unpacklo_epi32(u0, u2); + t1 = _mm_unpacklo_epi32(u1, u3); + t2 = _mm_unpackhi_epi32(u0, u2); + t3 = _mm_unpackhi_epi32(u1, u3); + t4 = _mm_unpacklo_epi32(u4, u6); + t5 = _mm_unpacklo_epi32(u5, u7); + t6 = _mm_unpackhi_epi32(u4, u6); + t7 = _mm_unpackhi_epi32(u5, u7); + + Y[0] = _mm_unpacklo_epi64(t0, t4); + Y[1] = _mm_unpackhi_epi64(t0, t4); + Y[2] = _mm_unpacklo_epi64(t2, t6); + Y[3] = _mm_unpackhi_epi64(t2, t6); + Y[4] = _mm_unpacklo_epi64(t1, t5); + Y[5] = _mm_unpackhi_epi64(t1, t5); + Y[6] = _mm_unpacklo_epi64(t3, t7); + Y[7] = _mm_unpackhi_epi64(t3, t7); +} + +/* */ + +static const UINT8 TZZ[64] = { + 0, 2, 3, 9, 10, 20, 21, 35, + 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, + 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, + 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 
44, 52, 55, 59, 62, + 28, 42, 43, 53, 54, 60, 61, 63 +}; + +static __inline void DequantizeIDCT8x8_SSE2( + const INT16* block, + const INT16* matrix, + INT16* coeff) +{ + ALIGN(0x10) INT16 c0[64]; + + { /* Reorder */ + const UINT8* t = TZZ; + + INT16* c = c0; + INT16* e = c + 64; + for (; c < e; c += 4, t += 4) { + c[0] = block[t[0]]; + c[1] = block[t[1]]; + c[2] = block[t[2]]; + c[3] = block[t[3]]; + } + } + + { /* Dequantize */ + const __m128i* m = (const __m128i*)matrix; + __m128i* d = (__m128i*)c0; + + d[0] = _mm_mullo_epi16(d[0], m[0]); + d[1] = _mm_mullo_epi16(d[1], m[1]); + d[2] = _mm_mullo_epi16(d[2], m[2]); + d[3] = _mm_mullo_epi16(d[3], m[3]); + d[4] = _mm_mullo_epi16(d[4], m[4]); + d[5] = _mm_mullo_epi16(d[5], m[5]); + d[6] = _mm_mullo_epi16(d[6], m[6]); + d[7] = _mm_mullo_epi16(d[7], m[7]); + } + + /* iDCT Row */ + IDCT_R_8_SSE2(c0, coeff); + + /* Transpose */ + Transpose_U_SSE2(coeff, c0); + + /* iDCT Colum */ + IDCT_C_8_SSE2(c0, coeff); +} + +/* */ + +struct DecodeCoefficientsContext { + + INT32 EOB_Run[64]; + + INT8* Run [64]; + INT16* Coeff[64]; + +}; /* DecodeCoefficientsContext */ + +typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t; + +static INT32 DecodeCoefficients( + FrameDecoder_t* t, + DecodeCoefficientsContext_t* ctx, + INT16* block) +{ + INT16* b = block; + INT16* e = b + 64; + + INT32 i = 0; + + while (b < e) { + if (ctx->EOB_Run[i] > 0) { + ctx->EOB_Run[i] -= 1; + break; + + } else { + INT32 run = *((ctx->Run [i])++); + INT16 coeff = *((ctx->Coeff[i])++); + + if (run < 0) { + ctx->EOB_Run[i] = coeff; + + } else { + INT16* p = b + run; + if (p >= e) { + break; + } + + while (b < p) { + *(b++) = 0; + } + + *(b++) = coeff; + + i = b - block; + } + } + } + + while (b < e) { + *(b++) = 0; + } + + return i; +} + +/* */ + +static void Reconstruct_IntraBlock( + FrameDecoder_t* t, + Plane_t* p, + INT32 x, + INT32 y, + INT16 dc, + INT32 plane, + Plane_t* r, + DecodeCoefficientsContext_t* ctx) +{ + ALIGN(0x10) INT16 block[64]; + 
ALIGN(0x10) INT16 coeff[64]; + + const INT16 (*mat)[64] = t->Reconstructor->Matrix[0]; + + if (dc == NOT_CODED) { + Block_CopyPlane8x8_SSE2(p, x, y, r); + return; + } + + DecodeCoefficients(t, ctx, block); + + block[0] = dc; + + DequantizeIDCT8x8_SSE2(block, mat[plane], coeff); + + Block_CopyIntra8x8_SSE2(p, x, y, coeff); +} + +static void Reconstruct_InterBlock( + FrameDecoder_t* t, + Plane_t* p, + INT32 x, + INT32 y, + INT16 dc, + INT32 plane, + Plane_t* r, + DecodeCoefficientsContext_t* ctx) +{ + ALIGN(0x10) INT16 block[64]; + ALIGN(0x10) INT16 coeff[64]; + + const INT16 (*mat)[64] = t->Reconstructor->Matrix[1]; + + if (dc == NOT_CODED) { + if (r != NULL) { + Block_CopyPlane8x8_SSE2(p, x, y, r); + } + return; + } + + DecodeCoefficients(t, ctx, block); + + block[0] = dc; + + DequantizeIDCT8x8_SSE2(block, mat[plane], coeff); + + Block_ReviseInter8x8_SSE2(p, x, y, coeff); +} + +/* */ + +/* */ + +static const INT8 S_PX[16] = { + 0*8, 1*8, 1*8, 0*8, + 0*8, 0*8, 1*8, 1*8, + 2*8, 2*8, 3*8, 3*8, + 3*8, 2*8, 2*8, 3*8 +}; + +static const INT8 S_PY[16] = { + 0*8, 0*8, 1*8, 1*8, + 2*8, 3*8, 3*8, 2*8, + 2*8, 3*8, 3*8, 2*8, + 1*8, 1*8, 0*8, 0*8 +}; + +static const INT8 M_PX[4] = { + 0*16, 0*16, + 1*16, 1*16 +}; + +static const INT8 M_PY[4] = { + 0*16, 1*16, + 1*16, 0*16 +}; + +/* */ + +static void Reconstruct_YPlane_SSE2( + FrameDecoder_t* t) +{ + INT32 x, y; + + INT32 sx = t->Index->SX[0] * 32; + INT32 sy = t->Index->SY[0] * 32; + + INT32 mx = t->Index->MX * 16; + INT32 my = t->Index->MY * 16; + + INT32 bx = t->Index->BX[0]; + + const UINT16* bi = t->Index->BIndex[0]; + + Plane_t* g = t->Frame[0]; + Plane_t* p = t->Frame[1]; + Plane_t* r = t->Frame[2]; + + const UINT8* mm = t->MBMode; + const MotionVector_t* mv = t->MV; + + DecodeCoefficientsContext_t ctx = { 0 }; + + INT32 i; + for (i = 0; i < 64; i++) { + ctx.Run [i] = t->BRun [0][i]; + ctx.Coeff[i] = t->BCoeff[0][i]; + } + + for (y = 0; y < sy; y += 32) { + for (x = 0; x < sx; x += 32) { + INT32 i = 0; + + INT32 m; + for 
(m = 0; m < 4; m++, i += 4) { + INT32 x0 = x + M_PX[m]; + INT32 y0 = y + M_PY[m]; + if (x0 < mx && y0 < my) { + switch (*mm) { + case 0: /* INTER_NOMV */ + Block_CopyPlane16x16_SSE2(p, x0, y0, r); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, NULL, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, NULL, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, NULL, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, NULL, &ctx); + break; + + case 1: /* INTRA */ + Reconstruct_IntraBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_IntraBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_IntraBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_IntraBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 2: /* INTER_MV */ + case 3: /* INTER_MV_LAST */ + case 4: /* INTER_MV_LAST2 */ + MotionComp_Block16x16_SSE2(p, x0, y0, r, mv); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 5: /* INTER_GOLDEN_NOMV */ + Block_CopyPlane16x16_SSE2(p, x0, y0, g); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 6: /* INTER_GOLDEN_MV */ + 
MotionComp_Block16x16_SSE2(p, x0, y0, g, mv); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 7: /* INTER_MV_FOUR */ + { + const MotionVector_t* v = mv; + + const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx; + + if (dc[0] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 0, r, v++); + } + + if (dc[1] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 0, r, v++); + } + + if (dc[0 + bx] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 8, r, v++); + } + + if (dc[1 + bx] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 8, r, v++); + } + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + } + + } /* switch */ + + bi += 4; + mm += 1; + mv += 4; + } + } + } + } +} + +/* */ + +static void Reconstruct_CPlane_SSE2( + FrameDecoder_t* t) +{ + INT32 x, y; + + INT32 sx = t->Index->SX[1] * 32; + INT32 sy = t->Index->SY[1] * 32; + + INT32 mx = t->Index->MX * 8; + INT32 my = t->Index->MY * 8; + + INT32 bx = t->Index->BX[1]; + + const UINT16* bi = t->Index->BIndex[1]; + + Plane_t* g = t->Frame[0]; + Plane_t* p = t->Frame[1]; + Plane_t* r = t->Frame[2]; + + const INT16* DC0 = t->DC + t->Index->BC[0]; + const INT16* DC1 = DC0 + t->Index->BC[1]; + + const UINT8* m = t->BMode + t->Index->BC[0]; + + DecodeCoefficientsContext_t ctx[2] = { 0 }; + + INT32 i; + for (i = 0; i < 64; i++) { + 
ctx[0].Run [i] = t->BRun [1][i]; + ctx[0].Coeff[i] = t->BCoeff[1][i]; + + ctx[1].Run [i] = t->BRun [2][i]; + ctx[1].Coeff[i] = t->BCoeff[2][i]; + } + + for (y = 0; y < sy; y += 32) { + for (x = 0; x < sx; x += 32) { + INT32 i; + for (i = 0; i < 16; i++) { + INT32 xx = x + S_PX[i]; + INT32 yy = y + S_PY[i]; + + if (xx < mx && yy < my) { + INT32 idx = (xx >> 3) + (yy >> 3) * bx; + + switch (m[idx]) { + case 0: /* INTER_NOMV */ + Block_CopyPlane8x8_SSE2(p + 1, xx, yy, r + 1); + Block_CopyPlane8x8_SSE2(p + 2, xx, yy, r + 2); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, NULL, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, NULL, ctx + 1); + break; + + case 1: /* INTRA */ + Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 2: /* INTER_MV */ + case 3: /* INTER_MV_LAST */ + case 4: /* INTER_MV_LAST2 */ + MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx); + MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, t->MVC + idx); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 5: /* INTER_GOLDEN_NOMV */ + Block_CopyPlane8x8_SSE2(p + 1, xx, yy, g + 1); + Block_CopyPlane8x8_SSE2(p + 2, xx, yy, g + 2); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 6: /* INTER_GOLDEN_MV */ + MotionComp_Block8x8C_SSE2(p + 1, xx, yy, g + 1, t->MVC + idx); + MotionComp_Block8x8C_SSE2(p + 2, xx, yy, g + 2, t->MVC + idx); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 7: /* INTER_MV_FOUR */ + MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx); + MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, 
t->MVC + idx); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + } /* switch */ + } + } + } + } +} + +/* */ + +void QT_ReconstructFrame_SSE2( + FrameDecoder_t* t) +{ + Reconstruct_YPlane_SSE2(t); + + Reconstruct_CPlane_SSE2(t); + + if (t->Filter.Limit > 0) { + QT_FrameLoopFilter(t); + } +} + +/* */ + diff --git a/Lib/QTheoraEx/MotionComp_SSE2.c b/Lib/QTheoraEx/MotionComp_SSE2.c new file mode 100644 index 0000000..078145f --- /dev/null +++ b/Lib/QTheoraEx/MotionComp_SSE2.c @@ -0,0 +1,466 @@ +/* MotionComp_SSE2.c */ +/* 2009/07/02 */ + +#include "StdAfx.h" + +#include "MotionComp_SSE2.h" + +/* */ + +#pragma warning(disable : 4799) + +/* */ + +static void Block_Extract8x8( + const Plane_t* plane, + INT32 x, + INT32 y, + UINT8* block) +{ + INT32 i, j; + + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + INT32 xx = x + j; + INT32 yy = y + i; + + if (xx < 0) { + xx = 0; + } else if (xx >= plane->CX) { + xx = plane->CX - 1; + } + + if (yy < 0) { + yy = 0; + } else if (yy >= plane->CY) { + yy = plane->CY - 1; + } + + block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx]; + } + } +} + +/* */ + +void MotionComp_Compensate16x16_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y) +{ + if (x >= 0 && x + 16 < r->CX && + y >= 0 && y + 16 < r->CY) { + const UINT8* s = r->Plane + y * r->Pitch + x; + UINT8* d = p; + + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + + 
_mm_store_si128((__m128i*)d, s0); d += pitch; + _mm_store_si128((__m128i*)d, s1); d += pitch; + _mm_store_si128((__m128i*)d, s2); d += pitch; + _mm_store_si128((__m128i*)d, s3); d += pitch; + _mm_store_si128((__m128i*)d, s4); d += pitch; + _mm_store_si128((__m128i*)d, s5); d += pitch; + _mm_store_si128((__m128i*)d, s6); d += pitch; + _mm_store_si128((__m128i*)d, s7); d += pitch; + + s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_loadu_si128((const __m128i*)s); + + _mm_store_si128((__m128i*)d, s0); d += pitch; + _mm_store_si128((__m128i*)d, s1); d += pitch; + _mm_store_si128((__m128i*)d, s2); d += pitch; + _mm_store_si128((__m128i*)d, s3); d += pitch; + _mm_store_si128((__m128i*)d, s4); d += pitch; + _mm_store_si128((__m128i*)d, s5); d += pitch; + _mm_store_si128((__m128i*)d, s6); d += pitch; + _mm_store_si128((__m128i*)d, s7); + + } else { + MotionComp_Compensate8x8_SSE2(p, pitch, r, x, y ); + MotionComp_Compensate8x8_SSE2(p + 8, pitch, r, x + 8, y ); + MotionComp_Compensate8x8_SSE2(p + 8 * pitch, pitch, r, x, y + 8); + MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8); + } +} + +void MotionComp_Compensate8x8_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y) +{ + ALIGN(0x10) UINT8 b[64]; + + const UINT8* s = r->Plane + y * r->Pitch + x; + INT32 p0 = r->Pitch; + UINT8* d = p; + + __m64 s0, s1, s2, s3, s4, s5, s6, s7; + + if (x < 0 || x + 8 >= r->CX || + y < 0 || y + 8 >= r->CY) { + s = b; + p0 = 8; + Block_Extract8x8(r, x, y, b); + } + + s0 = *((const __m64*)s); s += p0; + s1 = *((const __m64*)s); s += p0; + s2 = *((const __m64*)s); s += p0; + s3 = *((const 
__m64*)s); s += p0; + s4 = *((const __m64*)s); s += p0; + s5 = *((const __m64*)s); s += p0; + s6 = *((const __m64*)s); s += p0; + s7 = *((const __m64*)s); + + *((__m64*)d) = s0; d += pitch; + *((__m64*)d) = s1; d += pitch; + *((__m64*)d) = s2; d += pitch; + *((__m64*)d) = s3; d += pitch; + *((__m64*)d) = s4; d += pitch; + *((__m64*)d) = s5; d += pitch; + *((__m64*)d) = s6; d += pitch; + *((__m64*)d) = s7; +} + +/* */ + +ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + +void MotionComp_Compensate16x16H_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x0, + INT32 y0, + INT32 x1, + INT32 y1) +{ + if (x0 >= 0 && x0 + 16 < r->CX && + y0 >= 0 && y0 + 16 < r->CY && + x1 >= 0 && x1 + 16 < r->CX && + y1 >= 0 && y1 + 16 < r->CY) { + const UINT8* s0 = r->Plane + y0 * r->Pitch + x0; + const UINT8* s1 = r->Plane + y1 * r->Pitch + x1; + + UINT8* d = p; + + __m128i S0, S1, D; + const __m128i M = *((const __m128i*)MASK_1); + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = 
_mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += 
pitch; /* NOTE(review): completes the "d += pitch;" begun on the previous
          source line; this is the tail of MotionComp_Compensate16x16H_SSE2,
          whose head (signature, locals S0/S1/D/M, s0/s1/d, and the bounds
          test) lies above this chunk. */

	/* Remaining rows of the fully unrolled 16-byte-wide half-pel average.
	 * _mm_avg_epu8 computes the rounding-up average (a + b + 1) >> 1;
	 * subtracting ((a ^ b) & M) -- MASK_1 is presumably 0x01 in every
	 * byte, confirm at its definition -- turns that into the truncating
	 * average (a + b) >> 1 of the two predictor rows. */
	S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
	S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D); d += pitch;

	S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
	S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D); d += pitch;

	S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
	S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D); d += pitch;

	/* Last row: no pointer advance needed. */
	S0 = _mm_loadu_si128((const __m128i*)s0);
	S1 = _mm_loadu_si128((const __m128i*)s1);
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D);

	} else {
		/* Presumably the out-of-bounds case (the condition is above this
		 * chunk -- TODO confirm): fall back to four 8x8 half-pel blocks,
		 * which handle edge extraction themselves. */
		MotionComp_Compensate8x8H_SSE2(p, pitch, r, x0, y0, x1, y1);
		MotionComp_Compensate8x8H_SSE2(p + 8, pitch, r, x0 + 8, y0, x1 + 8, y1);
		MotionComp_Compensate8x8H_SSE2(p + 8 * pitch, pitch, r, x0, y0 + 8, x1, y1 + 8);
		MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
	}
}

/* Half-pel motion compensation of one 8x8 block (MMX).
 *
 * Averages the two 8x8 predictors at (x0,y0) and (x1,y1) in reference
 * plane r and stores the result to p with stride `pitch`.  If either
 * predictor is not fully inside the plane, both are first copied into
 * 16-byte-aligned scratch buffers via Block_Extract8x8 (defined
 * elsewhere; presumably performs edge clamping -- TODO confirm).
 *
 * NOTE(review): uses __m64 (MMX) registers and never executes EMMS
 * itself -- see the "#pragma warning(disable : 4799)" at the top of this
 * file; the caller is presumably responsible for _mm_empty().
 *
 * NOTE(review): the bounds test uses ">= CX/CY", so a block ending
 * exactly on the right/bottom edge (x0 + 8 == CX) also takes the slow
 * extraction path -- conservative but not wrong; confirm intent. */
void MotionComp_Compensate8x8H_SSE2(
	UINT8* p,
	INT32 pitch,
	const Plane_t* r,
	INT32 x0,
	INT32 y0,
	INT32 x1,
	INT32 y1)
{
	ALIGN(0x10) UINT8 b0[64], b1[64];

	const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
	INT32 p0 = r->Pitch;

	const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
	INT32 p1 = r->Pitch;

	UINT8* d = p;

	__m64 S0, S1, D;
	const __m64 M = *((const __m64*)MASK_1);

	if (x0 < 0 || x0 + 8 >= r->CX ||
	    y0 < 0 || y0 + 8 >= r->CY ||
	    x1 < 0 || x1 + 8 >= r->CX ||
	    y1 < 0 || y1 + 8 >= r->CY) {
		s0 = b0;
		p0 = 8;

		s1 = b1;
		p1 = 8;

		Block_Extract8x8(r, x0, y0, b0);
		Block_Extract8x8(r, x1, y1, b1);
	}

	/* Eight unrolled rows; same truncating-average trick as above,
	 * in 8-byte MMX form: avg_pu8 rounds up, the xor/and/sub removes
	 * the rounding bias. */
	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	/* Last row: no pointer advance needed. */
	S0 = *((const __m64*)s0);
	S1 = *((const __m64*)s1);
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D;
}

/* */

/* 16x16 (luma macroblock) motion-compensation dispatch.
 * dx/dy flag a half-pel component of the motion vector; vx[0]/vy[0]
 * is the full-pel vector, vx[1]/vy[1] (set in the else branch) the
 * second sample position for half-pel averaging. */
void MotionComp_Block16x16_SSE2(
	Plane_t* p,
	INT32 x,
	INT32 y,
	const Plane_t* r,
	const MotionVector_t* mv)
{
	INT32 dx = ((mv->X & 1) != 0);
	INT32 dy = ((mv->Y & 1) != 0);

	INT32 vx[2] = { mv->X >> 1 };
	INT32 vy[2] = { mv->Y >> 1 };

	UINT8* d = p->Plane + y * p->Pitch + x;

	if (dx == 0 && dy == 0) {
		/* Full-pel vector: plain copy path. */
		MotionComp_Compensate16x16_SSE2(
			d,
			p->Pitch,
			r,
			x + vx[0],
			y + vy[0]);

	} else {
		vx[1] = vx[0];
		vy[1] = vy[0];

		/* Half-pel: bump the second sample for non-negative components,
		 * the first one otherwise. */
		vx[mv->X >= 0] += dx;
		vy[mv->Y >= 0] += dy;

MotionComp_Compensate16x16H_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0], + x + vx[1], + y + vy[1]); + } +} + +void MotionComp_Block8x8Y_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv) +{ + INT32 dx = ((mv->X & 1) != 0); + INT32 dy = ((mv->Y & 1) != 0); + + INT32 vx[2] = { mv->X >> 1 }; + INT32 vy[2] = { mv->Y >> 1 }; + + UINT8* d = p->Plane + y * p->Pitch + x; + + if (dx == 0 && dy == 0) { + MotionComp_Compensate8x8_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0]); + + } else { + vx[1] = vx[0]; + vy[1] = vy[0]; + + vx[mv->X >= 0] += dx; + vy[mv->Y >= 0] += dy; + + MotionComp_Compensate8x8H_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0], + x + vx[1], + y + vy[1]); + } +} + +void MotionComp_Block8x8C_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv0) +{ + MotionVector_t mv = { + (mv0->X >> 1) | (mv0->X & 1), + (mv0->Y >> 1) | (mv0->Y & 1) + }; + + INT32 dx = ((mv.X & 1) != 0); + INT32 dy = ((mv.Y & 1) != 0); + + INT32 vx[2] = { mv.X >> 1 }; + INT32 vy[2] = { mv.Y >> 1 }; + + UINT8* d = p->Plane + y * p->Pitch + x; + + if (dx == 0 && dy == 0) { + MotionComp_Compensate8x8_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0]); + + } else { + vx[1] = vx[0]; + vy[1] = vy[0]; + + vx[mv.X >= 0] += dx; + vy[mv.Y >= 0] += dy; + + MotionComp_Compensate8x8H_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0], + x + vx[1], + y + vy[1]); + } +} + +/* */ + diff --git a/Lib/QTheoraEx/MotionComp_SSE2.h b/Lib/QTheoraEx/MotionComp_SSE2.h new file mode 100644 index 0000000..59020a0 --- /dev/null +++ b/Lib/QTheoraEx/MotionComp_SSE2.h @@ -0,0 +1,66 @@ +/* MotionComp_SSE2.h */ +/* 2009/07/02 */ + +#pragma once + +#include "FrameDecoder.h" + +/* */ + +void MotionComp_Compensate16x16_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y); + +void MotionComp_Compensate8x8_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y); + +void 
MotionComp_Compensate16x16H_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x0, + INT32 y0, + INT32 x1, + INT32 y1); + +void MotionComp_Compensate8x8H_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x0, + INT32 y0, + INT32 x1, + INT32 y1); + +/* */ + +void MotionComp_Block16x16_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv); + +void MotionComp_Block8x8Y_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv); + +void MotionComp_Block8x8C_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv); + +/* */ + diff --git a/Lib/QTheoraEx/QTheoraEx.vcproj b/Lib/QTheoraEx/QTheoraEx.vcproj index 275e3f8..fc350d8 100644 --- a/Lib/QTheoraEx/QTheoraEx.vcproj +++ b/Lib/QTheoraEx/QTheoraEx.vcproj @@ -165,10 +165,18 @@ > + + + + + +