Y[7] = _mm_sub_epi16(t0, t7);
}
+static __inline void IDCT_R_8_4_SSE2(
+ const INT16* x, /* in:  only the first four 128-bit vectors (x[0..31]) are read */
+ INT16* y) /* out: eight 128-bit vectors (y[0..63]) are written */
+{ /*
+ * Reduced 8-point inverse-DCT pass (the visible caller labels it "iDCT Row").
+ * Only X[0]..X[3] are read; X[4]..X[7] are never touched, so the last four
+ * input vectors are effectively treated as zero (the visible caller
+ * zero-fills them). All eight output vectors Y[0]..Y[7] are produced.
+ *
+ * Every __m128i holds eight INT16 lanes that are processed in parallel.
+ * x and y are accessed through __m128i pointers, so both must be 16-byte
+ * aligned.
+ *
+ * NOTE(review): MUL0/MUL1 are defined outside this view; presumably they
+ * scale the operand by the cosine constant with the given index -- confirm
+ * against the macro definitions.
+ */
+ const __m128i* C = (const __m128i*)COS[0]; /* not referenced directly below; presumably used by MUL0/MUL1 */
+ const __m128i* X = (const __m128i*)x;
+ __m128i* Y = (__m128i*)y;
+
+ __m128i s0; /* scratch for the difference half of a butterfly */
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* Stage.1 */
+ /* t0 and t1 receive the same X[0] product; no X[4] term is read */
+ t1 = t0 = MUL1(4, X[0]);
+ /* both products below come from X[2] alone */
+ t2 = MUL0(6, X[2]);
+ t3 = MUL1(2, X[2]);
+ /* products of X[1] and X[3]; the X[3] product stored in t5 is negated */
+ t4 = MUL0(7, X[1]);
+ t5 = _mm_sub_epi16(_mm_setzero_si128(), MUL1(5, X[3]));
+
+ t6 = MUL1(3, X[3]);
+ t7 = MUL1(1, X[1]);
+
+ /* Stage.2 */
+ /* (t4, t5): the sum stays in t4, the difference goes through MUL1(4, .) into t5 */
+ s0 = _mm_sub_epi16(t4, t5);
+ t4 = _mm_add_epi16(t4, t5);
+ t5 = MUL1(4, s0);
+ /* (t7, t6): the sum stays in t7, the difference goes through MUL1(4, .) into t6 */
+ s0 = _mm_sub_epi16(t7, t6);
+ t7 = _mm_add_epi16(t7, t6);
+ t6 = MUL1(4, s0);
+
+ /* Stage.3 */
+ /* three sum/difference pairs: (t0, t3) -> t0, s0; (t1, t2) -> t1, t3; (t6, t5) -> t6, t2 */
+ s0 = _mm_sub_epi16(t0, t3);
+ t0 = _mm_add_epi16(t0, t3);
+
+ t3 = _mm_sub_epi16(t1, t2);
+ t1 = _mm_add_epi16(t1, t2);
+
+ t2 = _mm_sub_epi16(t6, t5);
+ t6 = _mm_add_epi16(t6, t5);
+
+ /* Stage.4 */
+ /* outputs are mirrored pairs: Y[k] is a sum and Y[7-k] the matching difference */
+ Y[0] = _mm_add_epi16(t0, t7);
+ Y[1] = _mm_add_epi16(t1, t6);
+ Y[2] = _mm_add_epi16(t3, t2);
+ Y[3] = _mm_add_epi16(s0, t4);
+ Y[4] = _mm_sub_epi16(s0, t4);
+ Y[5] = _mm_sub_epi16(t3, t2);
+ Y[6] = _mm_sub_epi16(t1, t6);
+ Y[7] = _mm_sub_epi16(t0, t7);
+}
+
static __inline void IDCT_C_8_SSE2(
const INT16* x,
INT16* y)
/* */
+static __inline void DequantizeIDCT8x8_16_SSE2(
+ const INT16* block, /* in:  coded coefficients, indexed through TZZ */
+ const INT16* matrix, /* in:  multipliers; the first 32 entries are read via __m128i, so 16-byte aligned */
+ INT16* coeff) /* out: 64 INT16 results; also used as intermediate storage, 16-byte aligned */
+{ /*
+ * Reduced dequantize + two-pass inverse DCT for an 8x8 block.
+ * Only 16 coefficients are taken from block: entries 0..3 of each of the
+ * first four 8-wide rows of the working buffer. Every other entry of the
+ * working buffer stays zero. The visible callers select this routine when
+ * the coefficient decoder returns a count in the range 2..9.
+ *
+ * Steps: clear the buffer, gather 16 coefficients, multiply the first four
+ * rows by matrix, run the reduced first pass, transpose, run the full
+ * second pass into coeff.
+ */
+ ALIGN(0x10) INT16 c0[64]; /* working 8x8 buffer, aligned for the __m128i accesses below */
+
+ const __m128i z = _mm_setzero_si128();
+ /* clear all 64 entries of the working buffer */
+ _mm_store_si128((__m128i*)(c0 + 0x00), z);
+ _mm_store_si128((__m128i*)(c0 + 0x08), z);
+ _mm_store_si128((__m128i*)(c0 + 0x10), z);
+ _mm_store_si128((__m128i*)(c0 + 0x18), z);
+ _mm_store_si128((__m128i*)(c0 + 0x20), z);
+ _mm_store_si128((__m128i*)(c0 + 0x28), z);
+ _mm_store_si128((__m128i*)(c0 + 0x30), z);
+ _mm_store_si128((__m128i*)(c0 + 0x38), z);
+
+ { /* Reorder */
+ const UINT8* t = TZZ; /* NOTE(review): TZZ is defined elsewhere; presumably the zig-zag index table -- confirm */
+
+ INT16* c = c0;
+ INT16* e = c + 32; /* stop after four rows of eight */
+ for (; c < e; c += 8, t += 8) { /* one row per iteration; only columns 0..3 are filled */
+ c[0] = block[t[0]];
+ c[1] = block[t[1]];
+ c[2] = block[t[2]];
+ c[3] = block[t[3]];
+ }
+ }
+
+ { /* Dequantize */
+ const __m128i* m = (const __m128i*)matrix;
+ __m128i* d = (__m128i*)c0;
+ /* lane-wise 16-bit multiply (low half of each product kept) on rows 0..3; rows 4..7 are still zero and are left alone */
+ d[0] = _mm_mullo_epi16(d[0], m[0]);
+ d[1] = _mm_mullo_epi16(d[1], m[1]);
+ d[2] = _mm_mullo_epi16(d[2], m[2]);
+ d[3] = _mm_mullo_epi16(d[3], m[3]);
+ }
+
+ /* iDCT Row */
+ IDCT_R_8_4_SSE2(c0, coeff); /* reads only the first four vectors of c0, writes all eight of coeff */
+
+ /* Transpose */
+ Transpose_U_SSE2(coeff, c0); /* NOTE(review): presumably transposes coeff into c0, since c0 feeds the next pass -- confirm */
+
+ /* iDCT Column */
+ IDCT_C_8_SSE2(c0, coeff);
+}
+
+/* */
+
static __inline void DequantizeIDCT8x8_0_SSE2(
INT16 dc,
const INT16* matrix,
/* */
-struct DecodeCoefficientsContext {
+struct DecodeCoefficientsLeaf { /* decoding state for one block position; the context below holds 64 of these */
- INT32 EOB_Run[64];
+ INT32 EOB_Run; /* pending counter: while > 0 the decode loop decrements it and stops; loaded from the coefficient stream when a negative run is read */
- INT8* Run [64];
- INT16* Coeff[64];
+ INT8* Run; /* cursor into an INT8 run stream; advanced by one on every read */
+ INT16* Coeff; /* cursor into an INT16 coefficient stream; advanced together with Run */
+
+}; /* DecodeCoefficientsLeaf */
+
+typedef struct DecodeCoefficientsLeaf DecodeCoefficientsLeaf_t;
+
+struct DecodeCoefficientsContext { /* replaces the three parallel 64-entry arrays with one array of leaves */
+
+ DecodeCoefficientsLeaf_t Leaf[64]; /* indexed by position inside the block; the decode loop selects Leaf[b - block] */
}; /* DecodeCoefficientsContext */
INT16* b = block;
INT16* e = b + 64;
- INT32 i = 0;
+ DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
const __m128i z = _mm_setzero_si128();
_mm_store_si128((__m128i*)(block + 0x38), z);
while (b < e) {
- if (ctx->EOB_Run[i] > 0) {
- ctx->EOB_Run[i] -= 1;
+ if (leaf->EOB_Run > 0) {
+ leaf->EOB_Run -= 1;
break;
} else {
- INT32 run = *((ctx->Run [i])++);
- INT16 coeff = *((ctx->Coeff[i])++);
+ INT32 run = *((leaf->Run )++);
+ INT32 coeff = *((leaf->Coeff)++);
if (run < 0) {
- ctx->EOB_Run[i] = coeff;
+ leaf->EOB_Run = coeff;
} else {
b += run;
*(b++) = coeff;
- i = b - block;
+ leaf = ctx->Leaf + (b - block);
}
}
}
- return i;
+ return b - block;
}
/* */
const INT16 (*mat)[64] = t->Reconstructor->Matrix[0];
+ INT32 cs;
+
if (dc == NOT_CODED) {
Block_CopyPlane8x8_SSE2(p, x, y, r);
return;
}
- if (DecodeCoefficients_SSE2(t, ctx, block) >= 2) {
+ cs = DecodeCoefficients_SSE2(t, ctx, block);
+
+ if (cs >= 10) {
block[0] = dc;
DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
+ } else if (cs >= 2) {
+ block[0] = dc;
+ DequantizeIDCT8x8_16_SSE2(block, mat[plane], coeff);
+
} else {
DequantizeIDCT8x8_0_SSE2(dc, mat[plane], coeff);
}
const INT16 (*mat)[64] = t->Reconstructor->Matrix[1];
+ INT32 cs;
+
if (dc == NOT_CODED) {
if (r != NULL) {
Block_CopyPlane8x8_SSE2(p, x, y, r);
return;
}
- if (DecodeCoefficients_SSE2(t, ctx, block) >= 2) {
+ cs = DecodeCoefficients_SSE2(t, ctx, block);
+
+ if (cs >= 10) {
block[0] = dc;
DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
+ } else if (cs >= 2) {
+ block[0] = dc;
+ DequantizeIDCT8x8_16_SSE2(block, mat[plane], coeff);
+
} else {
DequantizeIDCT8x8_0_SSE2(dc, mat[plane], coeff);
}
const UINT8* mm = t->MBMode;
const MotionVector_t* mv = t->MV;
- DecodeCoefficientsContext_t ctx = { 0 };
+ ALIGN(0x10) DecodeCoefficientsContext_t ctx = { 0 };
INT32 i;
for (i = 0; i < 64; i++) {
- ctx.Run [i] = t->BRun [0][i];
- ctx.Coeff[i] = t->BCoeff[0][i];
+ ctx.Leaf[i].Run = t->BRun [0][i];
+ ctx.Leaf[i].Coeff = t->BCoeff[0][i];
}
for (y = 0; y < sy; y += 32) {
const UINT8* m = t->BMode + t->Index->BC[0];
- DecodeCoefficientsContext_t ctx[2] = { 0 };
+ ALIGN(0x10) DecodeCoefficientsContext_t ctx[2] = { 0 };
INT32 i;
for (i = 0; i < 64; i++) {
- ctx[0].Run [i] = t->BRun [1][i];
- ctx[0].Coeff[i] = t->BCoeff[1][i];
+ ctx[0].Leaf[i].Run = t->BRun [1][i];
+ ctx[0].Leaf[i].Coeff = t->BCoeff[1][i];
- ctx[1].Run [i] = t->BRun [2][i];
- ctx[1].Coeff[i] = t->BCoeff[2][i];
+ ctx[1].Leaf[i].Run = t->BRun [2][i];
+ ctx[1].Leaf[i].Coeff = t->BCoeff[2][i];
}
for (y = 0; y < sy; y += 32) {