/* */
-static const UINT8 TZZ[64] = {
- 0, 2, 3, 9, 10, 20, 21, 35,
- 1, 4, 8, 11, 19, 22, 34, 36,
- 5, 7, 12, 18, 23, 33, 37, 48,
- 6, 13, 17, 24, 32, 38, 47, 49,
- 14, 16, 25, 31, 39, 46, 50, 57,
- 15, 26, 30, 40, 45, 51, 56, 58,
- 27, 29, 41, 44, 52, 55, 59, 62,
- 28, 42, 43, 53, 54, 60, 61, 63
-};
-
static __inline void DequantizeIDCT8x8_MMX(
const INT16* block,
const INT16* matrix,
{
ALIGN(0x10) INT16 c0[64];
- { /* Reorder */
- const UINT8* t = TZZ;
-
- INT16* c = c0;
- INT16* e = c + 64;
- for (; c < e; c += 8, t += 8) {
- c[0] = block[t[0]];
- c[1] = block[t[1]];
- c[2] = block[t[2]];
- c[3] = block[t[3]];
- c[4] = block[t[4]];
- c[5] = block[t[5]];
- c[6] = block[t[6]];
- c[7] = block[t[7]];
- }
- }
-
{ /* Dequantize */
- __m64* d = (__m64*) c0;
- __m64* e = (__m64*)(c0 + 64);
+ const __m64* b = (const __m64*)block;
const __m64* m = (const __m64*)matrix;
+ __m64* d = (__m64*) c0;
- for (; d < e; d += 4, m += 4) {
- d[0] = _mm_mullo_pi16(d[0], m[0]);
- d[1] = _mm_mullo_pi16(d[1], m[1]);
- d[2] = _mm_mullo_pi16(d[2], m[2]);
- d[3] = _mm_mullo_pi16(d[3], m[3]);
- }
+ d[ 0] = _mm_mullo_pi16(b[ 0], m[ 0]);
+ d[ 1] = _mm_mullo_pi16(b[ 1], m[ 1]);
+ d[ 2] = _mm_mullo_pi16(b[ 2], m[ 2]);
+ d[ 3] = _mm_mullo_pi16(b[ 3], m[ 3]);
+
+ d[ 4] = _mm_mullo_pi16(b[ 4], m[ 4]);
+ d[ 5] = _mm_mullo_pi16(b[ 5], m[ 5]);
+ d[ 6] = _mm_mullo_pi16(b[ 6], m[ 6]);
+ d[ 7] = _mm_mullo_pi16(b[ 7], m[ 7]);
+
+ d[ 8] = _mm_mullo_pi16(b[ 8], m[ 8]);
+ d[ 9] = _mm_mullo_pi16(b[ 9], m[ 9]);
+ d[10] = _mm_mullo_pi16(b[10], m[10]);
+ d[11] = _mm_mullo_pi16(b[11], m[11]);
+
+ d[12] = _mm_mullo_pi16(b[12], m[12]);
+ d[13] = _mm_mullo_pi16(b[13], m[13]);
+ d[14] = _mm_mullo_pi16(b[14], m[14]);
+ d[15] = _mm_mullo_pi16(b[15], m[15]);
}
/* iDCT Row */
{
ALIGN(0x10) INT16 c0[64];
- const __m64 z = _mm_setzero_si64();
-
- *((__m64*)(c0 + 0x00)) = z;
- *((__m64*)(c0 + 0x08)) = z;
- *((__m64*)(c0 + 0x10)) = z;
- *((__m64*)(c0 + 0x18)) = z;
-
- /* Reorder */
- c0[ 0 + 0] = block[TZZ[ 0 + 0]];
- c0[ 0 + 1] = block[TZZ[ 0 + 1]];
- c0[ 0 + 2] = block[TZZ[ 0 + 2]];
- c0[ 0 + 3] = block[TZZ[ 0 + 3]];
-
- c0[ 8 + 0] = block[TZZ[ 8 + 0]];
- c0[ 8 + 1] = block[TZZ[ 8 + 1]];
- c0[ 8 + 2] = block[TZZ[ 8 + 2]];
-
- c0[16 + 0] = block[TZZ[16 + 0]];
- c0[16 + 1] = block[TZZ[16 + 1]];
-
- c0[24 + 0] = block[TZZ[24 + 0]];
-
{ /* Dequantize */
+ const __m64* b = (const __m64*)block;
const __m64* m = (const __m64*)matrix;
__m64* d = (__m64*)c0;
- d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
- d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
- d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
- d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
+ d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]);
+ d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]);
+ d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]);
+ d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]);
}
/* iDCT Row */
typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
+ALIGN(0x10) static const UINT8 IZZ[64] = { /* IZZ[i] = index in block[] that receives the i-th coefficient in scan order */
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63
+};
+
+/*
+ * Decodes coefficients into block (64 INT16 entries).
+ * block is first cleared to zero; each decoded value is then stored at
+ * block[IZZ[pos]], where pos is its position in scan order.
+ * Returns the scan position reached when the loop exits; this can be
+ * 64 or more when a run overshoots the end of the block.
+ */
static INT32 DecodeCoefficients_MMX(
FrameDecoder_t* t,
DecodeCoefficientsContext_t* ctx,
INT16* block)
{
- INT16* b = block;
- INT16* e = b + 64;
+ const UINT8* bi = IZZ; /* scan-position cursor; element type matches the UINT8 table */
+ const UINT8* ei = IZZ + 64;
DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
const __m64 z = _mm_setzero_si64();
- for (; b < e; b += 16) {
- *((__m64*)(b + 0)) = z;
- *((__m64*)(b + 4)) = z;
- *((__m64*)(b + 8)) = z;
- *((__m64*)(b + 12)) = z;
- }
+ *((__m64*)(block + 0x00)) = z;
+ *((__m64*)(block + 0x04)) = z;
+ *((__m64*)(block + 0x08)) = z;
+ *((__m64*)(block + 0x0c)) = z;
+
+ *((__m64*)(block + 0x10)) = z;
+ *((__m64*)(block + 0x14)) = z;
+ *((__m64*)(block + 0x18)) = z;
+ *((__m64*)(block + 0x1c)) = z;
- b = block;
+ *((__m64*)(block + 0x20)) = z;
+ *((__m64*)(block + 0x24)) = z;
+ *((__m64*)(block + 0x28)) = z;
+ *((__m64*)(block + 0x2c)) = z;
- while (b < e) {
+ *((__m64*)(block + 0x30)) = z;
+ *((__m64*)(block + 0x34)) = z;
+ *((__m64*)(block + 0x38)) = z;
+ *((__m64*)(block + 0x3c)) = z;
+
+ while (bi < ei) {
if (leaf->EOB_Run > 0) {
leaf->EOB_Run -= 1;
break;
leaf->EOB_Run = coeff;
} else {
- b += run;
- if (b >= e) {
- break;
- }
+ bi += run;
+ if (bi >= ei) { /* run overshot the table: stop before reading past IZZ[63] */
+ break;
+ }
- *(b++) = coeff;
+ block[*(bi++)] = coeff;
- leaf = ctx->Leaf + (b - block);
+ leaf = ctx->Leaf + (bi - IZZ);
}
}
}
- return b - block;
+ return bi - IZZ;
}
/* */
Plane_t* r,
DecodeCoefficientsContext_t* ctx)
{
- ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 block[64 + 64];
ALIGN(0x10) INT16 coeff[64];
const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];
Plane_t* r,
DecodeCoefficientsContext_t* ctx)
{
- ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 block[64 + 64];
ALIGN(0x10) INT16 coeff[64];
const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];
/* */
-static const UINT8 TZZ[64] = {
- 0, 2, 3, 9, 10, 20, 21, 35,
- 1, 4, 8, 11, 19, 22, 34, 36,
- 5, 7, 12, 18, 23, 33, 37, 48,
- 6, 13, 17, 24, 32, 38, 47, 49,
- 14, 16, 25, 31, 39, 46, 50, 57,
- 15, 26, 30, 40, 45, 51, 56, 58,
- 27, 29, 41, 44, 52, 55, 59, 62,
- 28, 42, 43, 53, 54, 60, 61, 63
-};
-
static __inline void DequantizeIDCT8x8_SSE2(
const INT16* block,
const INT16* matrix,
{
ALIGN(0x10) INT16 c0[64];
- { /* Reorder */
- const UINT8* t = TZZ;
-
- INT16* c = c0;
- INT16* e = c + 64;
- for (; c < e; c += 8, t += 8) {
- c[0] = block[t[0]];
- c[1] = block[t[1]];
- c[2] = block[t[2]];
- c[3] = block[t[3]];
- c[4] = block[t[4]];
- c[5] = block[t[5]];
- c[6] = block[t[6]];
- c[7] = block[t[7]];
- }
- }
-
{ /* Dequantize */
+ const __m128i* b = (const __m128i*)block;
const __m128i* m = (const __m128i*)matrix;
__m128i* d = (__m128i*)c0;
- d[0] = _mm_mullo_epi16(d[0], m[0]);
- d[1] = _mm_mullo_epi16(d[1], m[1]);
- d[2] = _mm_mullo_epi16(d[2], m[2]);
- d[3] = _mm_mullo_epi16(d[3], m[3]);
- d[4] = _mm_mullo_epi16(d[4], m[4]);
- d[5] = _mm_mullo_epi16(d[5], m[5]);
- d[6] = _mm_mullo_epi16(d[6], m[6]);
- d[7] = _mm_mullo_epi16(d[7], m[7]);
+ d[0] = _mm_mullo_epi16(b[0], m[0]);
+ d[1] = _mm_mullo_epi16(b[1], m[1]);
+ d[2] = _mm_mullo_epi16(b[2], m[2]);
+ d[3] = _mm_mullo_epi16(b[3], m[3]);
+ d[4] = _mm_mullo_epi16(b[4], m[4]);
+ d[5] = _mm_mullo_epi16(b[5], m[5]);
+ d[6] = _mm_mullo_epi16(b[6], m[6]);
+ d[7] = _mm_mullo_epi16(b[7], m[7]);
}
/* iDCT Row */
{
ALIGN(0x10) INT16 c0[64];
- const __m128i z = _mm_setzero_si128();
-
- _mm_store_si128((__m128i*)(c0 + 0x00), z);
- _mm_store_si128((__m128i*)(c0 + 0x08), z);
- _mm_store_si128((__m128i*)(c0 + 0x10), z);
- _mm_store_si128((__m128i*)(c0 + 0x18), z);
-
- /* Reorder */
- c0[ 0 + 0] = block[TZZ[ 0 + 0]];
- c0[ 0 + 1] = block[TZZ[ 0 + 1]];
- c0[ 0 + 2] = block[TZZ[ 0 + 2]];
- c0[ 0 + 3] = block[TZZ[ 0 + 3]];
-
- c0[ 8 + 0] = block[TZZ[ 8 + 0]];
- c0[ 8 + 1] = block[TZZ[ 8 + 1]];
- c0[ 8 + 2] = block[TZZ[ 8 + 2]];
-
- c0[16 + 0] = block[TZZ[16 + 0]];
- c0[16 + 1] = block[TZZ[16 + 1]];
-
- c0[24 + 0] = block[TZZ[24 + 0]];
-
{ /* Dequantize */
+ const __m64* b = (const __m64*)block; /* read-only view of block; keep the const qualifier in the cast */
const __m64* m = (const __m64*)matrix;
__m64* d = (__m64*)c0;
- d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
- d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
- d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
- d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
+ d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]);
+ d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]);
+ d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]);
+ d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]);
}
/* iDCT Row */
typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
+ALIGN(0x10) static const UINT8 IZZ[64] = { /* IZZ[i] = index in block[] that receives the i-th coefficient in scan order */
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63
+};
+
+/*
+ * Decodes coefficients into block: each decoded value is stored at
+ * block[IZZ[pos]], where pos is its position in scan order.
+ * Returns the scan position reached when the loop exits; this can be
+ * 64 or more when a run overshoots the end of the block.
+ */
static INT32 DecodeCoefficients_SSE2(
FrameDecoder_t* t,
DecodeCoefficientsContext_t* ctx,
INT16* block)
{
- INT16* b = block;
- INT16* e = b + 64;
+ const UINT8* bi = IZZ; /* scan-position cursor; element type matches the UINT8 table */
+ const UINT8* ei = IZZ + 64;
DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
_mm_store_si128((__m128i*)(block + 0x30), z);
_mm_store_si128((__m128i*)(block + 0x38), z);
- while (b < e) {
+ while (bi < ei) {
if (leaf->EOB_Run > 0) {
leaf->EOB_Run -= 1;
break;
leaf->EOB_Run = coeff;
} else {
- b += run;
- if (b >= e) {
- break;
- }
+ bi += run;
+ if (bi >= ei) { /* run overshot the table: stop before reading past IZZ[63] */
+ break;
+ }
- *(b++) = coeff;
+ block[*(bi++)] = coeff;
- leaf = ctx->Leaf + (b - block);
+ leaf = ctx->Leaf + (bi - IZZ);
}
}
}
- return b - block;
+ return bi - IZZ;
}
/* */
Plane_t* r,
DecodeCoefficientsContext_t* ctx)
{
- ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 block[64 + 64];
ALIGN(0x10) INT16 coeff[64];
const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];
Plane_t* r,
DecodeCoefficientsContext_t* ctx)
{
- ALIGN(0x10) INT16 block[64];
+ ALIGN(0x10) INT16 block[64 + 64];
ALIGN(0x10) INT16 coeff[64];
const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];