/* */
-static void Block_Extract8x8(
+static void Block_Extract8x8_MMX(
const Plane_t* plane,
INT32 x,
INT32 y,
- UINT8* block)
+ UINT8* block,
+ INT32 pitch)
+{ /* Copy the 8x8 block whose top-left corner is (x, y) in `plane` to
+   * `block`, writing one 8-byte row every `pitch` bytes. (x, y) may lie
+   * partly or fully outside the plane; the missing pixels are produced by
+   * repeating the edge pixels of the clamped 8x8 source block.
+   *
+   * Method: the clamped source block and up to three 8x8 fill quadrants are
+   * laid out in a 16x16 scratch area (row stride 16), then an 8x8 window
+   * shifted by the clamped overshoot is copied out.
+   *
+   * NOTE(review): __m64 values are used but _mm_empty() is never called
+   * here; presumably a caller resets the MMX state - confirm.
+   * NOTE(review): the clamping presumes plane->CX >= 8 and
+   * plane->CY >= 8 - confirm against callers. */
+ ALIGN(0x10) UINT8 r[64 * 4]; /* scratch: 16 rows of 16 bytes = four 8x8 quadrants */
+
+ INT32 xx = (x < 0) ? 0 : ((x + 8 >= plane->CX) ? plane->CX - 8 : x); /* source column clamped into 0 .. CX - 8 */
+ INT32 yy = (y < 0) ? 0 : ((y + 8 >= plane->CY) ? plane->CY - 8 : y); /* source row clamped into 0 .. CY - 8 */
+
+ const UINT8* ss = plane->Plane + yy * plane->Pitch + xx; /* top-left pixel of the clamped source block */
+
+ INT32 xf = ((x < 0) << 1) | (x + 8 >= plane->CX); /* 2: x < 0, 1: x + 8 >= CX, 0: neither */
+ INT32 yf = ((y < 0) << 1) | (y + 8 >= plane->CY); /* 2: y < 0, 1: y + 8 >= CY, 0: neither */
+
+ INT32 xy = (xf << 2) | yf; /* xf in bits 3..2, yf in bits 1..0 */
+
+ UINT8* rr = r; /* quadrant of r that receives the clamped source block */
+
+ { /* corner quadrant: only needed when both axes are flagged */
+ const UINT8* s = NULL;
+ UINT8* d = NULL;
+
+ switch (xy) {
+ case 10: /* 10 10: x < 0 and y < 0 -> top-left source pixel, quadrant rows 0-7 / cols 0-7 */
+ s = ss;
+ d = r;
+ break;
+
+ case 6: /* 01 10: x + 8 >= CX and y < 0 -> top-right source pixel, quadrant rows 0-7 / cols 8-15 */
+ s = ss + 7;
+ d = r + 8;
+ break;
+
+ case 9: /* 10 01: x < 0 and y + 8 >= CY -> bottom-left source pixel, quadrant rows 8-15 / cols 0-7 */
+ s = ss + 7 * plane->Pitch;
+ d = r + 8 * 16;
+ break;
+
+ case 5: /* 01 01: x + 8 >= CX and y + 8 >= CY -> bottom-right source pixel, quadrant rows 8-15 / cols 8-15 */
+ s = ss + 7 * plane->Pitch + 7;
+ d = r + 8 * 16 + 8;
+ break;
+ }
+
+ if (d != NULL) { /* fill the whole corner quadrant with that single pixel */
+ __m64 pix = _mm_set1_pi8(s[0]);
+ *((__m64*)(d + 16 * 0)) = pix;
+ *((__m64*)(d + 16 * 1)) = pix;
+ *((__m64*)(d + 16 * 2)) = pix;
+ *((__m64*)(d + 16 * 3)) = pix;
+ *((__m64*)(d + 16 * 4)) = pix;
+ *((__m64*)(d + 16 * 5)) = pix;
+ *((__m64*)(d + 16 * 6)) = pix;
+ *((__m64*)(d + 16 * 7)) = pix;
+ }
+ }
+
+ { /* edge quadrants: sx/dx spread a source column sideways, sy/dy repeat a source row vertically */
+ const UINT8* sx = NULL;
+ UINT8* dx = r;
+
+ const UINT8* sy = NULL;
+ UINT8* dy = r;
+
+ if (xf == 2) { /* x < 0: fill goes in cols 0-7, so source block and row fill move to cols 8-15 */
+ sx = ss;
+ dy += 8;
+ rr += 8;
+ } else if (xf == 1) { /* x + 8 >= CX: use the last source column, fill goes in cols 8-15 */
+ sx = ss + 7;
+ dx += 8;
+ }
+
+ if (yf == 2) { /* y < 0: fill goes in rows 0-7, so source block and column fill move to rows 8-15 (64 * 2 = 8 rows of 16) */
+ sy = ss;
+ dx += 64 * 2;
+ rr += 64 * 2;
+ } else if (yf == 1) { /* y + 8 >= CY: use the last source row, fill goes in rows 8-15 */
+ sy = ss + 7 * plane->Pitch;
+ dy += 64 * 2;
+ }
+
+ if (sx != NULL) { /* each scratch row gets its source row's edge pixel repeated 8 times */
+ *((__m64*)(dx + 16 * 0)) = _mm_set1_pi8(sx[0 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 1)) = _mm_set1_pi8(sx[1 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 2)) = _mm_set1_pi8(sx[2 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 3)) = _mm_set1_pi8(sx[3 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 4)) = _mm_set1_pi8(sx[4 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 5)) = _mm_set1_pi8(sx[5 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 6)) = _mm_set1_pi8(sx[6 * plane->Pitch]);
+ *((__m64*)(dx + 16 * 7)) = _mm_set1_pi8(sx[7 * plane->Pitch]);
+ }
+
+ if (sy != NULL) { /* the 8-byte edge row is copied into all 8 scratch rows */
+ __m64 pix = *((const __m64*)sy);
+ *((__m64*)(dy + 16 * 0)) = pix;
+ *((__m64*)(dy + 16 * 1)) = pix;
+ *((__m64*)(dy + 16 * 2)) = pix;
+ *((__m64*)(dy + 16 * 3)) = pix;
+ *((__m64*)(dy + 16 * 4)) = pix;
+ *((__m64*)(dy + 16 * 5)) = pix;
+ *((__m64*)(dy + 16 * 6)) = pix;
+ *((__m64*)(dy + 16 * 7)) = pix;
+ }
+ }
+
+ *((__m64*)(rr + 16 * 0)) = *((const __m64*)(ss + 0 * plane->Pitch)); /* copy the clamped 8x8 source block into its quadrant */
+ *((__m64*)(rr + 16 * 1)) = *((const __m64*)(ss + 1 * plane->Pitch));
+ *((__m64*)(rr + 16 * 2)) = *((const __m64*)(ss + 2 * plane->Pitch));
+ *((__m64*)(rr + 16 * 3)) = *((const __m64*)(ss + 3 * plane->Pitch));
+ *((__m64*)(rr + 16 * 4)) = *((const __m64*)(ss + 4 * plane->Pitch));
+ *((__m64*)(rr + 16 * 5)) = *((const __m64*)(ss + 5 * plane->Pitch));
+ *((__m64*)(rr + 16 * 6)) = *((const __m64*)(ss + 6 * plane->Pitch));
+ *((__m64*)(rr + 16 * 7)) = *((const __m64*)(ss + 7 * plane->Pitch));
+
+ if (x < 0) { /* move the read window left by the overshoot, limited to 8 columns */
+ if (x <= -8) x = -8;
+ rr += x;
+ } else if (x > plane->CX - 8) { /* move the read window right by the overshoot, limited to 8 columns */
+ x -= plane->CX - 8;
+ if (x >= 8) x = 8;
+ rr += x;
+ }
+
+ if (y < 0) { /* move the read window up by the overshoot, limited to 8 rows of 16 bytes */
+ if (y <= -8) y = -8;
+ rr += y * 16;
+ } else if (y > plane->CY - 8) { /* move the read window down by the overshoot, limited to 8 rows */
+ y -= plane->CY - 8;
+ if (y >= 8) y = 8;
+ rr += y * 16;
+ }
+
+ *((__m64*)(block + 0 * pitch)) = *((const __m64*)(rr + 16 * 0)); /* write the selected 8x8 window to the caller's block */
+ *((__m64*)(block + 1 * pitch)) = *((const __m64*)(rr + 16 * 1));
+ *((__m64*)(block + 2 * pitch)) = *((const __m64*)(rr + 16 * 2));
+ *((__m64*)(block + 3 * pitch)) = *((const __m64*)(rr + 16 * 3));
+ *((__m64*)(block + 4 * pitch)) = *((const __m64*)(rr + 16 * 4));
+ *((__m64*)(block + 5 * pitch)) = *((const __m64*)(rr + 16 * 5));
+ *((__m64*)(block + 6 * pitch)) = *((const __m64*)(rr + 16 * 6));
+ *((__m64*)(block + 7 * pitch)) = *((const __m64*)(rr + 16 * 7));
+}
+
+/* */
+
+#if 0
+static void Block_Extract8x8_C(
+ const Plane_t* plane,
+ INT32 x,
+ INT32 y,
+ UINT8* block,
+ INT32 pitch)
{
INT32 i, j;
yy = plane->CY - 1;
}
- block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
+ block[i * pitch + j] = plane->Plane[yy * plane->Pitch + xx];
}
}
}
+static void Block_Extract8x8(
+ const Plane_t* plane,
+ INT32 x,
+ INT32 y,
+ UINT8* block,
+ INT32 pitch)
+{ /* Debug cross-check: extracts the same 8x8 block with both the MMX and
+   * the C implementation, traps if the two results differ, then produces
+   * the caller's output with the C implementation. */
+ ALIGN(0x10) UINT8 b0[64], b1[64]; /* tightly packed 8x8 results (row stride 8) */
+
+ Block_Extract8x8_MMX(plane, x, y, b0, 8); /* b0: MMX result */
+ Block_Extract8x8_C (plane, x, y, b1, 8); /* b1: C result */
+
+ if (memcmp(b0, b1, 64) != 0) { /* mismatch: raise the x86 breakpoint interrupt (int 3) */
+ __asm int 3;
+ }
+
+ Block_Extract8x8_C(plane, x, y, block, pitch); /* the caller's block is filled by the C version, not the MMX one */
+}
+#endif
+
/* */
void MotionComp_Compensate16x16_SSE2(
INT32 x,
INT32 y)
{
- ALIGN(0x10) UINT8 b[64];
-
- const UINT8* s = r->Plane + y * r->Pitch + x;
- INT32 p0 = r->Pitch;
- UINT8* d = p;
-
- __m64 s0, s1, s2, s3, s4, s5, s6, s7;
-
if (x < 0 || x + 8 >= r->CX ||
y < 0 || y + 8 >= r->CY) {
- s = b;
- p0 = 8;
- Block_Extract8x8(r, x, y, b);
- }
+ Block_Extract8x8_MMX(r, x, y, p, pitch);
- s0 = *((const __m64*)s); s += p0;
- s1 = *((const __m64*)s); s += p0;
- s2 = *((const __m64*)s); s += p0;
- s3 = *((const __m64*)s); s += p0;
- s4 = *((const __m64*)s); s += p0;
- s5 = *((const __m64*)s); s += p0;
- s6 = *((const __m64*)s); s += p0;
- s7 = *((const __m64*)s);
-
- *((__m64*)d) = s0; d += pitch;
- *((__m64*)d) = s1; d += pitch;
- *((__m64*)d) = s2; d += pitch;
- *((__m64*)d) = s3; d += pitch;
- *((__m64*)d) = s4; d += pitch;
- *((__m64*)d) = s5; d += pitch;
- *((__m64*)d) = s6; d += pitch;
- *((__m64*)d) = s7;
+ } else {
+ const UINT8* s = r->Plane + y * r->Pitch + x;
+ INT32 p0 = r->Pitch;
+ UINT8* d = p;
+
+ __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+ s0 = *((const __m64*)s); s += p0;
+ s1 = *((const __m64*)s); s += p0;
+ s2 = *((const __m64*)s); s += p0;
+ s3 = *((const __m64*)s); s += p0;
+ s4 = *((const __m64*)s); s += p0;
+ s5 = *((const __m64*)s); s += p0;
+ s6 = *((const __m64*)s); s += p0;
+ s7 = *((const __m64*)s);
+
+ *((__m64*)d) = s0; d += pitch;
+ *((__m64*)d) = s1; d += pitch;
+ *((__m64*)d) = s2; d += pitch;
+ *((__m64*)d) = s3; d += pitch;
+ *((__m64*)d) = s4; d += pitch;
+ *((__m64*)d) = s5; d += pitch;
+ *((__m64*)d) = s6; d += pitch;
+ *((__m64*)d) = s7;
+ }
}
/* */
s1 = b1;
p1 = 8;
- Block_Extract8x8(r, x0, y0, b0);
- Block_Extract8x8(r, x1, y1, b1);
+ Block_Extract8x8_MMX(r, x0, y0, b0, 8);
+ Block_Extract8x8_MMX(r, x1, y1, b1, 8);
}
S0 = *((const __m64*)s0); s0 += p0;