OSDN Git Service

optimize Block_Extract8x8_MMX.
author Noumi Akira <noumiakira@users.sourceforge.jp>
Tue, 7 Jul 2009 06:36:55 +0000 (15:36 +0900)
committer Noumi Akira <noumiakira@users.sourceforge.jp>
Tue, 7 Jul 2009 06:36:55 +0000 (15:36 +0900)
Lib/QTheoraEx/MotionComp_SSE2.c
Lib/QTheoraEx/QTheoraEx.vcproj

index 078145f..d2ca154 100644 (file)
 
 /* */
 
-static void Block_Extract8x8(
+static void Block_Extract8x8_MMX(
        const Plane_t* plane,
        INT32          x,
        INT32          y,
-       UINT8*         block)
+       UINT8*         block, /* destination; 8 rows of 8 bytes are stored here */
+       INT32          pitch) /* byte distance between successive rows of block */
+{      /*
+        * Copies the 8x8 byte block whose top-left corner is (x, y) in plane
+        * into block, for coordinates that may lie partly or wholly outside
+        * the range 0 .. CX - 8 / 0 .. CY - 8 (CX/CY are presumably the plane
+        * width/height -- confirm against Plane_t).
+        *
+        * Method: the 8x8 block at the clamped position (xx, yy) is stored in
+        * one 8x8 quadrant of the scratch buffer r, the edge pixels of that
+        * block are replicated into the neighbouring quadrants, and the result
+        * is read back from r through a window displaced by the (clamped)
+        * distance between the requested and the clamped position.
+        *
+        * NOTE(review): __m64 intrinsics are used and _mm_empty() is not called
+        * in this function; presumably a caller takes care of that -- confirm.
+        */
+       ALIGN(0x10) UINT8 r[64 * 4]; /* scratch area, always addressed as 16 rows with a 16-byte stride */
+
+       INT32 xx = (x < 0) ? 0 : ((x + 8 >= plane->CX) ? plane->CX - 8 : x); /* x limited to 0 .. CX - 8 */
+       INT32 yy = (y < 0) ? 0 : ((y + 8 >= plane->CY) ? plane->CY - 8 : y); /* y limited to 0 .. CY - 8 */
+
+       const UINT8* ss = plane->Plane + yy * plane->Pitch + xx; /* top-left pixel of the clamped 8x8 source block */
+
+       INT32 xf = ((x < 0) << 1) | (x + 8 >= plane->CX); /* bit 1: x < 0; bit 0: x + 8 >= CX */
+       INT32 yf = ((y < 0) << 1) | (y + 8 >= plane->CY); /* bit 1: y < 0; bit 0: y + 8 >= CY */
+
+       INT32 xy = (xf << 2) | yf; /* xf in bits 3..2, yf in bits 1..0 */
+
+       UINT8* rr = r; /* position inside r where the clamped source block will be stored */
+
+       { /* corner fill: applies only when a horizontal and a vertical flag are both set */
+               const UINT8* s = NULL; /* corner pixel of the source block */
+               UINT8*       d = NULL; /* quadrant of r diagonally adjacent to the block; NULL = no corner fill */
+
+               switch (xy) {
+               case 10: /* xf = 10b, yf = 10b: x < 0 and y < 0 */
+                       s = ss; /* top-left pixel */
+                       d = r; /* top-left quadrant */
+                       break;
+
+               case  6: /* xf = 01b, yf = 10b: x + 8 >= CX and y < 0 */
+                       s = ss + 7; /* top-right pixel */
+                       d = r + 8; /* top-right quadrant */
+                       break;
+
+               case  9: /* xf = 10b, yf = 01b: x < 0 and y + 8 >= CY */
+                       s = ss + 7 * plane->Pitch; /* bottom-left pixel */
+                       d = r + 8 * 16; /* bottom-left quadrant (row 8, column 0) */
+                       break;
+
+               case  5: /* xf = 01b, yf = 01b: x + 8 >= CX and y + 8 >= CY */
+                       s = ss + 7 * plane->Pitch + 7; /* bottom-right pixel */
+                       d = r + 8 * 16 + 8; /* bottom-right quadrant (row 8, column 8) */
+                       break;
+               }
+
+               if (d != NULL) { /* broadcast the single pixel s[0] over all 8x8 bytes of the quadrant */
+                       __m64 pix = _mm_set1_pi8(s[0]);
+                       *((__m64*)(d + 16 * 0)) = pix;
+                       *((__m64*)(d + 16 * 1)) = pix;
+                       *((__m64*)(d + 16 * 2)) = pix;
+                       *((__m64*)(d + 16 * 3)) = pix;
+                       *((__m64*)(d + 16 * 4)) = pix;
+                       *((__m64*)(d + 16 * 5)) = pix;
+                       *((__m64*)(d + 16 * 6)) = pix;
+                       *((__m64*)(d + 16 * 7)) = pix;
+               }
+       }
+
+       { /* edge fill: replicate one column and/or one row of the source block, and place rr */
+               const UINT8* sx = NULL; /* source column for horizontal replication; NULL = none */
+               UINT8*       dx = r; /* quadrant receiving the horizontally replicated column */
+
+               const UINT8* sy = NULL; /* source row for vertical replication; NULL = none */
+               UINT8*       dy = r; /* quadrant receiving the vertically replicated row */
+
+               if (xf == 2) { /* x < 0: replicate the left column; block and row fill move to columns 8..15 */
+                       sx = ss;
+                       dy += 8;
+                       rr += 8;
+               } else if (xf == 1) { /* x + 8 >= CX: replicate the right column into columns 8..15 */
+                       sx = ss + 7;
+                       dx += 8;
+               }
+
+               if (yf == 2) { /* y < 0: replicate the top row; block and column fill move down 8 rows (128 bytes) */
+                       sy = ss;
+                       dx += 64 * 2;
+                       rr += 64 * 2;
+               } else if (yf == 1) { /* y + 8 >= CY: replicate the bottom row into rows 8..15 */
+                       sy = ss + 7 * plane->Pitch;
+                       dy += 64 * 2;
+               }
+
+               if (sx != NULL) { /* for each of the 8 rows, broadcast that row's edge pixel across 8 bytes */
+                       *((__m64*)(dx + 16 * 0)) = _mm_set1_pi8(sx[0 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 1)) = _mm_set1_pi8(sx[1 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 2)) = _mm_set1_pi8(sx[2 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 3)) = _mm_set1_pi8(sx[3 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 4)) = _mm_set1_pi8(sx[4 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 5)) = _mm_set1_pi8(sx[5 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 6)) = _mm_set1_pi8(sx[6 * plane->Pitch]);
+                       *((__m64*)(dx + 16 * 7)) = _mm_set1_pi8(sx[7 * plane->Pitch]);
+               }
+
+               if (sy != NULL) { /* copy the 8 bytes of the edge row into 8 consecutive rows */
+                       __m64 pix = *((const __m64*)sy);
+                       *((__m64*)(dy + 16 * 0)) = pix;
+                       *((__m64*)(dy + 16 * 1)) = pix;
+                       *((__m64*)(dy + 16 * 2)) = pix;
+                       *((__m64*)(dy + 16 * 3)) = pix;
+                       *((__m64*)(dy + 16 * 4)) = pix;
+                       *((__m64*)(dy + 16 * 5)) = pix;
+                       *((__m64*)(dy + 16 * 6)) = pix;
+                       *((__m64*)(dy + 16 * 7)) = pix;
+               }
+       }
+
+       *((__m64*)(rr + 16 * 0)) = *((const __m64*)(ss + 0 * plane->Pitch)); /* store the clamped 8x8 source block at rr, row by row */
+       *((__m64*)(rr + 16 * 1)) = *((const __m64*)(ss + 1 * plane->Pitch));
+       *((__m64*)(rr + 16 * 2)) = *((const __m64*)(ss + 2 * plane->Pitch));
+       *((__m64*)(rr + 16 * 3)) = *((const __m64*)(ss + 3 * plane->Pitch));
+       *((__m64*)(rr + 16 * 4)) = *((const __m64*)(ss + 4 * plane->Pitch));
+       *((__m64*)(rr + 16 * 5)) = *((const __m64*)(ss + 5 * plane->Pitch));
+       *((__m64*)(rr + 16 * 6)) = *((const __m64*)(ss + 6 * plane->Pitch));
+       *((__m64*)(rr + 16 * 7)) = *((const __m64*)(ss + 7 * plane->Pitch));
+
+       if (x < 0) { /* move the read window left by -x bytes, at most 8 */
+               if (x <= -8) x = -8;
+               rr += x;
+       } else if (x > plane->CX - 8) { /* move the read window right by the overshoot past CX - 8, at most 8 */
+               x -= plane->CX - 8;
+               if (x >= 8) x = 8;
+               rr += x;
+       }
+
+       if (y < 0) { /* move the read window up by -y rows of 16 bytes, at most 8 rows */
+               if (y <= -8) y = -8;
+               rr += y * 16;
+       } else if (y > plane->CY - 8) { /* move the read window down by the overshoot past CY - 8, at most 8 rows */
+               y -= plane->CY - 8;
+               if (y >= 8) y = 8;
+               rr += y * 16;
+       }
+
+       *((__m64*)(block + 0 * pitch)) = *((const __m64*)(rr + 16 * 0)); /* write the 8 window rows to block, pitch bytes apart */
+       *((__m64*)(block + 1 * pitch)) = *((const __m64*)(rr + 16 * 1));
+       *((__m64*)(block + 2 * pitch)) = *((const __m64*)(rr + 16 * 2));
+       *((__m64*)(block + 3 * pitch)) = *((const __m64*)(rr + 16 * 3));
+       *((__m64*)(block + 4 * pitch)) = *((const __m64*)(rr + 16 * 4));
+       *((__m64*)(block + 5 * pitch)) = *((const __m64*)(rr + 16 * 5));
+       *((__m64*)(block + 6 * pitch)) = *((const __m64*)(rr + 16 * 6));
+       *((__m64*)(block + 7 * pitch)) = *((const __m64*)(rr + 16 * 7));
+}
+
+/* */
+
+#if 0
+static void Block_Extract8x8_C(
+       const Plane_t* plane,
+       INT32          x,
+       INT32          y,
+       UINT8*         block,
+       INT32          pitch)
 {
        INT32 i, j;
 
@@ -36,11 +186,31 @@ static void Block_Extract8x8(
                                yy = plane->CY - 1;
                        }
 
-                       block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
+                       block[i * pitch + j] = plane->Plane[yy * plane->Pitch + xx];
                }
        }
 }
 
+static void Block_Extract8x8( /* debug cross-check of the MMX and C extractors; sits between "#if 0" and "#endif", so it is not compiled */
+       const Plane_t* plane,
+       INT32          x,
+       INT32          y,
+       UINT8*         block, /* destination, filled by the C version only */
+       INT32          pitch) /* row stride of block, passed through to the C version */
+{
+       ALIGN(0x10) UINT8 b0[64], b1[64]; /* b0: MMX result, b1: C result, both stored with a row stride of 8 */
+
+       Block_Extract8x8_MMX(plane, x, y, b0, 8);
+       Block_Extract8x8_C  (plane, x, y, b1, 8);
+
+       if (memcmp(b0, b1, 64) != 0) { /* the two implementations disagree: execute the "int 3" instruction below */
+               __asm int 3;
+       }
+
+       Block_Extract8x8_C(plane, x, y, block, pitch); /* the caller-visible output always comes from the C version */
+}
+#endif
+
 /* */
 
 void MotionComp_Compensate16x16_SSE2(
@@ -108,38 +278,35 @@ void MotionComp_Compensate8x8_SSE2(
        INT32          x,
        INT32          y)
 {
-       ALIGN(0x10) UINT8 b[64];
-
-       const UINT8* s  = r->Plane + y * r->Pitch + x;
-       INT32        p0 = r->Pitch;
-       UINT8*       d  = p;
-
-       __m64 s0, s1, s2, s3, s4, s5, s6, s7;
-
        if (x < 0 || x + 8 >= r->CX ||
                y < 0 || y + 8 >= r->CY) {
-               s  = b;
-               p0 = 8;
-               Block_Extract8x8(r, x, y, b);
-       }
+               Block_Extract8x8_MMX(r, x, y, p, pitch);
 
-       s0 = *((const __m64*)s); s += p0;
-       s1 = *((const __m64*)s); s += p0;
-       s2 = *((const __m64*)s); s += p0;
-       s3 = *((const __m64*)s); s += p0;
-       s4 = *((const __m64*)s); s += p0;
-       s5 = *((const __m64*)s); s += p0;
-       s6 = *((const __m64*)s); s += p0;
-       s7 = *((const __m64*)s);
-
-       *((__m64*)d) = s0; d += pitch;
-       *((__m64*)d) = s1; d += pitch;
-       *((__m64*)d) = s2; d += pitch;
-       *((__m64*)d) = s3; d += pitch;
-       *((__m64*)d) = s4; d += pitch;
-       *((__m64*)d) = s5; d += pitch;
-       *((__m64*)d) = s6; d += pitch;
-       *((__m64*)d) = s7;
+       } else {
+               const UINT8* s  = r->Plane + y * r->Pitch + x;
+               INT32        p0 = r->Pitch;
+               UINT8*       d  = p;
+
+               __m64 s0, s1, s2, s3, s4, s5, s6, s7;
+
+               s0 = *((const __m64*)s); s += p0;
+               s1 = *((const __m64*)s); s += p0;
+               s2 = *((const __m64*)s); s += p0;
+               s3 = *((const __m64*)s); s += p0;
+               s4 = *((const __m64*)s); s += p0;
+               s5 = *((const __m64*)s); s += p0;
+               s6 = *((const __m64*)s); s += p0;
+               s7 = *((const __m64*)s);
+
+               *((__m64*)d) = s0; d += pitch;
+               *((__m64*)d) = s1; d += pitch;
+               *((__m64*)d) = s2; d += pitch;
+               *((__m64*)d) = s3; d += pitch;
+               *((__m64*)d) = s4; d += pitch;
+               *((__m64*)d) = s5; d += pitch;
+               *((__m64*)d) = s6; d += pitch;
+               *((__m64*)d) = s7;
+       }
 }
 
 /* */
@@ -287,8 +454,8 @@ void MotionComp_Compensate8x8H_SSE2(
                s1 = b1;
                p1 = 8;
 
-               Block_Extract8x8(r, x0, y0, b0);
-               Block_Extract8x8(r, x1, y1, b1);
+               Block_Extract8x8_MMX(r, x0, y0, b0, 8);
+               Block_Extract8x8_MMX(r, x1, y1, b1, 8);
        }
 
        S0 = *((const __m64*)s0); s0 += p0;
index 400c4b4..cca8e3b 100644 (file)
                        <File
                                RelativePath=".\MotionComp_SSE2.c"
                                >
+                               <FileConfiguration
+                                       Name="Debug|Win32"
+                                       >
+                                       <Tool
+                                               Name="VCCLCompilerTool"
+                                               AssemblerOutput="2"
+                                       />
+                               </FileConfiguration>
+                               <FileConfiguration
+                                       Name="Release|Win32"
+                                       >
+                                       <Tool
+                                               Name="VCCLCompilerTool"
+                                               AssemblerOutput="2"
+                                       />
+                               </FileConfiguration>
                        </File>
                        <File
                                RelativePath=".\SetupDecoder.c"