ff_h264_idct8_add_sse2.

author Loren Merritt <lorenm@u.washington.edu>

Sun, 3 Feb 2008 07:05:11 +0000 (07:05 +0000)

committer Loren Merritt <lorenm@u.washington.edu>

Sun, 3 Feb 2008 07:05:11 +0000 (07:05 +0000)
author Loren Merritt <lorenm@u.washington.edu>
Sun, 3 Feb 2008 07:05:11 +0000 (07:05 +0000)
committer Loren Merritt <lorenm@u.washington.edu>
Sun, 3 Feb 2008 07:05:11 +0000 (07:05 +0000)
diff --git a/libavcodec/h264.h b/libavcodec/h264.h

index 6d0486b..f45b3a6 100644 (file)
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -348,7 +348,7 @@ typedef struct H264Context{
      GetBitContext *intra_gb_ptr;
      GetBitContext *inter_gb_ptr;
  
-    DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
+    DECLARE_ALIGNED_16(DCTELEM, mb[16*24]);
      DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not to large or ensure that there is some unused stuff after mb
  
      /**
diff --git a/libavcodec/i386/dsputil_h264_template_mmx.c b/libavcodec/i386/dsputil_h264_template_mmx.c

index a66f51e..79ad562 100644 (file)
--- a/libavcodec/i386/dsputil_h264_template_mmx.c
+++ b/libavcodec/i386/dsputil_h264_template_mmx.c
@@ -98,7 +98,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
      }
  
      /* general case, bilinear */
-    rnd_reg = rnd ? &ff_pw_32 : &ff_pw_28;
+    rnd_reg = rnd ? ff_pw_32 : &ff_pw_28;
      asm volatile("movd %2, %%mm4\n\t"
                   "movd %3, %%mm6\n\t"
                   "punpcklwd %%mm4, %%mm4\n\t"
@@ -250,7 +250,7 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*
          "sub $2, %2                 \n\t"
          "jnz 1b                     \n\t"
          : "+r"(dst), "+r"(src), "+r"(h)
-        : "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y)
+        : "r"((long)stride), "m"(*ff_pw_32), "m"(x), "m"(y)
      );
  }
  
@@ -301,7 +301,7 @@ static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*
          "sub $1, %2\n\t"
          "jnz 1b\n\t"
          : "+r" (dst), "+r"(src), "+r"(h)
-        : "m" (ff_pw_32), "r"((long)stride)
+        : "m" (*ff_pw_32), "r"((long)stride)
          : "%esi");
  
  }
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c

index fbe695d..70da94f 100644 (file)
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -54,7 +54,7 @@ DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
  DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
  DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
  DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
+DECLARE_ALIGNED_16(const uint64_t, ff_pw_32[2]) = {0x0020002000200020ULL, 0x0020002000200020ULL};
  DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
  DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
  DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
@@ -3328,6 +3328,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
          c->h264_idct_add= ff_h264_idct_add_mmx;
          c->h264_idct8_dc_add=
          c->h264_idct8_add= ff_h264_idct8_add_mmx;
+        if (mm_flags & MM_SSE2)
+            c->h264_idct8_add= ff_h264_idct8_add_sse2;
  
          if (mm_flags & MM_MMXEXT) {
              c->prefetch = prefetch_mmx2;
diff --git a/libavcodec/i386/dsputil_mmx.h b/libavcodec/i386/dsputil_mmx.h

index a1571fc..7a6e62d 100644 (file)
--- a/libavcodec/i386/dsputil_mmx.h
+++ b/libavcodec/i386/dsputil_mmx.h
@@ -36,7 +36,7 @@ extern const uint64_t ff_pw_8;
  extern const uint64_t ff_pw_15;
  extern const uint64_t ff_pw_16;
  extern const uint64_t ff_pw_20;
-extern const uint64_t ff_pw_32;
+extern const uint64_t ff_pw_32[2];
  extern const uint64_t ff_pw_42;
  extern const uint64_t ff_pw_64;
  extern const uint64_t ff_pw_96;
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c

index 8d96ba7..e210d4b 100644 (file)
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -75,7 +75,7 @@ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
          IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
  
          "pxor %%mm7, %%mm7    \n\t"
-    :: "m"(ff_pw_32));
+    :: "m"(*ff_pw_32));
  
      asm volatile(
      STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
@@ -211,6 +211,93 @@ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
      add_pixels_clamped_mmx(b2, dst, stride);
  }
  
+#define STORE_DIFF_8P( p, d, t, z )\
+        "movq       "#d", "#t" \n"\
+        "psraw       $6,  "#p" \n"\
+        "punpcklbw  "#z", "#t" \n"\
+        "paddsw     "#t", "#p" \n"\
+        "packuswb   "#p", "#p" \n"\
+        "movq       "#p", "#d" \n"
+
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
+        "movdqa     "#c", "#a" \n"\
+        "movdqa     "#g", "#e" \n"\
+        "psraw       $1,  "#c" \n"\
+        "psraw       $1,  "#g" \n"\
+        "psubw      "#e", "#c" \n"\
+        "paddw      "#a", "#g" \n"\
+        "movdqa     "#b", "#e" \n"\
+        "psraw       $1,  "#e" \n"\
+        "paddw      "#b", "#e" \n"\
+        "paddw      "#d", "#e" \n"\
+        "paddw      "#f", "#e" \n"\
+        "movdqa     "#f", "#a" \n"\
+        "psraw       $1,  "#a" \n"\
+        "paddw      "#f", "#a" \n"\
+        "paddw      "#h", "#a" \n"\
+        "psubw      "#b", "#a" \n"\
+        "psubw      "#d", "#b" \n"\
+        "psubw      "#d", "#f" \n"\
+        "paddw      "#h", "#b" \n"\
+        "psubw      "#h", "#f" \n"\
+        "psraw       $1,  "#d" \n"\
+        "psraw       $1,  "#h" \n"\
+        "psubw      "#d", "#b" \n"\
+        "psubw      "#h", "#f" \n"\
+        "movdqa     "#e", "#d" \n"\
+        "movdqa     "#a", "#h" \n"\
+        "psraw       $2,  "#d" \n"\
+        "psraw       $2,  "#h" \n"\
+        "paddw      "#f", "#d" \n"\
+        "paddw      "#b", "#h" \n"\
+        "psraw       $2,  "#f" \n"\
+        "psraw       $2,  "#b" \n"\
+        "psubw      "#f", "#e" \n"\
+        "psubw      "#a", "#b" \n"\
+        "movdqa 0x00(%1), "#a" \n"\
+        "movdqa 0x40(%1), "#f" \n"\
+        SUMSUB_BA(f, a)\
+        SUMSUB_BA(g, f)\
+        SUMSUB_BA(c, a)\
+        SUMSUB_BA(e, g)\
+        SUMSUB_BA(b, c)\
+        SUMSUB_BA(h, a)\
+        SUMSUB_BA(d, f)
+
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{
+    asm volatile(
+        "movdqa   0x10(%1), %%xmm1 \n"
+        "movdqa   0x20(%1), %%xmm2 \n"
+        "movdqa   0x30(%1), %%xmm3 \n"
+        "movdqa   0x50(%1), %%xmm5 \n"
+        "movdqa   0x60(%1), %%xmm6 \n"
+        "movdqa   0x70(%1), %%xmm7 \n"
+        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+        "paddw          %4, %%xmm4 \n"
+        "movdqa     %%xmm4, 0x00(%1) \n"
+        "movdqa     %%xmm2, 0x40(%1) \n"
+        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+        "movdqa     %%xmm6, 0x60(%1) \n"
+        "movdqa     %%xmm7, 0x70(%1) \n"
+        "pxor       %%xmm7, %%xmm7 \n"
+        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
+        "lea     (%0,%2,4), %0 \n"
+        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
+        "movdqa   0x60(%1), %%xmm0 \n"
+        "movdqa   0x70(%1), %%xmm1 \n"
+        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
+        :"+r"(dst)
+        :"r"(block), "r"((long)stride), "r"(3L*stride), "m"(*ff_pw_32)
+    );
+}
+
  static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
  {
      int dc = (block[0] + 32) >> 6;
@@ -839,7 +926,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
          "decl %2                    \n\t"\
          " jnz 1b                    \n\t"\
          : "+a"(tmp), "+c"(dst), "+m"(h)\
-        : "S"((long)dstStride), "m"(ff_pw_32)\
+        : "S"((long)dstStride), "m"(*ff_pw_32)\
          : "memory"\
      );\
  }\
@@ -1113,7 +1200,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
          "decl %2                    \n\t"\
          " jnz 1b                    \n\t"\
          : "+a"(tmp), "+c"(dst), "+m"(h)\
-        : "S"((long)dstStride), "m"(ff_pw_32)\
+        : "S"((long)dstStride), "m"(*ff_pw_32)\
          : "memory"\
      );\
      tmp += 8 - size*24;\
author	Loren Merritt <lorenm@u.washington.edu>
	Sun, 3 Feb 2008 07:05:11 +0000 (07:05 +0000)
committer	Loren Merritt <lorenm@u.washington.edu>
	Sun, 3 Feb 2008 07:05:11 +0000 (07:05 +0000)
libavcodec/h264.h		patch | blob | history
libavcodec/i386/dsputil_h264_template_mmx.c		patch | blob | history
libavcodec/i386/dsputil_mmx.c		patch | blob | history
libavcodec/i386/dsputil_mmx.h		patch | blob | history
libavcodec/i386/h264dsp_mmx.c		patch | blob | history