DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
+DECLARE_ALIGNED_16(const uint64_t, ff_pw_32[2]) = {0x0020002000200020ULL, 0x0020002000200020ULL}; /* eight 16-bit words of 32; widened to 16 bytes so it can be the memory operand of a 128-bit SSE2 paddw, while 64-bit MMX users read the low half through *ff_pw_32 */
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
c->h264_idct_add= ff_h264_idct_add_mmx;
c->h264_idct8_dc_add=
c->h264_idct8_add= ff_h264_idct8_add_mmx;
+ if (mm_flags & MM_SSE2)
+ c->h264_idct8_add= ff_h264_idct8_add_sse2;
if (mm_flags & MM_MMXEXT) {
c->prefetch = prefetch_mmx2;
IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
"pxor %%mm7, %%mm7 \n\t"
- :: "m"(ff_pw_32));
+ :: "m"(*ff_pw_32));
asm volatile(
STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
add_pixels_clamped_mmx(b2, dst, stride);
}
+#define STORE_DIFF_8P( p, d, t, z ) /* add the 8 residual words in p (after >>6) to the 8 bytes at memory operand d and store the clamped bytes back; t is a scratch register, z is the interleave source (the visible caller passes a zeroed register) */\
+ "movq "#d", "#t" \n" /* t = 8 bytes loaded from d */\
+ "psraw $6, "#p" \n" /* arithmetic shift right by 6 on every word of p */\
+ "punpcklbw "#z", "#t" \n" /* interleave low bytes of t with z: bytes become words (zero-extended when z is 0) */\
+ "paddsw "#t", "#p" \n" /* p += t, signed-saturating word add */\
+ "packuswb "#p", "#p" \n" /* pack words to bytes with unsigned saturation */\
+ "movq "#p", "#d" \n" /* write the 8 result bytes back to d */
+
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h) /* one 1-D transform pass over eight vectors of 16-bit words. Register inputs are b,c,d,f,g,h (written b0,c0,... below for their values on entry); a and e are overwritten as scratch, and the two remaining inputs are loaded from 0x00(%1) and 0x40(%1) near the end. The visible caller consumes the results in the register order e,b,h,d,f,a,c,g. */\
+ "movdqa "#c", "#a" \n" /* a = c0 (saved copy) */\
+ "movdqa "#g", "#e" \n" /* e = g0 (saved copy) */\
+ "psraw $1, "#c" \n"\
+ "psraw $1, "#g" \n"\
+ "psubw "#e", "#c" \n" /* c = (c0>>1) - g0 */\
+ "paddw "#a", "#g" \n" /* g = (g0>>1) + c0 */\
+ "movdqa "#b", "#e" \n"\
+ "psraw $1, "#e" \n"\
+ "paddw "#b", "#e" \n"\
+ "paddw "#d", "#e" \n"\
+ "paddw "#f", "#e" \n" /* e = b0 + (b0>>1) + d0 + f0 */\
+ "movdqa "#f", "#a" \n"\
+ "psraw $1, "#a" \n"\
+ "paddw "#f", "#a" \n"\
+ "paddw "#h", "#a" \n"\
+ "psubw "#b", "#a" \n" /* a = f0 + (f0>>1) + h0 - b0 */\
+ "psubw "#d", "#b" \n"\
+ "psubw "#d", "#f" \n"\
+ "paddw "#h", "#b" \n"\
+ "psubw "#h", "#f" \n"\
+ "psraw $1, "#d" \n"\
+ "psraw $1, "#h" \n"\
+ "psubw "#d", "#b" \n" /* b = b0 - d0 + h0 - (d0>>1) */\
+ "psubw "#h", "#f" \n" /* f = f0 - d0 - h0 - (h0>>1) */\
+ "movdqa "#e", "#d" \n" /* from here e,a,b,f denote the four values computed above */\
+ "movdqa "#a", "#h" \n"\
+ "psraw $2, "#d" \n"\
+ "psraw $2, "#h" \n"\
+ "paddw "#f", "#d" \n" /* d = (e>>2) + f */\
+ "paddw "#b", "#h" \n" /* h = (a>>2) + b */\
+ "psraw $2, "#f" \n"\
+ "psraw $2, "#b" \n"\
+ "psubw "#f", "#e" \n" /* e = e - (f>>2) */\
+ "psubw "#a", "#b" \n" /* b = (b>>2) - a */\
+ "movdqa 0x00(%1), "#a" \n" /* load the first memory-resident input (offset 0x00 from operand %1) */\
+ "movdqa 0x40(%1), "#f" \n" /* load the second memory-resident input (offset 0x40 from operand %1) */\
+ SUMSUB_BA(f, a) /* SUMSUB_BA is defined outside this view; presumably a sum/difference butterfly on its two registers -- confirm */\
+ SUMSUB_BA(g, f)\
+ SUMSUB_BA(c, a)\
+ SUMSUB_BA(e, g)\
+ SUMSUB_BA(b, c)\
+ SUMSUB_BA(h, a)\
+ SUMSUB_BA(d, f)
+
+/**
+ * Apply two 1-D transform passes (with a transpose in between) to an 8x8
+ * block of 16-bit coefficients using SSE2, then add the result, shifted
+ * right by 6, to an 8x8 area of 8-bit pixels with unsigned saturation.
+ *
+ * @param dst    top-left pixel of the destination; 8 rows of 8 bytes each are
+ *               read, updated and written back
+ * @param block  64 coefficients (8 rows of 16 bytes). Accessed with movdqa, so
+ *               it must be 16-byte aligned. It is also used as scratch space
+ *               and is overwritten.
+ * @param stride distance in bytes between consecutive rows of dst
+ *
+ * The asm statement loads and stores through dst and block, which are only
+ * passed as register operands, so it declares a "memory" clobber; without
+ * it the compiler may cache or reorder accesses to those buffers across the
+ * statement.
+ * NOTE(review): xmm0-xmm7 are modified but not listed as clobbers; confirm
+ * that this matches the project's convention for SIMD inline asm.
+ */
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{
+ asm volatile(
+ /* rows at 0x00 and 0x40 are not loaded here: H264_IDCT8_1D_SSE2 reads them from memory itself */
+ "movdqa 0x10(%1), %%xmm1 \n"
+ "movdqa 0x20(%1), %%xmm2 \n"
+ "movdqa 0x30(%1), %%xmm3 \n"
+ "movdqa 0x50(%1), %%xmm5 \n"
+ "movdqa 0x60(%1), %%xmm6 \n"
+ "movdqa 0x70(%1), %%xmm7 \n"
+ /* first 1-D pass */
+ H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+ /* TRANSPOSE8 is defined outside this view; presumably an 8x8 word transpose using (%1) as scratch -- confirm */
+ TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+ /* add 32 to every word of xmm4: rounding bias for the >>6 done in STORE_DIFF_8P */
+ "paddw %4, %%xmm4 \n"
+ /* spill the two inputs that the second pass reloads from 0x00(%1) and 0x40(%1) */
+ "movdqa %%xmm4, 0x00(%1) \n"
+ "movdqa %%xmm2, 0x40(%1) \n"
+ /* second 1-D pass */
+ H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+ /* park two result rows in block so xmm6/xmm7 can serve as scratch and zero below */
+ "movdqa %%xmm6, 0x60(%1) \n"
+ "movdqa %%xmm7, 0x70(%1) \n"
+ "pxor %%xmm7, %%xmm7 \n"
+ /* rows 0-3 of dst; %2 = stride, %3 = 3*stride */
+ STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
+ /* advance dst by four rows */
+ "lea (%0,%2,4), %0 \n"
+ /* rows 4-7 of dst; the last two come from the rows parked in block above */
+ STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
+ "movdqa 0x60(%1), %%xmm0 \n"
+ "movdqa 0x70(%1), %%xmm1 \n"
+ STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
+ :"+r"(dst)
+ :"r"(block), "r"((long)stride), "r"(3L*stride), "m"(*ff_pw_32)
+ :"memory"
+ );
+}
+
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
int dc = (block[0] + 32) >> 6;
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\
- : "S"((long)dstStride), "m"(ff_pw_32)\
+ : "S"((long)dstStride), "m"(*ff_pw_32)\
: "memory"\
);\
}\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\
- : "S"((long)dstStride), "m"(ff_pw_32)\
+ : "S"((long)dstStride), "m"(*ff_pw_32)\
: "memory"\
);\
tmp += 8 - size*24;\