From: Ronald S. Bultje
Date: Sun, 22 Sep 2013 02:03:00 +0000 (-0400)
Subject: Full-pixel MC functions.
X-Git-Tag: android-x86-4.4-r1~819^2
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=f1548c008fc5d50488bb7bbc2aa4cf49d89a0bda;p=android-x86%2Fexternal-ffmpeg.git

Full-pixel MC functions.

Decoding time of ped1080p.webm goes from 11.3sec to 11.1sec.
---

diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
index 30db0ca3af..740d67cfa1 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9dsp.asm
@@ -219,3 +219,60 @@ filter_v_fn avg
 INIT_XMM ssse3
 filter_v_fn put
 filter_v_fn avg
+
+%macro fpel_fn 6
+%if %2 == 4
+%define %%srcfn movh
+%define %%dstfn movh
+%else
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+
+%if %2 <= 16
+cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+    lea  sstride3q, [sstrideq*3]
+    lea  dstride3q, [dstrideq*3]
+%else
+cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h
+%endif
+.loop:
+    %%srcfn     m0, [srcq]
+    %%srcfn     m1, [srcq+s%3]
+    %%srcfn     m2, [srcq+s%4]
+    %%srcfn     m3, [srcq+s%5]
+    lea       srcq, [srcq+sstrideq*%6]
+%ifidn %1, avg
+    pavgb       m0, [dstq]
+    pavgb       m1, [dstq+d%3]
+    pavgb       m2, [dstq+d%4]
+    pavgb       m3, [dstq+d%5]
+%endif
+    %%dstfn [dstq], m0
+    %%dstfn [dstq+d%3], m1
+    %%dstfn [dstq+d%4], m2
+    %%dstfn [dstq+d%5], m3
+    lea       dstq, [dstq+dstrideq*%6]
+    sub         hd, %6
+    jnz .loop
+    RET
+%endmacro
+
+%define d16 16
+%define s16 16
+INIT_MMX mmx
+fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
+fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
+INIT_MMX sse
+fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
+fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
+%undef s16
+%undef d16
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index d135cf4bae..cf7a1a4bf4 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -27,6 +27,22 @@
 
 #if HAVE_YASM
 
+#define fpel_func(avg, sz, opt) \
+void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                          const uint8_t *src, ptrdiff_t src_stride, \
+                          int h, int mx, int my)
+fpel_func(put,  4, mmx);
+fpel_func(put,  8, mmx);
+fpel_func(put, 16, sse);
+fpel_func(put, 32, sse);
+fpel_func(put, 64, sse);
+fpel_func(avg,  4, sse);
+fpel_func(avg,  8, sse);
+fpel_func(avg, 16, sse2);
+fpel_func(avg, 32, sse2);
+fpel_func(avg, 64, sse2);
+#undef fpel_func
+
 #define mc_func(avg, sz, dir, opt) \
 void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                              const uint8_t *src, ptrdiff_t src_stride, \
@@ -141,6 +157,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 #if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
+#define init_fpel(idx1, idx2, sz, type, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_##opt
+
+
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
@@ -158,11 +181,31 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
     init_subpel2(idx, 0, 1, v, type, opt); \
     init_subpel2(idx, 1, 0, h, type, opt)
 
+    if (cpu_flags & AV_CPU_FLAG_MMX) {
+        init_fpel(4, 0,  4, put, mmx);
+        init_fpel(3, 0,  8, put, mmx);
+    }
+
+    if (cpu_flags & AV_CPU_FLAG_SSE) {
+        init_fpel(2, 0, 16, put, sse);
+        init_fpel(1, 0, 32, put, sse);
+        init_fpel(0, 0, 64, put, sse);
+        init_fpel(4, 1,  4, avg, sse);
+        init_fpel(3, 1,  8, avg, sse);
+    }
+
+    if (cpu_flags & AV_CPU_FLAG_SSE2) {
+        init_fpel(2, 1, 16, avg, sse2);
+        init_fpel(1, 1, 32, avg, sse2);
+        init_fpel(0, 1, 64, avg, sse2);
+    }
+
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         init_subpel3(0, put, ssse3);
         init_subpel3(1, avg, ssse3);
     }
 
+#undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
 #undef init_subpel3