From: Christophe Gisquet
Date: Sun, 25 Jan 2015 21:13:10 +0000 (-0300)
Subject: x86/sbrdsp: Use different mem moves
X-Git-Tag: android-x86-6.0-r1~774
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=7aeafacfd0da7c373ecdd5d56338be2d1051e338;p=android-x86%2Fexternal-ffmpeg.git

x86/sbrdsp: Use different mem moves

Before
2843 decicycles in ff_sbr_autocorrelate_sse3, 262086 runs, 58 skips
After
2693 decicycles in ff_sbr_autocorrelate_sse3, 262117 runs, 27 skips

Signed-off-by: James Almer
---

diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index a8ec7ed408..083461a107 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -454,13 +454,15 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
     neg cntq
 
 %if cpuflag(sse3)
+%define MOVH movsd
     movddup m5, [xq+cntq]
 %else
+%define MOVH movlps
     movlps m5, [xq+cntq]
     movlhps m5, m5
 %endif
-    movlps m7, [xq+cntq+8 ]
-    movlps m1, [xq+cntq+16]
+    MOVH m7, [xq+cntq+8 ]
+    MOVH m1, [xq+cntq+16]
     shufps m7, m7, q0110
     shufps m1, m1, q0110
     mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
@@ -470,7 +472,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
     movaps [rsp+16], m4
     add cntq, 8
 
-    movlps m2, [xq+cntq+16]
+    MOVH m2, [xq+cntq+16]
     movlhps m7, m7
     shufps m2, m2, q0110
     mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
@@ -481,7 +483,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
 align 16
 .loop:
     add cntq, 8
-    movlps m0, [xq+cntq+16]
+    MOVH m0, [xq+cntq+16]
     movlhps m1, m1
     shufps m0, m0, q0110
     mulps m3, m1, m2
@@ -491,7 +493,7 @@ align 16
     addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
     addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
     add cntq, 8
-    movlps m1, [xq+cntq+16]
+    MOVH m1, [xq+cntq+16]
     movlhps m2, m2
     shufps m1, m1, q0110
     mulps m3, m2, m0
@@ -501,7 +503,7 @@ align 16
     addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
     addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
     add cntq, 8
-    movlps m2, [xq+cntq+16]
+    MOVH m2, [xq+cntq+16]
     movlhps m0, m0
     shufps m2, m2, q0110
     mulps m3, m0, m1