From ba6836c966debc56314ce2ef133c7f0c1fdfdeac Mon Sep 17 00:00:00 2001 From: =?utf8?q?Martin=20Storsj=C3=B6?= Date: Fri, 19 Jul 2013 11:03:32 +0300 Subject: [PATCH] arm: Add VFP-accelerated version of dca_lfe_fir MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö --- libavcodec/arm/Makefile | 3 +- libavcodec/arm/dcadsp_init_arm.c | 4 + libavcodec/arm/dcadsp_vfp.S | 220 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 libavcodec/arm/dcadsp_vfp.S diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index e95d94ace8..9bb8795bf7 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -52,7 +52,8 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \ arm/vp8dsp_init_armv6.o \ arm/vp8dsp_armv6.o -VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o +VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \ + arm/synth_filter_vfp.o VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c index f0375c9710..824b909aae 100644 --- a/libavcodec/arm/dcadsp_init_arm.c +++ b/libavcodec/arm/dcadsp_init_arm.c @@ -24,6 +24,8 @@ #include "libavutil/attributes.h" #include "libavcodec/dcadsp.h" +void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs, + int decifactor, float scale); void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, int decifactor, float scale); @@ -31,6 +33,8 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); + if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) + s->lfe_fir = ff_dca_lfe_fir_vfp; if (have_neon(cpu_flags)) s->lfe_fir = ff_dca_lfe_fir_neon; } diff --git a/libavcodec/arm/dcadsp_vfp.S 
b/libavcodec/arm/dcadsp_vfp.S
new file mode 100644
index 0000000000..57e16196f7
--- /dev/null
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT       .req    a1              @ out: destination pointer, advanced by every store of 4 results
+PIN        .req    a2              @ in: samples are read at in[0], in[-1], ... in[-(JMAX-1)]
+PCOEF      .req    a3              @ coefs: moved one block of 4*JMAX floats at a time
+DECIFACTOR .req    a4              @ tested exactly once, at function entry
+OLDFPSCR   .req    a4              @ caller FPSCR, takes over a4 once decifactor has been tested
+COUNTER    .req    ip              @ pass counter (ip first carries the new FPSCR value at entry)
+
+SCALE32    .req    s28             @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64    .req    s0              @ spare register in scalar bank when decifactor=64 / JMAX=4
+IN0        .req    s4              @ in[0]..in[-3], used as the scalar operand of the multiplies
+IN1        .req    s5
+IN2        .req    s6
+IN3        .req    s7
+IN4        .req    s0              @ in[-4]..in[-7], loaded only when decifactor=32 (s0 is SCALE64 otherwise)
+IN5        .req    s1
+IN6        .req    s2
+IN7        .req    s3
+COEF0      .req    s8              @ coefficient elements
+COEF1      .req    s9
+COEF2      .req    s10
+COEF3      .req    s11
+COEF4      .req    s12             @ second coefficient vector, loaded while the first is being consumed
+COEF5      .req    s13
+COEF6      .req    s14
+COEF7      .req    s15
+ACCUM0     .req    s16             @ double-buffered multiply-accumulate results
+ACCUM4     .req    s20             @ second accumulator vector, summed with ACCUM0 in the tail
+POST0      .req    s24             @ do long-latency post-multiply in this vector in parallel
+POST1      .req    s25
+POST2      .req    s26
+POST3      .req    s27
+
+@ inner_loop: one pipelined pass. head starts the JMAX-tap sums for the next 4 outputs, tail scales and stores the previous 4.
+.macro inner_loop  decifactor, dir, tail, head
+ .ifc "\dir","up"
+  .set X, 0                                             @ up: index 0 is the first float of the block
+  .set Y, 4                                             @ and higher indices lie at higher addresses
+ .else
+  .set X, 4*JMAX*4 - 4                                  @ down: index 0 is the last float of the 4*JMAX block
+  .set Y, -4                                            @ and higher indices lie at lower addresses
+ .endif
+ .ifnc "\head",""
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]   @ tap 0 of each of the 4 outputs (rows are JMAX floats apart)
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation: sum both accumulators of the previous pass before head overwrites them
+ .endif
+ .ifnc "\head",""
+        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]   @ tap 1
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+        vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+   .ifc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1      @ vector = vector * scalar (placement used when there is no tail)
+   .endif
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]   @ tap 2
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+   .ifnc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1      @ vector = vector * scalar (same multiply, placed later when a tail is interleaved)
+   .endif
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+        vstmia  POUT!, {POST0-POST3}    @ store 4 finished outputs and advance POUT by 16 bytes
+ .endif
+ .ifnc "\head",""
+        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]   @ tap 3
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
+  .if \decifactor == 32
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]   @ taps 4-7 exist only in the JMAX=8 variant
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
+  .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir decifactor
+ .if \decifactor == 32
+  .set JMAX, 8                          @ 8 taps per output sample
+        vpush   {s16-s31}               @ preserve the registers used for the ACCUM, POST and SCALE32 vectors
+        vmov    SCALE32, s0             @ duplicate scalar across vector
+        vldr    IN4, [PIN, #-4*4]       @ IN4 is s0, so scale had to be copied out by the vmov first
+        vldr    IN5, [PIN, #-5*4]
+        vldr    IN6, [PIN, #-6*4]
+        vldr    IN7, [PIN, #-7*4]
+ .else
+  .set JMAX, 4                          @ 4 taps per output sample, scale stays in s0
+        vpush   {s16-s27}               @ s28-s31 are not written by this variant
+ .endif
+
+        mov     COUNTER, #\decifactor/4 - 1     @ 4 outputs per pass, the head-only prologue below starts the first pass
+        inner_loop  \decifactor, up,, head
+1:      add     PCOEF, PCOEF, #4*JMAX*4 @ step to the next block of 4*JMAX coefficients
+        subs    COUNTER, COUNTER, #1    @ flags survive until the bne: the macro body holds only VFP instructions
+        inner_loop  \decifactor, up, tail, head
+        bne     1b
+        inner_loop  \decifactor, up, tail
+
+        mov     COUNTER, #\decifactor/4 - 1     @ second half: PCOEF is not reset, so it starts on the last block reached above
+        inner_loop  \decifactor, down,, head
+1:      sub     PCOEF, PCOEF, #4*JMAX*4 @ step back to the previous block
+        subs    COUNTER, COUNTER, #1
+        inner_loop  \decifactor, down, tail, head
+        bne     1b
+        inner_loop  \decifactor, down, tail
+
+ .if \decifactor == 32
+        vpop    {s16-s31}
+ .else
+        vpop    {s16-s27}
+ .endif
+        fmxr    FPSCR, OLDFPSCR         @ put back the FPSCR value saved at function entry
+        bx      lr
+.endm
+
+@ Entry point: selects short-vector mode, loads in[0]..in[-3], then runs the 32 or 64 variant. Each variant stores 2*decifactor floats at out.
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ *                         int decifactor, float scale)
+ */
+function ff_dca_lfe_fir_vfp, export=1
+        teq     DECIFACTOR, #32         @ result is consumed by the beq below, nothing in between alters the flags
+        fmrx    OLDFPSCR, FPSCR         @ overwrites decifactor (same register) with the caller FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+NOVFP   vldr    s0, [sp]                @ NOTE(review): NOVFP is defined in asm.S (not shown), presumably emitted only when scale arrives on the stack rather than in s0 - confirm
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
+        beq     32f                     @ decifactor equal to 32 takes the 8-tap code, any other value falls through to the 64 code
+64:     dca_lfe_fir 64
+ .ltorg                                 @ literal pool for the ldr above, sits after the bx lr that ends the 64 variant
+32:     dca_lfe_fir 32
+endfunc
+
+        .unreq  POUT
+        .unreq  PIN
+        .unreq  PCOEF
+        .unreq  DECIFACTOR
+        .unreq  OLDFPSCR
+        .unreq  COUNTER
+
+        .unreq  SCALE32
+        .unreq  SCALE64
+        .unreq  IN0
+        .unreq  IN1
+        .unreq  IN2
+        .unreq  IN3
+        .unreq  IN4
+        .unreq  IN5
+        .unreq  IN6
+        .unreq  IN7
+        .unreq  COEF0
+        .unreq  COEF1
+        .unreq  COEF2
+        .unreq  COEF3
+        .unreq  COEF4
+        .unreq  COEF5
+        .unreq  COEF6
+        .unreq  COEF7
+        .unreq  ACCUM0
+        .unreq  ACCUM4
+        .unreq  POST0
+        .unreq  POST1
+        .unreq  POST2
+        .unreq  POST3
-- 
2.11.0