From 6eabb0d3ad42b91c1b4c298718c29961f7c1653a Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Thu, 13 Jan 2011 15:28:06 -0500 Subject: [PATCH] Change DSPContext.vector_fmul() from dst=dst*src to dst=src0*src1. Signed-off-by: Mans Rullgard --- libavcodec/aacenc.c | 2 +- libavcodec/arm/dsputil_init_neon.c | 2 +- libavcodec/arm/dsputil_init_vfp.c | 3 ++- libavcodec/arm/dsputil_neon.S | 45 +++++++++++++++++++------------------- libavcodec/arm/dsputil_vfp.S | 29 ++++++++++++------------ libavcodec/atrac3.c | 2 +- libavcodec/dsputil.c | 4 ++-- libavcodec/dsputil.h | 2 +- libavcodec/nellymoserenc.c | 6 ++--- libavcodec/ppc/float_altivec.c | 10 ++++----- libavcodec/twinvq.c | 4 ++-- libavcodec/vorbis_dec.c | 2 +- libavcodec/x86/dsputil_mmx.c | 24 ++++++++++---------- 13 files changed, 67 insertions(+), 68 deletions(-) diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c index f8f0eb45f..6a113ef30 100644 --- a/libavcodec/aacenc.c +++ b/libavcodec/aacenc.c @@ -256,7 +256,7 @@ static void apply_window_and_mdct(AVCodecContext *avctx, AACEncContext *s, s->output[i - 448 - k] = (i < 1024) ? sce->saved[i] : audio[(i-1024)*chans]; - s->dsp.vector_fmul (s->output, k ? swindow : pwindow, 128); + s->dsp.vector_fmul (s->output, s->output, k ? 
swindow : pwindow, 128); s->dsp.vector_fmul_reverse(s->output+128, s->output+128, swindow, 128); ff_mdct_calc(&s->mdct128, sce->coeffs + k, s->output); } diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 04ebb0057..221183cef 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -138,7 +138,7 @@ void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); -void ff_vector_fmul_neon(float *dst, const float *src, int len); +void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_window_neon(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c index 9f8c1b784..76ef6b417 100644 --- a/libavcodec/arm/dsputil_init_vfp.c +++ b/libavcodec/arm/dsputil_init_vfp.c @@ -21,7 +21,8 @@ #include "libavcodec/dsputil.h" #include "dsputil_arm.h" -void ff_vector_fmul_vfp(float *dst, const float *src, int len); +void ff_vector_fmul_vfp(float *dst, const float *src0, + const float *src1, int len); void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index 2bcdb397f..42fb38de5 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -738,42 +738,41 @@ function ff_float_to_int16_interleave_neon, export=1 endfunc function ff_vector_fmul_neon, export=1 - mov r3, r0 - subs r2, r2, #8 - vld1.64 {d0-d3}, [r0,:128]! - vld1.64 {d4-d7}, [r1,:128]! + subs r3, r3, #8 + vld1.64 {d0-d3}, [r1,:128]! + vld1.64 {d4-d7}, [r2,:128]! 
vmul.f32 q8, q0, q2 vmul.f32 q9, q1, q3 beq 3f - bics ip, r2, #15 + bics ip, r3, #15 beq 2f 1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! + vld1.64 {d0-d1}, [r1,:128]! + vld1.64 {d4-d5}, [r2,:128]! vmul.f32 q10, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! + vld1.64 {d2-d3}, [r1,:128]! + vld1.64 {d6-d7}, [r2,:128]! vmul.f32 q11, q1, q3 - vst1.64 {d16-d19},[r3,:128]! - vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! + vst1.64 {d16-d19},[r0,:128]! + vld1.64 {d0-d1}, [r1,:128]! + vld1.64 {d4-d5}, [r2,:128]! vmul.f32 q8, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! + vld1.64 {d2-d3}, [r1,:128]! + vld1.64 {d6-d7}, [r2,:128]! vmul.f32 q9, q1, q3 - vst1.64 {d20-d23},[r3,:128]! + vst1.64 {d20-d23},[r0,:128]! bne 1b - ands r2, r2, #15 + ands r3, r3, #15 beq 3f -2: vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! - vst1.64 {d16-d17},[r3,:128]! +2: vld1.64 {d0-d1}, [r1,:128]! + vld1.64 {d4-d5}, [r2,:128]! + vst1.64 {d16-d17},[r0,:128]! vmul.f32 q8, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! - vst1.64 {d18-d19},[r3,:128]! + vld1.64 {d2-d3}, [r1,:128]! + vld1.64 {d6-d7}, [r2,:128]! + vst1.64 {d18-d19},[r0,:128]! vmul.f32 q9, q1, q3 -3: vst1.64 {d16-d19},[r3,:128]! +3: vst1.64 {d16-d19},[r0,:128]! bx lr endfunc diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S index b704ba914..a65b69e20 100644 --- a/libavcodec/arm/dsputil_vfp.S +++ b/libavcodec/arm/dsputil_vfp.S @@ -41,34 +41,33 @@ * ARM VFP optimized implementation of 'vector_fmul_c' function. 
* Assume that len is a positive number and is multiple of 8 */ -@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) +@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len) function ff_vector_fmul_vfp, export=1 vpush {d8-d15} - mov r3, r0 fmrx r12, fpscr orr r12, r12, #(3 << 16) /* set vector size to 4 */ fmxr fpscr, r12 - vldmia r3!, {s0-s3} - vldmia r1!, {s8-s11} - vldmia r3!, {s4-s7} - vldmia r1!, {s12-s15} + vldmia r1!, {s0-s3} + vldmia r2!, {s8-s11} + vldmia r1!, {s4-s7} + vldmia r2!, {s12-s15} vmul.f32 s8, s0, s8 1: - subs r2, r2, #16 + subs r3, r3, #16 vmul.f32 s12, s4, s12 - vldmiage r3!, {s16-s19} - vldmiage r1!, {s24-s27} - vldmiage r3!, {s20-s23} - vldmiage r1!, {s28-s31} + vldmiage r1!, {s16-s19} + vldmiage r2!, {s24-s27} + vldmiage r1!, {s20-s23} + vldmiage r2!, {s28-s31} vmulge.f32 s24, s16, s24 vstmia r0!, {s8-s11} vstmia r0!, {s12-s15} vmulge.f32 s28, s20, s28 - vldmiagt r3!, {s0-s3} - vldmiagt r1!, {s8-s11} - vldmiagt r3!, {s4-s7} - vldmiagt r1!, {s12-s15} + vldmiagt r1!, {s0-s3} + vldmiagt r2!, {s8-s11} + vldmiagt r1!, {s4-s7} + vldmiagt r2!, {s12-s15} vmulge.f32 s8, s0, s8 vstmiage r0!, {s24-s27} vstmiage r0!, {s28-s31} diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c index 797e1f199..cc13b730a 100644 --- a/libavcodec/atrac3.c +++ b/libavcodec/atrac3.c @@ -159,7 +159,7 @@ static void IMLT(ATRAC3Context *q, float *pInput, float *pOutput, int odd_band) ff_imdct_calc(&q->mdct_ctx,pOutput,pInput); /* Perform windowing on the output. 
*/ - dsp.vector_fmul(pOutput,mdct_window,512); + dsp.vector_fmul(pOutput, pOutput, mdct_window, 512); } diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 57b264068..2ed005297 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3750,10 +3750,10 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) WRAPPER8_16_SQ(rd8x8_c, rd16_c) WRAPPER8_16_SQ(bit8x8_c, bit16_c) -static void vector_fmul_c(float *dst, const float *src, int len){ +static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){ int i; for(i=0; iin_buff, s->buf[s->bufsel], NELLY_BUF_LEN * sizeof(float)); - s->dsp.vector_fmul(s->in_buff, ff_sine_128, NELLY_BUF_LEN); + s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN); s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, NELLY_BUF_LEN); ff_mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff); - s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, NELLY_BUF_LEN); + s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, + ff_sine_128, NELLY_BUF_LEN); s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128, NELLY_BUF_LEN); ff_mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN); diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c index d1f9f1ade..188e03ea2 100644 --- a/libavcodec/ppc/float_altivec.c +++ b/libavcodec/ppc/float_altivec.c @@ -23,16 +23,16 @@ #include "dsputil_altivec.h" #include "util_altivec.h" -static void vector_fmul_altivec(float *dst, const float *src, int len) +static void vector_fmul_altivec(float *dst, const float *src0, const float *src1, int len) { int i; vector float d0, d1, s, zero = (vector float)vec_splat_u32(0); for(i=0; itmp_buf, gain[sub*i+j], ftype); - tctx->dsp.vector_fmul(chunk + block_size*j, tctx->tmp_buf, + tctx->dsp.vector_fmul(chunk + block_size*j, 
chunk + block_size*j, tctx->tmp_buf, block_size); } @@ -805,7 +805,7 @@ static void read_and_decode_spectrum(TwinContext *tctx, GetBitContext *gb, dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf); for (j = 0; j < mtab->fmode[ftype].sub; j++) { - tctx->dsp.vector_fmul(chunk, tctx->tmp_buf, block_size); + tctx->dsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size); chunk += block_size; } } diff --git a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c index 4e16c4a7b..8f15a2100 100644 --- a/libavcodec/vorbis_dec.c +++ b/libavcodec/vorbis_dec.c @@ -1578,7 +1578,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) for (j = vc->audio_channels-1;j >= 0; j--) { ch_floor_ptr = vc->channel_floors + j * blocksize / 2; ch_res_ptr = vc->channel_residues + res_chan[j] * blocksize / 2; - vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize / 2); + vc->dsp.vector_fmul(ch_floor_ptr, ch_floor_ptr, ch_res_ptr, blocksize / 2); ff_imdct_half(&vc->mdct[blockflag], ch_res_ptr, ch_floor_ptr); } diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 825149e4a..5ddfecae2 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2074,38 +2074,38 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_c } } -static void vector_fmul_3dnow(float *dst, const float *src, int len){ +static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){ x86_reg i = (len-4)*4; __asm__ volatile( "1: \n\t" - "movq (%1,%0), %%mm0 \n\t" - "movq 8(%1,%0), %%mm1 \n\t" - "pfmul (%2,%0), %%mm0 \n\t" - "pfmul 8(%2,%0), %%mm1 \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" "movq %%mm0, (%1,%0) \n\t" "movq %%mm1, 8(%1,%0) \n\t" "sub $16, %0 \n\t" "jge 1b \n\t" "femms \n\t" :"+r"(i) - :"r"(dst), "r"(src) + :"r"(dst), "r"(src0), "r"(src1) :"memory" ); } -static void vector_fmul_sse(float *dst, const float *src, int len){ 
+static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){ x86_reg i = (len-8)*4; __asm__ volatile( "1: \n\t" - "movaps (%1,%0), %%xmm0 \n\t" - "movaps 16(%1,%0), %%xmm1 \n\t" - "mulps (%2,%0), %%xmm0 \n\t" - "mulps 16(%2,%0), %%xmm1 \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" "movaps %%xmm0, (%1,%0) \n\t" "movaps %%xmm1, 16(%1,%0) \n\t" "sub $32, %0 \n\t" "jge 1b \n\t" :"+r"(i) - :"r"(dst), "r"(src) + :"r"(dst), "r"(src0), "r"(src1) :"memory" ); } -- 2.11.0