From 98f24ecd6cfc9c57a555aae6bfcd3d9a4ce9503d Mon Sep 17 00:00:00 2001 From: Christophe GISQUET Date: Sun, 1 Jan 2012 15:28:47 +0100 Subject: [PATCH] rv34: joint coefficient decoding and dequantization Perform dequantization while decoding coefficients instead of performing it on the entire coefficients buffer. Since quantized coefficients are very sparse, this usually causes a small speedup. Speedup of around 1% on Panda board compared to the removed here neon code. Global speedup is probably around 3%. Signed-off-by: Kostya Shishkov --- libavcodec/arm/rv34dsp_init_neon.c | 3 -- libavcodec/arm/rv34dsp_neon.S | 24 ------------ libavcodec/rv34.c | 80 +++++++++++++++++++++----------------- libavcodec/rv34data.h | 10 ----- libavcodec/rv34dsp.c | 16 -------- libavcodec/rv34dsp.h | 1 - 6 files changed, 44 insertions(+), 90 deletions(-) diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c index acf2a7dcd3..9a09fde7a9 100644 --- a/libavcodec/arm/rv34dsp_init_neon.c +++ b/libavcodec/arm/rv34dsp_init_neon.c @@ -25,12 +25,9 @@ void ff_rv34_inv_transform_neon(DCTELEM *block); void ff_rv34_inv_transform_noround_neon(DCTELEM *block); -void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q); void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) { c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon; c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; - - c->rv34_dequant4x4 = ff_rv34_dequant4x4_neon; } diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S index 423b537fb9..f700f5c321 100644 --- a/libavcodec/arm/rv34dsp_neon.S +++ b/libavcodec/arm/rv34dsp_neon.S @@ -107,27 +107,3 @@ function ff_rv34_inv_transform_noround_neon, export=1 vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1 bx lr endfunc - -function ff_rv34_dequant4x4_neon, export=1 - mov r3, r0 - mov r12, #16 - vdup.16 q0, r2 - vmov.16 d0[0], r1 - vld1.16 {d2}, [r0,:64], r12 - vld1.16 {d4}, [r0,:64], r12 - vld1.16 {d6}, [r0,:64], r12 - vld1.16 {d16}, [r0,:64], r12 - vmull.s16 q1, d2, d0 - vmull.s16 q2, d4, d1 - vmull.s16 q3, d6, d1 - vmull.s16 q8, d16, d1 - vqrshrn.s32 d2, q1, #4 - vqrshrn.s32 d4, q2, #4 - vqrshrn.s32 d6, q3, #4 - vqrshrn.s32 d16, q8, #4 - vst1.16 {d2}, [r3,:64], r12 - vst1.16 {d4}, [r3,:64], r12 - vst1.16 {d6}, [r3,:64], r12 - vst1.16 {d16}, [r3,:64], r12 - bx lr -endfunc diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c index 1a126be2f9..db2e53aaa5 100644 --- a/libavcodec/rv34.c +++ b/libavcodec/rv34.c @@ -212,7 +212,7 @@ static int rv34_decode_cbp(GetBitContext *gb, RV34VLC *vlc, int table) /** * Get one coefficient value from the bistream and store it. */ -static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *gb, VLC* vlc) +static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *gb, VLC* vlc, int q) { if(coef){ if(coef == esc){ @@ -225,14 +225,14 @@ static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext * } if(get_bits1(gb)) coef = -coef; - *dst = coef; + *dst = (coef*q + 8) >> 4; } } /** * Decode 2x2 subblock of coefficients. */ -static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc) +static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc, int q) { int coeffs[4]; @@ -240,15 +240,35 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, coeffs[1] = modulo_three_table[code][1]; coeffs[2] = modulo_three_table[code][2]; coeffs[3] = modulo_three_table[code][3]; - decode_coeff(dst , coeffs[0], 3, gb, vlc); + decode_coeff(dst , coeffs[0], 3, gb, vlc, q); if(is_block2){ - decode_coeff(dst+8, coeffs[1], 2, gb, vlc); - decode_coeff(dst+1, coeffs[2], 2, gb, vlc); + decode_coeff(dst+8, coeffs[1], 2, gb, vlc, q); + decode_coeff(dst+1, coeffs[2], 2, gb, vlc, q); }else{ - decode_coeff(dst+1, coeffs[1], 2, gb, vlc); - decode_coeff(dst+8, coeffs[2], 2, gb, vlc); + decode_coeff(dst+1, coeffs[1], 2, gb, vlc, q); + decode_coeff(dst+8, coeffs[2], 2, gb, vlc, q); } - decode_coeff(dst+9, coeffs[3], 2, gb, vlc); + decode_coeff(dst+9, coeffs[3], 2, gb, vlc, q); +} + +static inline void decode_subblock3(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc, + int q_dc, int q_ac1, int q_ac2) +{ + int coeffs[4]; + + coeffs[0] = modulo_three_table[code][0]; + coeffs[1] = modulo_three_table[code][1]; + coeffs[2] = modulo_three_table[code][2]; + coeffs[3] = modulo_three_table[code][3]; + decode_coeff(dst , coeffs[0], 3, gb, vlc, q_dc); + if(is_block2){ + decode_coeff(dst+8, coeffs[1], 2, gb, vlc, q_ac1); + decode_coeff(dst+1, coeffs[2], 2, gb, vlc, q_ac1); + }else{ + decode_coeff(dst+1, coeffs[1], 2, gb, vlc, q_ac1); + decode_coeff(dst+8, coeffs[2], 2, gb, vlc, q_ac1); + } + decode_coeff(dst+9, coeffs[3], 2, gb, vlc, q_ac2); } /** @@ -262,7 +282,7 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, * o--o */ -static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rvlc, int fc, int sc) +static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rvlc, int fc, int sc, int q_dc, int q_ac1, int q_ac2) { int code, pattern; @@ -271,40 +291,24 @@ static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *r pattern = code & 0x7; code >>= 3; - decode_subblock(dst, code, 0, gb, &rvlc->coefficient); + decode_subblock3(dst, code, 0, gb, &rvlc->coefficient, q_dc, q_ac1, q_ac2); if(pattern & 4){ code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2); - decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient); + decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient, q_ac2); } if(pattern & 2){ // Looks like coefficients 1 and 2 are swapped for this block code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2); - decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient); + decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient, q_ac2); } if(pattern & 1){ code = get_vlc2(gb, rvlc->third_pattern[sc].table, 9, 2); - decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient); + decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient, q_ac2); } } /** - * Dequantize 4x4 block of DC values for 16x16 macroblock. - * @todo optimize - */ -static inline void rv34_dequant4x4_16x16(DCTELEM *block, int Qdc, int Q) -{ - int i; - - for(i = 0; i < 3; i++) - block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Qdc + 8) >> 4; - for(; i < 16; i++) - block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Q + 8) >> 4; -} -/** @} */ //block functions - - -/** * @name RV30/40 bitstream parsing * @{ */ @@ -1097,6 +1101,7 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types) MpegEncContext *s = &r->s; GetBitContext *gb = &s->gb; int cbp, cbp2; + int q_dc, q_ac; int i, blknum, blkoff; LOCAL_ALIGNED_16(DCTELEM, block16, [64]); int luma_dc_quant; @@ -1133,31 +1138,34 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types) luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16 ? r->luma_dc_quant_p[s->qscale] : r->luma_dc_quant_i[s->qscale]; if(r->is16){ + q_dc = rv34_qscale_tab[luma_dc_quant]; + q_ac = rv34_qscale_tab[s->qscale]; memset(block16, 0, 64 * sizeof(*block16)); - rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0); - rv34_dequant4x4_16x16(block16, rv34_qscale_tab[luma_dc_quant],rv34_qscale_tab[s->qscale]); + rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac); r->rdsp.rv34_inv_transform_tab[1](block16); } + q_ac = rv34_qscale_tab[s->qscale]; for(i = 0; i < 16; i++, cbp >>= 1){ if(!r->is16 && !(cbp & 1)) continue; blknum = ((i & 2) >> 1) + ((i & 8) >> 2); blkoff = ((i & 1) << 2) + ((i & 4) << 3); if(cbp & 1) - rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->luma_vlc, 0); - r->rdsp.rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[s->qscale],rv34_qscale_tab[s->qscale]); + rv34_decode_block(s->block[blknum] + blkoff, gb, + r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac); if(r->is16) //FIXME: optimize s->block[blknum][blkoff] = block16[(i & 3) | ((i & 0xC) << 1)]; r->rdsp.rv34_inv_transform_tab[0](s->block[blknum] + blkoff); } if(r->block_type == RV34_MB_P_MIX16x16) r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1); + q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]]; + q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]]; for(; i < 24; i++, cbp >>= 1){ if(!(cbp & 1)) continue; blknum = ((i & 4) >> 2) + 4; blkoff = ((i & 1) << 2) + ((i & 2) << 4); - rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->chroma_vlc, 1); - r->rdsp.rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]],rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]]); + rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac); r->rdsp.rv34_inv_transform_tab[0](s->block[blknum] + blkoff); } if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos])) diff --git a/libavcodec/rv34data.h b/libavcodec/rv34data.h index f8f941d061..fa41a882a6 100644 --- a/libavcodec/rv34data.h +++ b/libavcodec/rv34data.h @@ -101,16 +101,6 @@ static const uint16_t rv34_qscale_tab[32] = { }; /** - * 4x4 dezigzag pattern - */ -static const uint8_t rv34_dezigzag[16] = { - 0, 1, 8, 16, - 9, 2, 3, 10, - 17, 24, 25, 18, - 11, 19, 26, 27 -}; - -/** * tables used to translate a quantizer value into a VLC set for decoding * The first table is used for intraframes. */ diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c index 974bf9ec16..1f4cea8544 100644 --- a/libavcodec/rv34dsp.c +++ b/libavcodec/rv34dsp.c @@ -100,26 +100,10 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){ /** @} */ // transform -/** - * Dequantize ordinary 4x4 block. - */ -void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q); -static void rv34_dequant4x4_c(DCTELEM *block, int Qdc, int Q) -{ - int i, j; - - block[0] = (block[0] * Qdc + 8) >> 4; - for (i = 0; i < 4; i++) - for (j = !i; j < 4; j++) - block[j + i*8] = (block[j + i*8] * Q + 8) >> 4; -} - av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) { c->rv34_inv_transform_tab[0] = rv34_inv_transform_c; c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c; - c->rv34_dequant4x4 = rv34_dequant4x4_c; - if (HAVE_NEON) ff_rv34dsp_init_neon(c, dsp); } diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h index 01352ea793..f2bc20e911 100644 --- a/libavcodec/rv34dsp.h +++ b/libavcodec/rv34dsp.h @@ -56,7 +56,6 @@ typedef struct RV34DSPContext { h264_chroma_mc_func avg_chroma_pixels_tab[3]; rv40_weight_func rv40_weight_pixels_tab[2]; rv34_inv_transform_func rv34_inv_transform_tab[2]; - void (*rv34_dequant4x4)(DCTELEM *block, int Qdc, int Q); rv40_weak_loop_filter_func rv40_weak_loop_filter[2]; rv40_strong_loop_filter_func rv40_strong_loop_filter[2]; rv40_loop_filter_strength_func rv40_loop_filter_strength[2]; -- 2.11.0