From 1f821750f0b8d0c87cbf88a28ad699b92db5ec88 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 24 Jul 2015 08:24:21 +0200 Subject: [PATCH] hevcdsp: split the qpel functions by width instead of by the subpixel fraction This should allow for more efficient SIMD. Keep the C versions as they are now, to allow the compiler to inline the interpolation coefficients. --- libavcodec/hevc.c | 19 ++++++---- libavcodec/hevcdsp.c | 30 ++++++++-------- libavcodec/hevcdsp.h | 6 ++-- libavcodec/hevcdsp_template.c | 82 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 108 insertions(+), 29 deletions(-) diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index e1b1be3c3a..f2303ac6f5 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -1479,7 +1479,7 @@ static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) */ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride, AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h) + int block_w, int block_h, int pred_idx) { HEVCLocalContext *lc = &s->HEVClc; uint8_t *src = ref->data[0]; @@ -1513,8 +1513,8 @@ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride, src = lc->edge_emu_buffer + buf_offset; srcstride = edge_emu_stride; } - s->hevcdsp.put_hevc_qpel[my][mx](dst, dststride, src, srcstride, block_w, - block_h, lc->mc_buffer); + s->hevcdsp.put_hevc_qpel[!!my][!!mx][pred_idx](dst, dststride, src, srcstride, + block_h, mx, my, lc->mc_buffer); } /** @@ -1651,6 +1651,11 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW, int nPbH, int log2_cb_size, int partIdx) { + static const int pred_indices[] = { + [4] = 0, [8] = 1, [12] = 2, [16] = 3, [24] = 4, [32] = 5, [48] = 6, [64] = 7, + }; + const int pred_idx = pred_indices[nPbW]; + #define POS(c_idx, x, y) \ &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] @@ -1719,7 +1724,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]); luma_mc(s, tmp, tmpstride, ref0->frame, - ¤t_mv.mv[0], x0, y0, nPbW, nPbH); + ¤t_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { @@ -1755,7 +1760,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]); luma_mc(s, tmp, tmpstride, ref1->frame, - ¤t_mv.mv[1], x0, y0, nPbW, nPbH); + ¤t_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { @@ -1792,9 +1797,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, DECLARE_ALIGNED(16, int16_t, tmp4[MAX_PB_SIZE * MAX_PB_SIZE]); luma_mc(s, tmp, tmpstride, ref0->frame, - ¤t_mv.mv[0], x0, y0, nPbW, nPbH); + ¤t_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx); luma_mc(s, tmp2, tmpstride, ref1->frame, - ¤t_mv.mv[1], x0, y0, nPbW, nPbH); + ¤t_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c index 216101a083..86d9e85b92 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -116,6 +116,12 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) #undef FUNC #define FUNC(a, depth) a ## _ ## depth +#define QPEL_FUNC(i, width, depth) \ + hevcdsp->put_hevc_qpel[0][0][i] = FUNC(put_hevc_qpel_pixels_ ## width, depth); \ + hevcdsp->put_hevc_qpel[0][1][i] = FUNC(put_hevc_qpel_h_ ## width, depth); \ + hevcdsp->put_hevc_qpel[1][0][i] = FUNC(put_hevc_qpel_v_ ## width, depth); \ + hevcdsp->put_hevc_qpel[1][1][i] = FUNC(put_hevc_qpel_hv_ ## width, depth); \ + #define HEVC_DSP(depth) \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth); \ @@ -139,22 +145,14 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) hevcdsp->sao_edge_filter[2] = FUNC(sao_edge_filter_2, depth); \ hevcdsp->sao_edge_filter[3] = FUNC(sao_edge_filter_3, depth); \ \ - hevcdsp->put_hevc_qpel[0][0] = FUNC(put_hevc_qpel_pixels, depth); \ - hevcdsp->put_hevc_qpel[0][1] = FUNC(put_hevc_qpel_h1, depth); \ - hevcdsp->put_hevc_qpel[0][2] = FUNC(put_hevc_qpel_h2, depth); \ - hevcdsp->put_hevc_qpel[0][3] = FUNC(put_hevc_qpel_h3, depth); \ - hevcdsp->put_hevc_qpel[1][0] = FUNC(put_hevc_qpel_v1, depth); \ - hevcdsp->put_hevc_qpel[1][1] = FUNC(put_hevc_qpel_h1v1, depth); \ - hevcdsp->put_hevc_qpel[1][2] = FUNC(put_hevc_qpel_h2v1, depth); \ - hevcdsp->put_hevc_qpel[1][3] = FUNC(put_hevc_qpel_h3v1, depth); \ - hevcdsp->put_hevc_qpel[2][0] = FUNC(put_hevc_qpel_v2, depth); \ - hevcdsp->put_hevc_qpel[2][1] = FUNC(put_hevc_qpel_h1v2, depth); \ - hevcdsp->put_hevc_qpel[2][2] = FUNC(put_hevc_qpel_h2v2, depth); \ - hevcdsp->put_hevc_qpel[2][3] = FUNC(put_hevc_qpel_h3v2, depth); \ - hevcdsp->put_hevc_qpel[3][0] = FUNC(put_hevc_qpel_v3, depth); \ - hevcdsp->put_hevc_qpel[3][1] = FUNC(put_hevc_qpel_h1v3, depth); \ - hevcdsp->put_hevc_qpel[3][2] = FUNC(put_hevc_qpel_h2v3, depth); \ - hevcdsp->put_hevc_qpel[3][3] = FUNC(put_hevc_qpel_h3v3, depth); \ + QPEL_FUNC(0, 4, depth); \ + QPEL_FUNC(1, 8, depth); \ + QPEL_FUNC(2, 12, depth); \ + QPEL_FUNC(3, 16, depth); \ + QPEL_FUNC(4, 24, depth); \ + QPEL_FUNC(5, 32, depth); \ + QPEL_FUNC(6, 48, depth); \ + QPEL_FUNC(7, 64, depth); \ \ hevcdsp->put_hevc_epel[0][0] = FUNC(put_hevc_epel_pixels, depth); \ hevcdsp->put_hevc_epel[0][1] = FUNC(put_hevc_epel_h, depth); \ diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index 72784645e9..c250385341 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -58,9 +58,9 @@ typedef struct HEVCDSPContext { int height, int c_idx, uint8_t vert_edge, uint8_t horiz_edge, uint8_t diag_edge); - void (*put_hevc_qpel[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, - ptrdiff_t srcstride, int width, int height, - int16_t *mcbuffer); + void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, + ptrdiff_t srcstride, int height, + int mx, int my, int16_t *mcbuffer); void (*put_hevc_epel[2][2])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t *mcbuffer); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 390f683295..84503ec2f6 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -775,9 +775,11 @@ static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src, #undef TR_16 #undef TR_32 -static void FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride, - uint8_t *_src, ptrdiff_t _srcstride, - int width, int height, int16_t* mcbuffer) +static av_always_inline void +FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, int my, + int16_t* mcbuffer) { int x, y; pixel *src = (pixel *)_src; @@ -906,6 +908,80 @@ PUT_HEVC_QPEL_HV(3, 1) PUT_HEVC_QPEL_HV(3, 2) PUT_HEVC_QPEL_HV(3, 3) +#define QPEL(W) \ +static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height, \ + mx, my, mcbuffer); \ +} \ + \ +static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + if (mx == 1) \ + FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else if (mx == 2) \ + FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else \ + FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer); \ +} \ + \ +static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + if (my == 1) \ + FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else if (my == 2) \ + FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else \ + FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \ +} \ + \ +static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + if (my == 1) { \ + if (mx == 1) \ + FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else if (mx == 2) \ + FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else \ + FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + } else if (my == 2) { \ + if (mx == 1) \ + FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else if (mx == 2) \ + FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else \ + FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + } else { \ + if (mx == 1) \ + FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else if (mx == 2) \ + FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + else \ + FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \ + } \ +} + +QPEL(64) +QPEL(48) +QPEL(32) +QPEL(24) +QPEL(16) +QPEL(12) +QPEL(8) +QPEL(4) + static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, int my, -- 2.11.0