From d29237e5578a187c5a8d91338cd70ce0fd6f6003 Mon Sep 17 00:00:00 2001
From: James Darnley
Date: Fri, 15 Jan 2016 20:35:05 +0100
Subject: [PATCH] v210: Add avx2 version of the 8-bit line encoder

Around 35% faster than the avx version.

Signed-off-by: Henrik Gramner
Signed-off-by: Luca Barbato
---
 libavcodec/v210enc.c | 5 ++--
 libavcodec/v210enc.h | 1 +
 libavcodec/x86/v210enc.asm | 57 +++++++++++++++++++++++++++----------------
 libavcodec/x86/v210enc_init.c | 7 ++++++
 4 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c
index ca6ad2ee2f..ce690f1634 100644
--- a/libavcodec/v210enc.c
+++ b/libavcodec/v210enc.c
@@ -86,6 +86,7 @@ av_cold void ff_v210enc_init(V210EncContext *s)
 {
     s->pack_line_8 = v210_planar_pack_8_c;
     s->pack_line_10 = v210_planar_pack_10_c;
+    s->sample_factor = 1;
 
     if (ARCH_X86)
         ff_v210enc_init_x86(s);
@@ -172,13 +173,13 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         const uint8_t *v = pic->data[2];
         for (h = 0; h < avctx->height; h++) {
             uint32_t val;
-            w = (avctx->width / 12) * 12;
+            w = (avctx->width / (12 * s->sample_factor)) * 12 * s->sample_factor;
             s->pack_line_8(y, u, v, dst, w);
 
             y += w;
             u += w >> 1;
             v += w >> 1;
-            dst += (w / 12) * 32;
+            dst += (w / (12 * s->sample_factor)) * 32 * s->sample_factor;
 
             for (; w < avctx->width - 5; w += 6) {
                 WRITE_PIXELS8(u, y, v);
diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h
index 81a3531228..74b0514f2e 100644
--- a/libavcodec/v210enc.h
+++ b/libavcodec/v210enc.h
@@ -28,6 +28,7 @@ typedef struct V210EncContext {
                          const uint8_t *v, uint8_t *dst, ptrdiff_t width);
     void (*pack_line_10)(const uint16_t *y, const uint16_t *u,
                          const uint16_t *v, uint8_t *dst, ptrdiff_t width);
+    int sample_factor;
 } V210EncContext;
 
 void ff_v210enc_init(V210EncContext *s);
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index bdefcdbfa1..7ff1f4948f 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -21,27 +21,26 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-v210_enc_min_10: times 8 dw 0x4
-v210_enc_max_10: times 8 dw 0x3fb
+v210_enc_min_10: times 16 dw 0x4
+v210_enc_max_10: times 16 dw 0x3fb
 
-v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0
-v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
+v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
+v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
 
-v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0
-v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
+v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
+v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
 
-v210_enc_min_8: times 16 db 0x1
-v210_enc_max_8: times 16 db 0xfe
+v210_enc_min_8: times 32 db 0x1
+v210_enc_max_8: times 32 db 0xfe
 
-v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
-v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0
+v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
+v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
 
-v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
-v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
-
-v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0
+v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
+v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
 
 SECTION .text
 
@@ -102,7 +101,10 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     pxor m6, m6
 
 .loop
-    movu m1, [yq+2*widthq]
+    movu xm1, [yq+2*widthq]
+%if cpuflag(avx2)
+    vinserti128 m1, m1, [yq+2*widthq+12], 1
+%endif
     CLIPUB m1, m4, m5
 
     punpcklbw m0, m1, m6
@@ -115,8 +117,13 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     pshufb m0, [v210_enc_luma_shuf_10]
     pshufb m1, [v210_enc_luma_shuf_10]
 
-    movq m3, [uq+widthq]
-    movhps m3, [vq+widthq]
+    movq xm3, [uq+widthq]
+    movhps xm3, [vq+widthq]
+%if cpuflag(avx2)
+    movq xm2, [uq+widthq+6]
+    movhps xm2, [vq+widthq+6]
+    vinserti128 m3, m3, xm2, 1
+%endif
     CLIPUB m3, m4, m5
 
     ; shuffle and multiply to get the same packing as in 10-bit
@@ -131,11 +138,15 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     por m0, m2
     por m1, m3
 
-    movu [dstq], m0
-    movu [dstq+mmsize], m1
+    movu [dstq], xm0
+    movu [dstq+16], xm1
+%if cpuflag(avx2)
+    vextracti128 [dstq+32], m0, 1
+    vextracti128 [dstq+48], m1, 1
+%endif
 
     add dstq, 2*mmsize
-    add widthq, 6
+    add widthq, (mmsize*3)/8
     jl .loop
 
     RET
@@ -149,3 +160,7 @@ v210_planar_pack_8
 INIT_XMM avx
 v210_planar_pack_8
 %endif
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+v210_planar_pack_8
+%endif
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 95b999bc05..fd8508b43e 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -24,6 +24,8 @@ void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u,
                                  ptrdiff_t width);
 void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u,
                                const uint8_t *v, uint8_t *dst, ptrdiff_t width);
+void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
+                                const uint8_t *v, uint8_t *dst, ptrdiff_t width);
 void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
                                   const uint16_t *v, uint8_t *dst,
                                   ptrdiff_t width);
@@ -39,4 +41,9 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
 
     if (EXTERNAL_AVX(cpu_flags))
         s->pack_line_8 = ff_v210_planar_pack_8_avx;
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        s->sample_factor = 2;
+        s->pack_line_8 = ff_v210_planar_pack_8_avx2;
+    }
 }
-- 
2.11.0