From 81f2a3f4ffcc6935b8b8ada4954700b3f333ae4f Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 31 Jan 2011 20:55:56 -0500 Subject: [PATCH] Implement a SIMD version of emulated_edge_mc() for x86. From ~550 cycles (C version) to 170 (SSE/x86-64), 206 (MMX/x86-32) and 196 (SSE2/x86-32) cycles. --- libavcodec/x86/dsputil_mmx.c | 110 +++++++- libavcodec/x86/dsputil_yasm.asm | 560 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 667 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 8257b3fa8..2eb7d85f1 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -1664,8 +1664,80 @@ QPEL_2TAP(avg_, 8, 3dnow) static void just_return(void) { return; } #endif -static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ +#if HAVE_YASM +typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src, + x86_reg linesize, x86_reg start_y, + x86_reg end_y, x86_reg block_h, + x86_reg start_x, x86_reg end_x, + x86_reg block_w); +extern emu_edge_core_func ff_emu_edge_core_mmx; +extern emu_edge_core_func ff_emu_edge_core_sse; + +static av_always_inline +void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, + int block_w, int block_h, + int src_x, int src_y, int w, int h, + emu_edge_core_func *core_fn) +{ + int start_y, start_x, end_y, end_x, src_y_add=0; + + if(src_y>= h){ + src_y_add = h-1-src_y; + src_y=h-1; + }else if(src_y<=-block_h){ + src_y_add = 1-block_h-src_y; + src_y=1-block_h; + } + if(src_x>= w){ + src+= (w-1-src_x); + src_x=w-1; + }else if(src_x<=-block_w){ + src+= (1-block_w-src_x); + src_x=1-block_w; + } + + start_y= FFMAX(0, -src_y); + start_x= FFMAX(0, -src_x); + end_y= FFMIN(block_h, h-src_y); + end_x= FFMIN(block_w, w-src_x); + assert(start_x < end_x && block_w > 0); + assert(start_y < end_y && block_h > 0); + + // fill in the to-be-copied part plus all above/below + src += (src_y_add+start_y)*linesize + start_x; + buf += start_x; + core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w); +} + +#if ARCH_X86_32 +static av_noinline +void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize, + int block_w, int block_h, + int src_x, int src_y, int w, int h) +{ + emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, + w, h, &ff_emu_edge_core_mmx); +} +#endif +static av_noinline +void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize, + int block_w, int block_h, + int src_x, int src_y, int w, int h) +{ + emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, + w, h, &ff_emu_edge_core_sse); +} +#endif /* HAVE_YASM */ + +typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src, + int linesize, int block_w, int block_h, + int src_x, int src_y, int w, int h); + +static av_always_inline +void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height, + emulated_edge_mc_func *emu_edge_fn) +{ const int w = 8; const int ix = ox>>(16+shift); const int iy = oy>>(16+shift); @@ -1701,7 +1773,7 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o if( (unsigned)ix >= width-w || (unsigned)iy >= height-h ) { - ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); + emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); src = edge_buf; } @@ -1782,6 +1854,30 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o } } +#if HAVE_YASM +#if ARCH_X86_32 +static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) +{ + gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, + width, height, &emulated_edge_mc_mmx); +} +#endif +static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) +{ + gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, + width, height, &emulated_edge_mc_sse); +} +#else +static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) +{ + gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, + width, height, &ff_emulated_edge_mc); +} +#endif + #define PREFETCH(name, op) \ static void name(void *mem, int stride, int h){\ const uint8_t *p= mem;\ @@ -2626,7 +2722,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_HPEL_FUNCS(avg, 1, 8, mmx); SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); +#if ARCH_X86_32 || !HAVE_YASM c->gmc= gmc_mmx; +#endif +#if ARCH_X86_32 && HAVE_YASM + c->emulated_edge_mc = emulated_edge_mc_mmx; +#endif c->add_bytes= add_bytes_mmx; c->add_bytes_l2= add_bytes_l2_mmx; @@ -2913,6 +3014,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #if HAVE_YASM c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; + + c->emulated_edge_mc = emulated_edge_mc_sse; + c->gmc= gmc_sse; #endif } if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index bda04727d..099f0a80d 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -421,3 +421,563 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset fld dword r0m %endif RET + +; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize, +; x86_reg start_y, x86_reg end_y, x86_reg block_h, +; x86_reg start_x, x86_reg end_x, x86_reg block_w); +; +; The actual function itself is below. It basically wraps a very simple +; w = end_x - start_x +; if (w) { +; if (w > 22) { +; jump to the slow loop functions +; } else { +; jump to the fast loop functions +; } +; } +; +; ... and then the same for left/right extend also. See below for loop +; function implementations. Fast are fixed-width, slow is variable-width + +%macro EMU_EDGE_FUNC 1 +%ifdef ARCH_X86_64 +%define w_reg r10 +cglobal emu_edge_core_%1, 6, 7, 1 + mov r11, r5 ; save block_h +%else +%define w_reg r6 +cglobal emu_edge_core_%1, 2, 7, 0 + mov r4, r4m ; end_y + mov r5, r5m ; block_h +%endif + + ; start with vertical extend (top/bottom) and body pixel copy + mov w_reg, r7m + sub w_reg, r6m ; w = start_x - end_x + sub r5, r4 +%ifdef ARCH_X86_64 + sub r4, r3 +%else + sub r4, dword r3m +%endif + cmp w_reg, 22 + jg .slow_v_extend_loop +%ifdef ARCH_X86_32 + mov r2, r2m ; linesize +%endif + sal w_reg, 7 ; w * 128 +%ifdef PIC + lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)] + add w_reg, rax +%else + lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg] +%endif + call w_reg ; fast top extend, body copy and bottom extend +.v_extend_end: + + ; horizontal extend (left/right) + mov w_reg, r6m ; start_x + sub r0, w_reg +%ifdef ARCH_X86_64 + mov r3, r0 ; backup of buf+block_h*linesize + mov r5, r11 +%else + mov r0m, r0 ; backup of buf+block_h*linesize + mov r5, r5m +%endif + test w_reg, w_reg + jz .right_extend + cmp w_reg, 22 + jg .slow_left_extend_loop + mov r1, w_reg + dec w_reg + ; FIXME we can do a if size == 1 here if that makes any speed difference, test me + sar w_reg, 1 + sal w_reg, 6 + ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs + ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h +%ifdef PIC + lea rax, [.emuedge_extend_left_2] + add w_reg, rax +%else + lea w_reg, [.emuedge_extend_left_2+w_reg] +%endif + call w_reg + + ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w +.right_extend: +%ifdef ARCH_X86_32 + mov r0, r0m + mov r5, r5m +%endif + mov w_reg, r7m ; end_x + mov r1, r8m ; block_w + mov r4, r1 + sub r1, w_reg + jz .h_extend_end ; if (end_x == block_w) goto h_extend_end + cmp r1, 22 + jg .slow_right_extend_loop + dec r1 + ; FIXME we can do a if size == 1 here if that makes any speed difference, test me + sar r1, 1 + sal r1, 6 +%ifdef PIC + lea rax, [.emuedge_extend_right_2] + add r1, rax +%else + lea r1, [.emuedge_extend_right_2+r1] +%endif + call r1 +.h_extend_end: + RET + +%ifdef ARCH_X86_64 +%define vall al +%define valh ah +%define valw ax +%define valw2 r10w +%define valw3 r3w +%define vald eax +%else +%define vall bl +%define valh bh +%define valw bx +%define valw2 r6w +%define valw3 valw2 +%define vald ebx +%define stack_offset 0x14 +%endif + +%endmacro + +; macro to read/write a horizontal number of pixels (%2) to/from registers +; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels +; - if (%2 & 15 == 8) fills the last 8 bytes into rax +; - else if (%2 & 8) fills 8 bytes into mm0 +; - if (%2 & 7 == 4) fills the last 4 bytes into rax +; - else if (%2 & 4) fills 4 bytes into mm0-1 +; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax +; (note that we're using r3 for body/bottom because it's a shorter +; opcode, and then the loop fits in 128 bytes) +; - else fills remaining bytes into rax +; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels +; - if (%2 & 7 == 4) fills 4 bytes into ebx +; - else if (%2 & 4) fills 4 bytes into mm0-7 +; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx +; - else fills remaining bytes into ebx +; writing data out is in the same way +%macro READ_NUM_BYTES 3 +%assign %%src_off 0 ; offset in source buffer +%assign %%smidx 0 ; mmx register idx +%assign %%sxidx 0 ; xmm register idx + +%ifnidn %3, mmx +%rep %2/16 + movdqu xmm %+ %%sxidx, [r1+%%src_off] +%assign %%src_off %%src_off+16 +%assign %%sxidx %%sxidx+1 +%endrep ; %2/16 +%endif ; !mmx + +%ifdef ARCH_X86_64 +%if (%2-%%src_off) == 8 + mov rax, [r1+%%src_off] +%assign %%src_off %%src_off+8 +%endif ; (%2-%%src_off) == 8 +%endif ; x86-64 + +%rep (%2-%%src_off)/8 + movq mm %+ %%smidx, [r1+%%src_off] +%assign %%src_off %%src_off+8 +%assign %%smidx %%smidx+1 +%endrep ; (%2-%%dst_off)/8 + +%if (%2-%%src_off) == 4 + mov vald, [r1+%%src_off] +%elif (%2-%%src_off) & 4 + movd mm %+ %%smidx, [r1+%%src_off] +%assign %%src_off %%src_off+4 +%endif ; (%2-%%src_off) ==/& 4 + +%if (%2-%%src_off) == 1 + mov vall, [r1+%%src_off] +%elif (%2-%%src_off) == 2 + mov valw, [r1+%%src_off] +%elif (%2-%%src_off) == 3 +%ifidn %1, top + mov valw2, [r1+%%src_off] +%else ; %1 != top + mov valw3, [r1+%%src_off] +%endif ; %1 ==/!= top + mov vall, [r1+%%src_off+2] +%endif ; (%2-%%src_off) == 1/2/3 +%endmacro ; READ_NUM_BYTES + +%macro WRITE_NUM_BYTES 3 +%assign %%dst_off 0 ; offset in destination buffer +%assign %%dmidx 0 ; mmx register idx +%assign %%dxidx 0 ; xmm register idx + +%ifnidn %3, mmx +%rep %2/16 + movdqu [r0+%%dst_off], xmm %+ %%dxidx +%assign %%dst_off %%dst_off+16 +%assign %%dxidx %%dxidx+1 +%endrep ; %2/16 +%endif + +%ifdef ARCH_X86_64 +%if (%2-%%dst_off) == 8 + mov [r0+%%dst_off], rax +%assign %%dst_off %%dst_off+8 +%endif ; (%2-%%dst_off) == 8 +%endif ; x86-64 + +%rep (%2-%%dst_off)/8 + movq [r0+%%dst_off], mm %+ %%dmidx +%assign %%dst_off %%dst_off+8 +%assign %%dmidx %%dmidx+1 +%endrep ; (%2-%%dst_off)/8 + +%if (%2-%%dst_off) == 4 + mov [r0+%%dst_off], vald +%elif (%2-%%dst_off) & 4 + movd [r0+%%dst_off], mm %+ %%dmidx +%assign %%dst_off %%dst_off+4 +%endif ; (%2-%%dst_off) ==/& 4 + +%if (%2-%%dst_off) == 1 + mov [r0+%%dst_off], vall +%elif (%2-%%dst_off) == 2 + mov [r0+%%dst_off], valw +%elif (%2-%%dst_off) == 3 +%ifidn %1, top + mov [r0+%%dst_off], valw2 +%else ; %1 != top + mov [r0+%%dst_off], valw3 +%endif ; %1 ==/!= top + mov [r0+%%dst_off+2], vall +%endif ; (%2-%%dst_off) == 1/2/3 +%endmacro ; WRITE_NUM_BYTES + +; vertical top/bottom extend and body copy fast loops +; these are function pointers to set-width line copy functions, i.e. +; they read a fixed number of pixels into set registers, and write +; those out into the destination buffer +; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h +; r6(eax/64)/r3(ebx/32)=val_reg +%macro VERTICAL_EXTEND 1 +%assign %%n 1 +%rep 22 +ALIGN 128 +.emuedge_v_extend_ %+ %%n: + ; extend pixels above body +%ifdef ARCH_X86_64 + test r3 , r3 ; if (!start_y) + jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body +%else ; ARCH_X86_32 + cmp dword r3m, 0 + je .emuedge_copy_body_ %+ %%n %+ _loop +%endif ; ARCH_X86_64/32 + READ_NUM_BYTES top, %%n, %1 ; read bytes +.emuedge_extend_top_ %+ %%n %+ _loop: ; do { + WRITE_NUM_BYTES top, %%n, %1 ; write bytes + add r0 , r2 ; dst += linesize +%ifdef ARCH_X86_64 + dec r3 +%else ; ARCH_X86_32 + dec dword r3m +%endif ; ARCH_X86_64/32 + jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) + + ; copy body pixels +.emuedge_copy_body_ %+ %%n %+ _loop: ; do { + READ_NUM_BYTES body, %%n, %1 ; read bytes + WRITE_NUM_BYTES body, %%n, %1 ; write bytes + add r0 , r2 ; dst += linesize + add r1 , r2 ; src += linesize + dec r4 + jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) + + ; copy bottom pixels + test r5 , r5 ; if (!block_h) + jz .emuedge_v_extend_end_ %+ %%n ; goto end + sub r1 , r2 ; src -= linesize + READ_NUM_BYTES bottom, %%n, %1 ; read bytes +.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { + WRITE_NUM_BYTES bottom, %%n, %1 ; write bytes + add r0 , r2 ; dst += linesize + dec r5 + jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) + +.emuedge_v_extend_end_ %+ %%n: +%ifdef ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+1 +%endrep +%endmacro VERTICAL_EXTEND + +; left/right (horizontal) fast extend functions +; these are essentially identical to the vertical extend ones above, +; just left/right separated because number of pixels to extend is +; obviously not the same on both sides. +; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the +; lowest two bytes of the register (so val*0x0101), and are splatted +; into each byte of mm0 as well if n_pixels >= 8 + +%macro READ_V_PIXEL 3 + mov vall, %2 + mov valh, vall +%if %1 >= 8 + movd mm0, vald +%ifidn %3, mmx + punpcklwd mm0, mm0 + punpckldq mm0, mm0 +%else ; !mmx + pshufw mm0, mm0, 0 +%endif ; mmx +%endif ; %1 >= 8 +%endmacro + +%macro WRITE_V_PIXEL 2 +%assign %%dst_off 0 +%rep %1/8 + movq [%2+%%dst_off], mm0 +%assign %%dst_off %%dst_off+8 +%endrep +%if %1 & 4 +%if %1 >= 8 + movd [%2+%%dst_off], mm0 +%else ; %1 < 8 + mov [%2+%%dst_off] , valw + mov [%2+%%dst_off+2], valw +%endif ; %1 >=/< 8 +%assign %%dst_off %%dst_off+4 +%endif ; %1 & 4 +%if %1&2 + mov [%2+%%dst_off], valw +%endif ; %1 & 2 +%endmacro + +; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val +%macro LEFT_EXTEND 1 +%assign %%n 2 +%rep 11 +ALIGN 64 +.emuedge_extend_left_ %+ %%n: ; do { + sub r0, r2 ; dst -= linesize + READ_V_PIXEL %%n, [r0+r1], %1 ; read pixels + WRITE_V_PIXEL %%n, r0 ; write pixels + dec r5 + jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h) +%ifdef ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+2 +%endrep +%endmacro ; LEFT_EXTEND + +; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val +%macro RIGHT_EXTEND 1 +%assign %%n 2 +%rep 11 +ALIGN 64 +.emuedge_extend_right_ %+ %%n: ; do { +%ifdef ARCH_X86_64 + sub r3, r2 ; dst -= linesize + READ_V_PIXEL %%n, [r3+w_reg-1], %1 ; read pixels + WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels + dec r11 +%else ; ARCH_X86_32 + sub r0, r2 ; dst -= linesize + READ_V_PIXEL %%n, [r0+w_reg-1], %1 ; read pixels + WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels + dec r5 +%endif ; ARCH_X86_64/32 + jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h) +%ifdef ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+2 +%endrep + +%ifdef ARCH_X86_32 +%define stack_offset 0x10 +%endif +%endmacro ; RIGHT_EXTEND + +; below follow the "slow" copy/extend functions, these act on a non-fixed +; width specified in a register, and run a loop to copy the full amount +; of bytes. They are optimized for copying of large amounts of pixels per +; line, so they unconditionally splat data into mm registers to copy 8 +; bytes per loop iteration. It could be considered to use xmm for x86-64 +; also, but I haven't optimized this as much (i.e. FIXME) +%macro V_COPY_NPX 4-5 +%if %0 == 4 + test w_reg, %4 + jz .%1_skip_%4_px +%else ; %0 == 5 +.%1_%4_px_loop: +%endif + %3 %2, [r1+cnt_reg] + %3 [r0+cnt_reg], %2 + add cnt_reg, %4 +%if %0 == 5 + sub w_reg, %4 + test w_reg, %5 + jnz .%1_%4_px_loop +%endif +.%1_skip_%4_px: +%endmacro + +%macro V_COPY_ROW 3 +%ifidn %1, bottom + sub r1, linesize +%endif +.%1_copy_loop: + xor cnt_reg, cnt_reg +%ifidn %3, mmx +%define linesize r2m + V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 +%else ; !mmx + V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0 +%ifdef ARCH_X86_64 +%define linesize r2 + V_COPY_NPX %1, rax , mov, 8 +%else ; ARCH_X86_32 +%define linesize r2m + V_COPY_NPX %1, mm0, movq, 8 +%endif ; ARCH_X86_64/32 +%endif ; mmx + V_COPY_NPX %1, vald, mov, 4 + V_COPY_NPX %1, valw, mov, 2 + V_COPY_NPX %1, vall, mov, 1 + mov w_reg, cnt_reg +%ifidn %1, body + add r1, linesize +%endif + add r0, linesize + dec %2 + jnz .%1_copy_loop +%endmacro + +%macro SLOW_V_EXTEND 1 +.slow_v_extend_loop: +; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h +; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x +%ifdef ARCH_X86_64 + push r11 ; save old value of block_h + test r3, r3 +%define cnt_reg r11 + jz .do_body_copy ; if (!start_y) goto do_body_copy + V_COPY_ROW top, r3, %1 +%else + cmp dword r3m, 0 +%define cnt_reg r2 + je .do_body_copy ; if (!start_y) goto do_body_copy + V_COPY_ROW top, dword r3m, %1 +%endif + +.do_body_copy: + V_COPY_ROW body, r4, %1 + +%ifdef ARCH_X86_64 + pop r11 ; restore old value of block_h +%define cnt_reg r3 +%endif + test r5, r5 +%ifdef ARCH_X86_64 + jz .v_extend_end +%else + jz .skip_bottom_extend +%endif + V_COPY_ROW bottom, r5, %1 +%ifdef ARCH_X86_32 +.skip_bottom_extend: + mov r2, r2m +%endif + jmp .v_extend_end +%endmacro + +%macro SLOW_LEFT_EXTEND 1 +.slow_left_extend_loop: +; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x + mov r4, 8 + sub r0, linesize + READ_V_PIXEL 8, [r0+w_reg], %1 +.left_extend_8px_loop: + movq [r0+r4-8], mm0 + add r4, 8 + cmp r4, w_reg + jle .left_extend_8px_loop + sub r4, 8 + cmp r4, w_reg + jge .left_extend_loop_end +.left_extend_2px_loop: + mov [r0+r4], valw + add r4, 2 + cmp r4, w_reg + jl .left_extend_2px_loop +.left_extend_loop_end: + dec r5 + jnz .slow_left_extend_loop +%ifdef ARCH_X86_32 + mov r2, r2m +%endif + jmp .right_extend +%endmacro + +%macro SLOW_RIGHT_EXTEND 1 +.slow_right_extend_loop: +; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h, +; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr +%ifdef ARCH_X86_64 +%define buf_reg r3 +%define bh_reg r11 +%else +%define buf_reg r0 +%define bh_reg r5 +%endif + lea r1, [r4-8] + sub buf_reg, linesize + READ_V_PIXEL 8, [buf_reg+w_reg-1], %1 +.right_extend_8px_loop: + movq [buf_reg+r1], mm0 + sub r1, 8 + cmp r1, w_reg + jge .right_extend_8px_loop + add r1, 8 + cmp r1, w_reg + je .right_extend_loop_end +.right_extend_2px_loop: + sub r1, 2 + mov [buf_reg+r1], valw + cmp r1, w_reg + jg .right_extend_2px_loop +.right_extend_loop_end: + dec bh_reg + jnz .slow_right_extend_loop + jmp .h_extend_end +%endmacro + +%macro emu_edge 1 +EMU_EDGE_FUNC %1 +VERTICAL_EXTEND %1 +LEFT_EXTEND %1 +RIGHT_EXTEND %1 +SLOW_V_EXTEND %1 +SLOW_LEFT_EXTEND %1 +SLOW_RIGHT_EXTEND %1 +%endmacro + +emu_edge sse +%ifdef ARCH_X86_32 +emu_edge mmx +%endif -- 2.11.0