;-----------------------------------------------------------------------------
; ADD_DC pos, neg, offset, mov
;   %1 = register with the non-negative part of the DC (added with unsigned
;        saturation)
;   %2 = register with the magnitude of the negative part of the DC
;        (subtracted with unsigned saturation)
;   %3 = byte offset applied to every row address
;   %4 = move mnemonic used for both the row loads and the row stores
; Applies the DC to four rows: dst1, dst1+stride, dst2 and dst2+stride.
; The callers in this file build %2 as (0 - DC) and pack both values with
; packuswb, so at most one of %1/%2 is non-zero per lane and the add/sub pair
; behaves like a clamped signed add.
; Clobbers m2-m5. Lines marked '-' are the positional-register spelling
; (r0/r1/r2) and lines marked '+' the named-argument spelling
; (dst1q/dst2q/strideq) of the same loads and stores.
%macro ADD_DC 4
- %4 m2, [r0+%3]
- %4 m3, [r0+r2+%3]
- %4 m4, [r1+%3]
- %4 m5, [r1+r2+%3]
+ %4 m2, [dst1q+%3]
+ %4 m3, [dst1q+strideq+%3]
+ %4 m4, [dst2q+%3]
+ %4 m5, [dst2q+strideq+%3]
; add the positive DC part to all four rows
paddusb m2, %1
paddusb m3, %1
paddusb m4, %1
paddusb m5, %1
; subtract the negative DC part from all four rows
psubusb m2, %2
psubusb m3, %2
psubusb m4, %2
psubusb m5, %2
- %4 [r0+%3], m2
- %4 [r0+r2+%3], m3
- %4 [r1+%3], m4
- %4 [r1+r2+%3], m5
+ %4 [dst1q+%3], m2
+ %4 [dst1q+strideq+%3], m3
+ %4 [dst2q+%3], m4
+ %4 [dst2q+strideq+%3], m5
%endmacro
; MMX vp8_idct_dc_add(dst, block, stride): derives a DC value from the first
; coefficient of block, clears that coefficient, and applies the DC to four
; rows of dst through ADD_DC.
; Lines marked '-' use positional registers (r0/r1/r2); the '+' lines are the
; same instructions written with the named arguments declared by cglobal.
INIT_MMX mmx
-cglobal vp8_idct_dc_add, 3, 3
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data (movd reads the first 32 bits of block)
- movd m0, [r1]
+ movd m0, [blockq]
; calculate DC: add the rounding constant, then arithmetic shift right by 3
; (pw_4 is defined outside this view; presumably packed words of 4)
paddw m0, [pw_4]
pxor m1, m1
psraw m0, 3
; write 32 bits of zero back over the coefficient that was just read
- movd [r1], m1
+ movd [blockq], m1
; m1 = 0 - DC; packuswb then saturates to unsigned bytes, so m0 keeps the DC
; only where it is positive and m1 keeps its magnitude only where it is negative
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
; NOTE(review): only m1 is widened here; m0 gets no matching replication step
; in this view. Confirm against the full file that both operands of ADD_DC are
; replicated across all byte lanes.
punpcklwd m1, m1
; add DC
; block is no longer needed, so its register is renamed dst2 and pointed two
; rows below dst1
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh
RET
; SSE4 vp8_idct_dc_add(dst, block, stride): same job as the MMX version, but
; the DC is added in 16-bit lanes and the result is packed back to bytes
; instead of going through ADD_DC.
; Lines marked '-' use positional registers (r0/r1/r2); the '+' lines are the
; same instructions written with the named arguments declared by cglobal.
INIT_XMM sse4
-cglobal vp8_idct_dc_add, 3, 3, 6
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data (movd reads the first 32 bits of block)
- movd m0, [r1]
+ movd m0, [blockq]
pxor m1, m1
; calculate DC
; (pw_4 is defined outside this view; presumably packed words of 4)
paddw m0, [pw_4]
; write 32 bits of zero back over the coefficient, then reuse the block
; register as dst2 = dst1 + 2*stride and load 4 bytes from each of four rows
- movd [r1], m1
- lea r1, [r0+r2*2]
- movd m2, [r0]
- movd m3, [r0+r2]
- movd m4, [r1]
- movd m5, [r1+r2]
+ movd [blockq], m1
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ movd m2, [dst1q]
+ movd m3, [dst1q+strideq]
+ movd m4, [dst2q]
+ movd m5, [dst2q+strideq]
; finish the DC (>> 3) and broadcast its low word to all eight word lanes
psraw m0, 3
pshuflw m0, m0, 0
punpcklqdq m0, m0
; NOTE(review): in this view m3 and m5 are loaded but never merged into
; m2/m4, and the row bytes are not widened to words before paddw. The
; interleave/unpack steps appear to be missing - verify against the full file.
paddw m2, m0
paddw m4, m0
; pack both halves back to bytes with unsigned saturation
packuswb m2, m4
; store one dword per row: dword 0 via movd, dwords 1-3 via pextrd
- movd [r0], m2
- pextrd [r0+r2], m2, 1
- pextrd [r1], m2, 2
- pextrd [r1+r2], m2, 3
+ movd [dst1q], m2
+ pextrd [dst1q+strideq], m2, 1
+ pextrd [dst2q], m2, 2
+ pextrd [dst2q+strideq], m2, 3
RET
;-----------------------------------------------------------------------------
; MMX vp8_idct_dc_add4y(dst, block, stride), assembled only when ARCH_X86_32
; is set: gathers four DC values located 32 bytes apart in block, clears them,
; and applies them to dst with two ADD_DC calls at byte offsets 0 and 8.
; Lines marked '-' use positional registers (r0/r1/r2); the '+' lines are the
; same instructions written with the named arguments declared by cglobal.
%if ARCH_X86_32
INIT_MMX mmx
-cglobal vp8_idct_dc_add4y, 3, 3
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
; load data: one DC word from each of the four coefficient sets A..D
- movd m0, [r1+32*0] ; A
- movd m1, [r1+32*2] ; C
- punpcklwd m0, [r1+32*1] ; A B
- punpcklwd m1, [r1+32*3] ; C D
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
; calculate DC
; (pw_4 is defined outside this view; presumably packed words of 4)
paddw m0, [pw_4]
; write 32 bits of zero over each of the four DC positions just read
- movd [r1+32*0], m6
- movd [r1+32*1], m6
- movd [r1+32*2], m6
- movd [r1+32*3], m6
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
psraw m0, 3
; m6 = 0 - DC (negated values); m0 is then saturated to unsigned bytes
psubw m6, m0
packuswb m0, m0
; NOTE(review): m7 is not written anywhere earlier in this function as shown,
; and m1 still holds the "C D" words from the load when it is passed to the
; second ADD_DC. The byte-replication sequence for m0/m1/m6/m7 looks incomplete
; in this view - verify against the full file.
punpckhbw m7, m7 ; CCCCDDDD
; add DC
; the block register is renamed dst2 and pointed two rows below dst1
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
; left 8 columns, then right 8 columns of the same four rows
ADD_DC m0, m6, 0, mova
ADD_DC m1, m7, 8, mova
RET
%endif
; SSE2 vp8_idct_dc_add4y(dst, block, stride): gathers four DC values located
; 32 bytes apart in block, clears them, and applies them to four rows of dst
; with a single 16-byte-wide ADD_DC.
; Lines marked '-' use positional registers (r0/r1/r2); the '+' lines are the
; same instructions written with the named arguments declared by cglobal.
INIT_XMM sse2
-cglobal vp8_idct_dc_add4y, 3, 3, 6
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
; load data: one DC word from each of the four coefficient sets A..D
- movd m0, [r1+32*0] ; A
- movd m1, [r1+32*2] ; C
- punpcklwd m0, [r1+32*1] ; A B
- punpcklwd m1, [r1+32*3] ; C D
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m1, m1
; calculate DC
; (pw_4 is defined outside this view; presumably packed words of 4)
paddw m0, [pw_4]
; write 32 bits of zero over each of the four DC positions just read
- movd [r1+32*0], m1
- movd [r1+32*1], m1
- movd [r1+32*2], m1
- movd [r1+32*3], m1
+ movd [blockq+32*0], m1
+ movd [blockq+32*1], m1
+ movd [blockq+32*2], m1
+ movd [blockq+32*3], m1
psraw m0, 3
; m1 = 0 - DC (negated values); m0 is then saturated to unsigned bytes
psubw m1, m0
packuswb m0, m0
; NOTE(review): m1 is unpacked without a preceding packuswb, and m0 receives
; no replication step here, so the per-byte broadcast of both ADD_DC operands
; looks incomplete in this view - verify against the full file.
punpcklbw m1, m1
; add DC
; the block register is renamed dst2 and pointed two rows below dst1
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, mova
RET
;-----------------------------------------------------------------------------
; MMX vp8_idct_dc_add4uv(dst, block, stride): gathers four DC values located
; 32 bytes apart in block, clears them, and applies them to dst in two passes:
; ADD_DC on the first four rows, then ADD_DC again after moving both row
; pointers down by four rows.
; Lines marked '-' use positional registers (r0/r1/r2); the '+' lines are the
; same instructions written with the named arguments declared by cglobal.
INIT_MMX mmx
-cglobal vp8_idct_dc_add4uv, 3, 3
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
; load data: one DC word from each of the four coefficient sets A..D
- movd m0, [r1+32*0] ; A
- movd m1, [r1+32*2] ; C
- punpcklwd m0, [r1+32*1] ; A B
- punpcklwd m1, [r1+32*3] ; C D
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
; calculate DC
; (pw_4 is defined outside this view; presumably packed words of 4)
paddw m0, [pw_4]
; write 32 bits of zero over each of the four DC positions just read
- movd [r1+32*0], m6
- movd [r1+32*1], m6
- movd [r1+32*2], m6
- movd [r1+32*3], m6
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
psraw m0, 3
; m6 = 0 - DC (negated values); m0 is then saturated to unsigned bytes
psubw m6, m0
packuswb m0, m0
; NOTE(review): m7 is not written anywhere earlier in this function as shown,
; and m1 still holds the "C D" words from the load when it is passed to the
; second ADD_DC. The byte-replication sequence for m0/m1/m6/m7 looks incomplete
; in this view - verify against the full file.
punpckhbw m7, m7 ; CCCCDDDD
; add DC
; the block register is renamed dst2 and pointed two rows below dst1
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova
; advance both row pointers by four rows for the second pass
- lea r0, [r0+r2*4]
- lea r1, [r1+r2*4]
+ lea dst1q, [dst1q+strideq*4]
+ lea dst2q, [dst2q+strideq*4]
ADD_DC m1, m7, 0, mova
RET
%endmacro
; VP8_IDCT_ADD: emits vp8_idct_add(dst, block, stride) for the instruction set
; selected before the macro is invoked. The function loads 32 bytes of
; coefficients, zeroes them in memory, and writes the result to four rows of
; dst through STORE_DIFFx2 (defined outside this view).
; Lines marked '-' use positional registers (r0/r1/r2); the '+' lines are the
; same instructions written with the named arguments declared by cglobal.
%macro VP8_IDCT_ADD 0
-cglobal vp8_idct_add, 3, 3
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
; load block data: four 8-byte rows into m0..m3
- movq m0, [r1+ 0]
- movq m1, [r1+ 8]
- movq m2, [r1+16]
- movq m3, [r1+24]
+ movq m0, [blockq+ 0]
+ movq m1, [blockq+ 8]
+ movq m2, [blockq+16]
+ movq m3, [blockq+24]
; transform constants (both symbols are defined outside this view)
movq m6, [pw_20091]
movq m7, [pw_17734]
; clear the 32 bytes of block that were just loaded
%if cpuflag(sse)
; two 16-byte stores; movaps faults on addresses that are not 16-byte aligned
; NOTE(review): xmm0 is named directly here, so this relies on m0 not aliasing
; xmm0 (i.e. an MMX instantiation) - confirm at the invocation sites.
xorps xmm0, xmm0
- movaps [r1+ 0], xmm0
- movaps [r1+16], xmm0
+ movaps [blockq+ 0], xmm0
+ movaps [blockq+16], xmm0
%else
; four 8-byte stores when SSE is not available
pxor m4, m4
- movq [r1+ 0], m4
- movq [r1+ 8], m4
- movq [r1+16], m4
- movq [r1+24], m4
+ movq [blockq+ 0], m4
+ movq [blockq+ 8], m4
+ movq [blockq+16], m4
+ movq [blockq+24], m4
%endif
; actual IDCT
; NOTE(review): no transform instructions appear between this marker and the
; store step in this view.
; store
pxor m4, m4
; the block register is renamed dst2 and pointed two rows below dst1;
; each STORE_DIFFx2 call is then given one row pointer plus the stride
- lea r1, [r0+2*r2]
- STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
- STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+2*strideq]
+ STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+ STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
RET
%endmacro
;-----------------------------------------------------------------------------
; SCATTER_WHT a, b, n
;   %1, %2 = indices of two vector registers (used as m%1 and m%2)
;   %3     = slot offset added to every destination index
; Writes the four low 16-bit words of m%1 to slots 0+n, 4+n, 8+n and 12+n,
; and the four low words of m%2 to slots 1+n, 5+n, 9+n and 13+n, where slot k
; is the 16-bit location at byte offset 2*16*k from the base pointer.
; Clobbers both scratch GPRs and shifts m%1/m%2 right by 32 bits.
; Lines marked '-' use positional registers (r0 = base, r1/r2 = scratch); the
; '+' lines use the names blockq/dc1/dc2, which the enclosing function must
; have declared.
%macro SCATTER_WHT 3
; words 0 and 1 of each vector register: low dword -> GPR, store its low word,
; then shift the GPR to expose the second word
- movd r1d, m%1
- movd r2d, m%2
- mov [r0+2*16*(0+%3)], r1w
- mov [r0+2*16*(1+%3)], r2w
- shr r1d, 16
- shr r2d, 16
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(0+%3)], dc1w
+ mov [blockq+2*16*(1+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
; bring the next dword of each vector register down into the low 32 bits
psrlq m%1, 32
psrlq m%2, 32
; store word 1, then repeat the same extract/store/shift pattern for
; words 2 and 3
- mov [r0+2*16*(4+%3)], r1w
- mov [r0+2*16*(5+%3)], r2w
- movd r1d, m%1
- movd r2d, m%2
- mov [r0+2*16*(8+%3)], r1w
- mov [r0+2*16*(9+%3)], r2w
- shr r1d, 16
- shr r2d, 16
- mov [r0+2*16*(12+%3)], r1w
- mov [r0+2*16*(13+%3)], r2w
+ mov [blockq+2*16*(4+%3)], dc1w
+ mov [blockq+2*16*(5+%3)], dc2w
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(8+%3)], dc1w
+ mov [blockq+2*16*(9+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
+ mov [blockq+2*16*(12+%3)], dc1w
+ mov [blockq+2*16*(13+%3)], dc2w
%endmacro
; HADAMARD4_1D a, b, c, d: takes four parameters; invoked in this file with
; four register indices (0, 1, 2, 3).
; NOTE(review): the macro body is empty in this view, so an invocation expands
; to nothing - confirm against the full file.
%macro HADAMARD4_1D 4
%endmacro
%macro VP8_DC_WHT 0
-cglobal vp8_luma_dc_wht, 2, 3
- movq m0, [r1]
- movq m1, [r1+8]
- movq m2, [r1+16]
- movq m3, [r1+24]
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+ movq m0, [dc1q]
+ movq m1, [dc1q+8]
+ movq m2, [dc1q+16]
+ movq m3, [dc1q+24]
%if cpuflag(sse)
xorps xmm0, xmm0
- movaps [r1+ 0], xmm0
- movaps [r1+16], xmm0
+ movaps [dc1q+ 0], xmm0
+ movaps [dc1q+16], xmm0
%else
pxor m4, m4
- movq [r1+ 0], m4
- movq [r1+ 8], m4
- movq [r1+16], m4
- movq [r1+24], m4
+ movq [dc1q+ 0], m4
+ movq [dc1q+ 8], m4
+ movq [dc1q+16], m4
+ movq [dc1q+24], m4
%endif
HADAMARD4_1D 0, 1, 2, 3
TRANSPOSE4x4W 0, 1, 2, 3, 4