;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_zzzzzzzz77777777: times 8 db -1 ; together with pb_7 below this forms the
pb_7:                times 8 db  7 ; 16-byte pshufb mask the label spells out
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION .text

%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
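; Rough C equivalent, for orientation only (a sketch, not necessarily the
; exact C fallback; the asm indexes both vectors with a negative offset
; from their ends, the usual x86 loop idiom):
;   int sum = 0;
;   for (int i = 0; i < order; i++)
;       sum += v1[i] * v2[i];
;   return sum;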
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; 16x16->32 multiplies, pairwise-summed
    pmaddwd m1, [v2q + orderq + mmsize]
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
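; Rough C equivalent (a sketch): returns the dot product of v1 and v2 while
; updating v1 in place:
;   int sum = 0;
;   for (int i = 0; i < order; i++) {
;       sum   += v1[i] * v2[i];
;       v1[i] += mul * v3[i];
;   }
;   return sum;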
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
%macro SCALARPRODUCT_LOOP 1
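; %1 is presumably the byte shift handed to palignr: v1 is kept aligned, and
; each misaligned v2/v3 vector is then stitched together from the two aligned
; loads below (a sketch of the idea, inferred from the m4/m0 and m5/m2 loads).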
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0  [v1q + orderq]
%define t1  [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; a linear chain is faster than a branch tree or a jump table, because the
; branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
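; Rough C equivalent of the rounded variant (a sketch; the multiply is Q15
; fixed-point, and the second half reads the window reversed, which assumes
; a symmetric window):
;   for (unsigned i = 0; i < len; i++)
;       output[i] = (input[i] * window[i] + (1 << 14)) >> 15;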
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
%elif cpuflag(mmxext)
%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
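; That is: dst = (pmulhw(dst,src) << 1) | ((uint16_t)pmullw(dst,src) >> 15).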
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova    m5, [pb_revwords]
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
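; Rough C equivalent (a sketch; mid_pred() is the median of three values,
; with l = *left and tl = *left_top on entry):
;   for (int i = 0; i < w; i++) {
;       l      = mid_pred(l, top[i], (l + top[i] - tl) & 0xFF) + diff[i];
;       tl     = top[i];
;       dst[i] = l;
;   }
;   *left = l; *left_top = tl;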
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    movhps  [dstq+wq+8], m0
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
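; Rough C equivalent (a sketch): a running byte-wise prefix sum seeded with
; 'left'; the SIMD versions below compute the prefix sum in a handful of
; shifted adds, using the pb_zz* masks above to broadcast partial sums:
;   int l = left;
;   for (int i = 0; i < w; i++)
;       dst[i] = l = (l + src[i]) & 0xFF;
;   return l;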
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1

cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
    ADD_HFYU_LEFT_LOOP 0, 1
    ADD_HFYU_LEFT_LOOP 0, 0
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
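; Rough C equivalent (a sketch):
;   for (unsigned i = 0; i < len; i++)
;       dst[i] = av_clip(src[i], min, max);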
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0
; %1 = aligned/unaligned
    pshuflw m0, m0, 10110001b ; 10110001b selects words 1,0,3,2: swap the
    pshuflw m1, m1, 10110001b ; two words inside each dword
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    pshuflw m0, m0, 10110001b ; as above: swap the two words in each dword
    pshufhw m0, m0, 10110001b
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
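; Rough C equivalent (a sketch):
;   for (int i = 0; i < w; i++)
;       dst[i] = av_bswap32(src[i]);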
cglobal bswap32_buf, 3,4,3
    mova    m2, [pb_bswap32]

cglobal bswap32_buf, 3,4,5