--- /dev/null
-;* This file is part of Libav.
+ ;******************************************************************************
+ ;* x86-optimized functions for gradfun filter
+ ;*
-;* Libav is free software; you can redistribute it and/or
++;* This file is part of FFmpeg.
+ ;*
-;* Libav is distributed in the hope that it will be useful,
++;* FFmpeg is free software; you can redistribute it and/or
+ ;* modify it under the terms of the GNU Lesser General Public
+ ;* License as published by the Free Software Foundation; either
+ ;* version 2.1 of the License, or (at your option) any later version.
+ ;*
-;* License along with Libav; if not, write to the Free Software
++;* FFmpeg is distributed in the hope that it will be useful,
+ ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ ;* Lesser General Public License for more details.
+ ;*
+ ;* You should have received a copy of the GNU Lesser General Public
++;* License along with FFmpeg; if not, write to the Free Software
+ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ;******************************************************************************
+
+ %include "libavutil/x86/x86util.asm"
+
+ SECTION_RODATA
+
+ pw_7f: times 8 dw 0x7F ; eight words of 0x7F; loaded into m6 by both gradfun_filter_line variants
+ pw_ff: times 8 dw 0xFF ; eight words of 0x00FF; low-byte mask loaded into m7 by BLUR_LINE
+
+ SECTION .text
+
+ %macro FILTER_LINE 1 ; %1 = register of dither words; expects m5 = thresh in every word, m6 = pw_7f, m7 = 0, r0 = byte offset
+ movh m0, [r2+r0] ; load pixels from src (r2)
+ movh m1, [r3+r0] ; load dc words (r3): same byte count, so half as many words as pixels
+ punpcklbw m0, m7 ; zero-extend the pixels to words (m7 is zero)
+ punpcklwd m1, m1 ; duplicate each dc word so two adjacent pixels share one dc value
+ psllw m0, 7 ; pix << 7
+ psubw m1, m0 ; delta = dc - (pix << 7)
+ PABSW m2, m1 ; m2 = |delta| (PABSW is provided by x86util.asm)
+ pmulhuw m2, m5 ; (|delta| * thresh) >> 16, unsigned
+ psubw m2, m6 ; subtract 0x7F
+ pminsw m2, m7 ; m = min(previous result, 0)
+ pmullw m2, m2 ; m * m, low 16 bits
+ psllw m1, 2 ; delta << 2
+ paddw m0, %1 ; (pix << 7) + dither
+ pmulhw m1, m2 ; ((delta << 2) * (m * m)) >> 16, signed
+ paddw m0, m1 ; add the correction term to the dithered pixel
+ psraw m0, 7 ; arithmetic shift back down by 7
+ packuswb m0, m0 ; saturate the words to unsigned bytes
+ movh [r1+r0], m0 ; store the result to dst (r1)
+ %endmacro
+
+ INIT_MMX mmxext
+ cglobal gradfun_filter_line, 6, 6 ; args in C prototype order: r0=x, r1=dst, r2=src, r3=dc, r4=thresh, r5=dithers
+ movh m5, r4d ; thresh into the low dword of m5
+ pxor m7, m7 ; m7 = 0, as FILTER_LINE expects
+ pshufw m5, m5,0 ; broadcast the low word (thresh) to all 4 words
+ mova m6, [pw_7f] ; m6 = 0x7F in every word
+ mova m3, [r5] ; first 8 bytes of dithers: words 0..3
+ mova m4, [r5+8] ; next 8 bytes of dithers: words 4..7
+ .loop: ; the body runs before r0 is first tested, so at least 4 pixels are always processed
+ FILTER_LINE m3 ; 4 pixels using dither words 0..3
+ add r0, 4
+ jge .end ; leave once the offset is no longer negative
+ FILTER_LINE m4 ; next 4 pixels using dither words 4..7
+ add r0, 4
+ jl .loop ; keep going while the offset is still negative
+ .end:
+ REP_RET
+
+ INIT_XMM ssse3
+ cglobal gradfun_filter_line, 6, 6, 8 ; args in C prototype order: r0=x, r1=dst, r2=src, r3=dc, r4=thresh, r5=dithers
+ movd m5, r4d ; thresh into the low dword of m5
+ pxor m7, m7 ; m7 = 0, as FILTER_LINE expects
+ pshuflw m5, m5, 0 ; broadcast thresh across the low 4 words
+ mova m6, [pw_7f] ; m6 = 0x7F in every word
+ punpcklqdq m5, m5 ; copy the low qword into the high qword: thresh in all 8 words
+ mova m4, [r5] ; 16 bytes of dithers: all 8 words
+ .loop: ; the body runs before r0 is first tested, so at least 8 pixels are always processed
+ FILTER_LINE m4 ; 8 pixels per iteration
+ add r0, 8
+ jl .loop ; keep going while the offset is still negative
+ REP_RET
+
+ %macro BLUR_LINE 1 ; %1 = load instruction used for the two source rows (instantiated with movdqa and movdqu)
+ cglobal gradfun_blur_line_%1, 6, 6, 8 ; args in C prototype order: r0=x, r1=buf, r2=buf1, r3=dc, r4=src1, r5=src2
+ mova m7, [pw_ff] ; low-byte mask for every word
+ .loop: ; the body runs before r0 is first tested
+ %1 m0, [r4+r0] ; 16 bytes from src1
+ %1 m1, [r5+r0] ; 16 bytes from src2
+ mova m2, m0 ; keep a copy of the src1 bytes
+ mova m3, m1 ; keep a copy of the src2 bytes
+ psrlw m0, 8 ; high byte of each src1 word
+ psrlw m1, 8 ; high byte of each src2 word
+ pand m2, m7 ; low byte of each src1 word
+ pand m3, m7 ; low byte of each src2 word
+ paddw m0, m1 ; high bytes of both rows
+ paddw m2, m3 ; low bytes of both rows
+ paddw m0, m2 ; each word = sum of 2 adjacent bytes from each of the two rows
+ paddw m0, [r2+r0] ; add the corresponding buf1 words
+ mova m1, [r1+r0] ; read the previous buf words before overwriting them
+ mova [r1+r0], m0 ; buf = new sum
+ psubw m0, m1 ; new sum - previous buf value
+ mova [r3+r0], m0 ; dc = that difference
+ add r0, 16 ; advance 16 bytes (8 words) per iteration
+ jl .loop ; keep going while the offset is still negative
+ REP_RET
+ %endmacro
+
+ INIT_XMM sse2
+ BLUR_LINE movdqa ; aligned source loads; declared in C as ff_gradfun_blur_line_movdqa_sse2
+ BLUR_LINE movdqu ; unaligned source loads; declared in C as ff_gradfun_blur_line_movdqu_sse2
--- /dev/null
- * This file is part of Libav.
+ /*
+ * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
- * Libav is free software; you can redistribute it and/or
++ * This file is part of FFmpeg.
+ *
- * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
- * License along with Libav; if not, write to the Free Software
++ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
-void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src,
- uint16_t *dc, int thresh,
++ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ #include "config.h"
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/mem.h"
+ #include "libavutil/x86/asm.h"
+ #include "libavutil/x86/cpu.h"
+ #include "libavfilter/gradfun.h"
+
+ #if HAVE_YASM
-void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src,
- uint16_t *dc, int thresh,
++void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src,
++ const uint16_t *dc, int thresh,
+ const uint16_t *dithers);
+
+ /*
+  * Filter one line of pixels using the MMXEXT kernel.
+  *
+  * The kernel works on groups of 4 pixels, so any trailing width & 3 pixels
+  * are handed to ff_gradfun_filter_line_c() first and the kernel then covers
+  * the leading multiple-of-4 part.
+  *
+  * dst     destination pixels
+  * src     source pixels
+  * dc      word buffer indexed with one entry per two pixels
+  * width   number of pixels in the line
+  * thresh  threshold value forwarded unchanged to the kernels
+  * dithers dither words forwarded unchanged to the kernels
+  */
+ static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc,
+                                        int width, int thresh,
+                                        const uint16_t *dithers)
+ {
+     intptr_t x;
+
+     if (width & 3) {
+         x = width & ~3;
+         ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+         width = x;
+     }
+
+     /*
+      * The kernel executes its loop body before testing the counter, so
+      * calling it with x == 0 would still read and write one group of
+      * 4 pixels past the end of the line. Skip it when no full group is left.
+      */
+     if (!width)
+         return;
+
+     /* The kernel indexes from the end of the span with a negative offset. */
+     x = -width;
+     ff_gradfun_filter_line_mmxext(x, dst + width, src + width, dc + width / 2,
+                                   thresh, dithers);
+ }
+
-static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
++void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
++ const uint16_t *dc, int thresh,
+ const uint16_t *dithers);
+
+ /*
+  * Filter one line of pixels using the SSSE3 kernel.
+  *
+  * The kernel works on groups of 8 pixels, so any trailing width & 7 pixels
+  * are handed to ff_gradfun_filter_line_c() first and the kernel then covers
+  * the leading multiple-of-8 part.
+  *
+  * dst     destination pixels
+  * src     source pixels
+  * dc      word buffer indexed with one entry per two pixels
+  * width   number of pixels in the line
+  * thresh  threshold value forwarded unchanged to the kernels
+  * dithers dither words forwarded unchanged to the kernels
+  */
+ static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+ {
+     intptr_t x;
+
+     if (width & 7) {
+         // could be 10% faster if I somehow eliminated this
+         x = width & ~7;
+         ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+         width = x;
+     }
+
+     /*
+      * The kernel executes its loop body before testing the counter, so
+      * calling it with x == 0 would still read and write one group of
+      * 8 pixels past the end of the line. Skip it when no full group is left.
+      */
+     if (!width)
+         return;
+
+     /* The kernel indexes from the end of the span with a negative offset. */
+     x = -width;
+     ff_gradfun_filter_line_ssse3(x, dst + width, src + width, dc + width / 2,
+                                  thresh, dithers);
+ }
+
+ /*
+  * Assembly kernels. They only read through buf1, src1 and src2, and write
+  * through buf and dc, so the read-only pointers are const-qualified; this
+  * lets the wrapper below pass its const arguments without discarding
+  * qualifiers.
+  */
+ void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf, const uint16_t *buf1, uint16_t *dc, const uint8_t *src1, const uint8_t *src2);
+ void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf, const uint16_t *buf1, uint16_t *dc, const uint8_t *src1, const uint8_t *src2);
+
+ /*
+  * Blur one line with the SSE2 kernel.
+  *
+  * Each output word is built from two adjacent source bytes on each of two
+  * rows (src and src + src_linesize), hence the width * 2 source span.
+  *
+  * dc           output words, written by the kernel
+  * buf          word buffer, read and then overwritten by the kernel
+  * buf1         word buffer, only read
+  * src          first source row, only read
+  * src_linesize byte distance from the first source row to the second
+  * width        number of output words
+  */
++static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
+ {
+     /* Negative byte offset: the kernel counts up from the end of each span. */
+     intptr_t x = -2*width;
+
+     /* Use unaligned loads unless both the row pointer and the stride are 16-byte aligned. */
+     if (((intptr_t) src | src_linesize) & 15) {
+         ff_gradfun_blur_line_movdqu_sse2(x, buf + width, buf1 + width,
+                                          dc + width, src + width * 2,
+                                          src + width * 2 + src_linesize);
+     } else {
+         ff_gradfun_blur_line_movdqa_sse2(x, buf + width, buf1 + width,
+                                          dc + width, src + width * 2,
+                                          src + width * 2 + src_linesize);
+     }
+ }
+ #endif /* HAVE_YASM */
+
+ /*
+  * Install the x86 line functions supported by the running CPU into gf.
+  * Leaves gf untouched when the external assembly is not built.
+  */
+ av_cold void ff_gradfun_init_x86(GradFunContext *gf)
+ {
+ #if HAVE_YASM
+     const int flags = av_get_cpu_flags();
+
+     if (EXTERNAL_SSE2(flags))
+         gf->blur_line = gradfun_blur_line_sse2;
+
+     /* SSSE3 takes precedence over MMXEXT for the filter step. */
+     if (EXTERNAL_SSSE3(flags))
+         gf->filter_line = gradfun_filter_line_ssse3;
+     else if (EXTERNAL_MMXEXT(flags))
+         gf->filter_line = gradfun_filter_line_mmxext;
+ #endif /* HAVE_YASM */
+ }