x86/hevc_res_add: add ff_hevc_transform_add32_8_avx2

author James Almer <jamrial@gmail.com>

Mon, 1 Sep 2014 02:43:02 +0000 (23:43 -0300)

committer James Almer <jamrial@gmail.com>

Thu, 4 Sep 2014 23:21:29 +0000 (20:21 -0300)
author James Almer <jamrial@gmail.com>
Mon, 1 Sep 2014 02:43:02 +0000 (23:43 -0300)
committer James Almer <jamrial@gmail.com>
Thu, 4 Sep 2014 23:21:29 +0000 (20:21 -0300)
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm

index 7238fb3..488c5b7 100644 (file)
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -89,8 +89,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
  %endmacro
  
  %macro TR_ADD_SSE_16_32_8 3
-    mova              m2, [r1+%1   ]
-    mova              m6, [r1+%1+16]
+    mova             xm2, [r1+%1   ]
+    mova             xm6, [r1+%1+16]
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [r1+%1+32], 1
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
  %if cpuflag(avx)
      psubw             m1, m0, m2
      psubw             m5, m0, m6
@@ -103,8 +107,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
      packuswb          m2, m6
      packuswb          m1, m5
  
-    mova              m4, [r1+%1+32]
-    mova              m6, [r1+%1+48]
+    mova             xm4, [r1+%1+mmsize*2   ]
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [r1+%1+96 ], 1
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
  %if cpuflag(avx)
      psubw             m3, m0, m4
      psubw             m5, m0, m6
@@ -169,6 +177,21 @@ TRANSFORM_ADD_8
  INIT_XMM avx
  TRANSFORM_ADD_8
  
+INIT_YMM avx2
+; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
+    add                r1, 256
+    lea                r0, [r0+r2*4]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+    RET
+
  ;-----------------------------------------------------------------------------
  ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
  ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h

index 839e052..8dea142 100644 (file)
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -143,6 +143,8 @@ void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
  void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
  void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
  
+void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
  void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
  void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
  void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c

index 6bcced6..eaa97e1 100644 (file)
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -555,6 +555,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
          if (EXTERNAL_AVX2(cpu_flags)) {
              c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
              c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx2;
          }
      } else if (bit_depth == 10) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
author	James Almer <jamrial@gmail.com>
	Mon, 1 Sep 2014 02:43:02 +0000 (23:43 -0300)
committer	James Almer <jamrial@gmail.com>
	Thu, 4 Sep 2014 23:21:29 +0000 (20:21 -0300)
libavcodec/x86/hevc_res_add.asm		patch | blob | history
libavcodec/x86/hevcdsp.h		patch | blob | history
libavcodec/x86/hevcdsp_init.c		patch | blob | history