From 32f8fb8ecf8178b9c9ec8d7152f1fdd8537f7f3a Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 24 Apr 2011 17:50:17 -0400
Subject: [PATCH] Add float_interleave() to FmtConvertContext with
 x86-optimized versions.

Partially based on patches by clsid2 in ffdshow-tryout.
ff_float_interleave6() x86 improvements by Loren Merritt.
---
 libavcodec/fmtconvert.c         |  20 ++++++
 libavcodec/fmtconvert.h         |   9 +++
 libavcodec/x86/fmtconvert.asm   | 141 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c |  30 +++++++++
 4 files changed, 200 insertions(+)

diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index e9707555af..58fece70b2 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
     }
 }
 
+void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
+                           int channels)
+{ /* Interleave len samples from each of the src[0..channels-1] arrays into dst. */
+    int j, c;
+    unsigned int i;
+    if (channels == 2) { /* two channels: write both samples of a frame per iteration */
+        for (i = 0; i < len; i++) {
+            dst[2*i]   = src[0][i];
+            dst[2*i+1] = src[1][i];
+        }
+    } else if (channels == 1 && len < INT_MAX / sizeof(float)) { /* one channel: plain copy, taken only while the byte count stays below INT_MAX */
+        memcpy(dst, src[0], len * sizeof(float));
+    } else { /* any other case: one pass per channel, writing with a stride of channels */
+        for (c = 0; c < channels; c++)
+            for (i = 0, j = c; i < len; i++, j += channels) /* j is the output index of sample i of channel c */
+                dst[j] = src[c][i];
+    }
+}
+
 av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
 {
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->float_to_int16             = float_to_int16_c;
     c->float_to_int16_interleave  = float_to_int16_interleave_c;
+    c->float_interleave           = ff_float_interleave_c;
 
     if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
     if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index e0afee47e1..d7741135b7 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -68,8 +68,17 @@ typedef struct FmtConvertContext {
      */
     void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                       long len, int channels);
+
+    /**
+     * Convert multiple arrays of float to an array of interleaved float.
+     */
+    void (*float_interleave)(float *dst, const float **src, unsigned int len,
+                             int channels);
 } FmtConvertContext;
 
+void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
+                           int channels);
+
 void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
 
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 5cd8f6c596..e023b48322 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -20,6 +20,7 @@
 ;******************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 section .text align=16
 
@@ -89,3 +90,143 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
 %undef pswapd
 FLOAT_TO_INT16_INTERLEAVE6 3dn2
 %undef cvtps2pi
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro BUTTERFLYPS 3 ; args: register numbers a, b, tmp; interleaves the floats of m<a> with those of m<b>
+    movaps    m%3, m%1 ; tmp = copy of a, needed for the high half below
+    unpcklps  m%1, m%2 ; a   = a0 b0 a1 b1 (low halves interleaved)
+    unpckhps  m%3, m%2 ; tmp = a2 b2 a3 b3 (high halves interleaved)
+    SWAP %2, %3 ; x86inc SWAP exchanges the two register names, so the high-half result is referred to as m<b> from here on
+%endmacro
+
+%macro FLOAT_INTERLEAVE6 2 ; args: ISA suffix (mmx or sse), value forwarded as the fourth cglobal argument
+cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+    %define lend r10d
+    mov     lend, r2d ; keep len in r10d; NOTE(review): presumably needed because src1 is assigned r2 by cglobal -- confirm against x86inc
+%else
+    %define lend dword r2m
+%endif
+    mov    src1q, [srcq+1*gprsize] ; fetch the channel pointers src[1] .. src[5]
+    mov    src2q, [srcq+2*gprsize]
+    mov    src3q, [srcq+3*gprsize]
+    mov    src4q, [srcq+4*gprsize]
+    mov    src5q, [srcq+5*gprsize]
+    mov     srcq, [srcq] ; srcq now holds src[0], the channel 0 data pointer
+    sub    src1q, srcq ; turn src1 .. src5 into byte offsets relative to channel 0,
+    sub    src2q, srcq ; so that advancing srcq alone advances all six channels
+    sub    src3q, srcq
+    sub    src4q, srcq
+    sub    src5q, srcq
+.loop: ; the body precedes the length test, so it runs at least once
+%ifidn %1, sse
+    movaps    m0, [srcq] ; one register of samples from each channel (movaps needs 16-byte aligned addresses)
+    movaps    m1, [srcq+src1q]
+    movaps    m2, [srcq+src2q]
+    movaps    m3, [srcq+src3q]
+    movaps    m4, [srcq+src4q]
+    movaps    m5, [srcq+src5q]
+
+    BUTTERFLYPS 0, 1, 6 ; m0 = ch0/ch1 samples 0,1 interleaved, m1 = samples 2,3
+    BUTTERFLYPS 2, 3, 6 ; m2 = ch2/ch3 samples 0,1 interleaved, m3 = samples 2,3
+    BUTTERFLYPS 4, 5, 6 ; m4 = ch4/ch5 samples 0,1 interleaved, m5 = samples 2,3
+
+    movaps    m6, m4 ; save m4, the shufps below overwrites its high half
+    shufps    m4, m0, 0xe4 ; m4 = low half of m4, high half of m0
+    movlhps   m0, m2 ; m0 = low half of m0, low half of m2
+    movhlps   m6, m2 ; m6 = high half of m2, high half of the saved m4
+    movaps [dstq   ], m0 ; output frames 0 and 1 (12 floats)
+    movaps [dstq+16], m4
+    movaps [dstq+32], m6
+
+    movaps    m6, m5 ; same shuffle on the registers holding samples 2,3
+    shufps    m5, m1, 0xe4
+    movlhps   m1, m3
+    movhlps   m6, m3
+    movaps [dstq+48], m1 ; output frames 2 and 3 (12 floats)
+    movaps [dstq+64], m5
+    movaps [dstq+80], m6
+%else ; mmx
+    movq       m0, [srcq] ; 8 bytes (two floats) from each channel
+    movq       m1, [srcq+src1q]
+    movq       m2, [srcq+src2q]
+    movq       m3, [srcq+src3q]
+    movq       m4, [srcq+src4q]
+    movq       m5, [srcq+src5q]
+
+    SBUTTERFLY dq, 0, 1, 6 ; NOTE(review): SBUTTERFLY is defined in x86util.asm (not shown); presumably dword-interleaves the pair, low result in the first register
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+    movq [dstq   ], m0 ; first 24 output bytes come from m0, m2, m4
+    movq [dstq+ 8], m2
+    movq [dstq+16], m4
+    movq [dstq+24], m1 ; next 24 output bytes come from m1, m3, m5
+    movq [dstq+32], m3
+    movq [dstq+40], m5
+%endif
+    add      srcq, mmsize ; advance one register width in every channel
+    add      dstq, mmsize*6 ; six registers of output were written
+    sub      lend, mmsize/4 ; mmsize/4 = number of 4-byte floats per register
+    jg .loop ; signed test: repeat while the remaining count is positive; NOTE(review): assumes len is a multiple of mmsize/4 -- confirm with callers
+%ifidn %1, mmx
+    emms ; clear MMX state before returning
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+FLOAT_INTERLEAVE6 mmx, 0 ; emits float_interleave6_mmx; 0 is forwarded as the fourth cglobal argument
+INIT_XMM
+FLOAT_INTERLEAVE6 sse, 7 ; emits float_interleave6_sse; the sse path uses m0 through m6
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro FLOAT_INTERLEAVE2 2 ; args: ISA suffix (mmx or sse), value forwarded as the fourth cglobal argument
+cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
+    mov     src1q, [srcq+gprsize] ; src1q = src[1], the channel 1 data pointer
+    mov      srcq, [srcq        ] ; srcq  = src[0], the channel 0 data pointer
+    sub     src1q, srcq ; src1q becomes the byte offset of channel 1 relative to channel 0
+.loop: ; the body precedes the length test, so it runs at least once
+    MOVPS      m0, [srcq             ] ; two registers of samples from each channel
+    MOVPS      m1, [srcq+src1q       ]
+    MOVPS      m3, [srcq      +mmsize]
+    MOVPS      m4, [srcq+src1q+mmsize]
+
+    MOVPS      m2, m0 ; copy channel 0 so both halves can be interleaved
+    PUNPCKLDQ  m0, m1 ; m0 = low halves of the first register pair interleaved
+    PUNPCKHDQ  m2, m1 ; m2 = high halves of the first register pair interleaved
+
+    MOVPS      m1, m3 ; same for the second register pair, reusing m1 as the copy
+    PUNPCKLDQ  m3, m4
+    PUNPCKHDQ  m1, m4
+
+    MOVPS [dstq         ], m0 ; stores follow sample order: m0, m2, m3, m1
+    MOVPS [dstq+1*mmsize], m2
+    MOVPS [dstq+2*mmsize], m3
+    MOVPS [dstq+3*mmsize], m1
+
+    add      srcq, mmsize*2 ; two registers consumed per channel
+    add      dstq, mmsize*4 ; four registers of output written
+    sub      lend, mmsize/2 ; two registers of 4-byte floats = mmsize/2 samples per channel
+    jg .loop ; signed test: repeat while the remaining count is positive; NOTE(review): assumes len is a multiple of mmsize/2 -- confirm with callers
+%ifidn %1, mmx
+    emms ; clear MMX state before returning
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX ; MMX build: the generic mnemonics used by FLOAT_INTERLEAVE2 map to integer MMX instructions
+%define MOVPS     movq
+%define PUNPCKLDQ punpckldq
+%define PUNPCKHDQ punpckhdq
+FLOAT_INTERLEAVE2 mmx, 0 ; emits float_interleave2_mmx
+INIT_XMM ; SSE build: same macro body with the float SSE equivalents
+%define MOVPS     movaps
+%define PUNPCKLDQ unpcklps
+%define PUNPCKHDQ unpckhps
+FLOAT_INTERLEAVE2 sse, 5 ; emits float_interleave2_sse; the macro body uses m0 through m4
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 847bd80fcd..61a4272a69 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
 
+void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
+void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
+
+void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
+void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
+
+static void float_interleave_mmx(float *dst, const float **src,
+                                 unsigned int len, int channels)
+{ /* Dispatch on channel count: MMX kernels for 2 and 6 channels, C code otherwise. */
+    if (channels == 2) {
+        ff_float_interleave2_mmx(dst, src, len);
+    } else if (channels == 6)
+        ff_float_interleave6_mmx(dst, src, len);
+    else
+        ff_float_interleave_c(dst, src, len, channels); /* no dedicated kernel for this channel count */
+}
+
+static void float_interleave_sse(float *dst, const float **src,
+                                 unsigned int len, int channels)
+{ /* Dispatch on channel count: SSE kernels for 2 and 6 channels, C code otherwise. */
+    if (channels == 2) {
+        ff_float_interleave2_sse(dst, src, len);
+    } else if (channels == 6)
+        ff_float_interleave6_sse(dst, src, len);
+    else
+        ff_float_interleave_c(dst, src, len, channels); /* no dedicated kernel for this channel count */
+}
+
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
 
     if (mm_flags & AV_CPU_FLAG_MMX) {
+        c->float_interleave = float_interleave_mmx;
 
         if(mm_flags & AV_CPU_FLAG_3DNOW){
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
             c->float_to_int16 = float_to_int16_sse;
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
+            c->float_interleave = float_interleave_sse;
         }
         if(mm_flags & AV_CPU_FLAG_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-- 
2.11.0