AArch64: rewrite audioflinger's sinc resampler with NEON intrinsics.
author    Zhongwei Yao <zhongwei.yao@arm.com>
          Thu, 10 Apr 2014 16:23:42 +0000 (17:23 +0100)
committer Zheng Xu <zheng.xu@arm.com>
          Thu, 10 Apr 2014 17:28:46 +0000 (18:28 +0100)
Passes the conformance test on armv7 and aarch64; the performance test
was run on armv7. Compared with the original armv7 assembly version,
this version performs about the same. Performance data measured on a
Pandaboard running Android 4.4 (input: random wave file; unit: Mspl/s,
million samples per second; toolchain: gcc 4.8):

|                | original (assembly) | current (intrinsics) | C version |
|----------------+---------------------+----------------------+-----------|
| single channel | 6.17                | 7.14                 | 3.43      |
| double channel | 5.24                | 5.63                 | 3.50      |
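
The core per-iteration pattern is the same in the old assembly and the
new intrinsics: linearly interpolate between two adjacent coefficient
sets by the phase fraction, then multiply-accumulate the widened 16-bit
samples. Below is a minimal, self-contained sketch of that kernel for a
NEON-capable target; it is illustrative only (the helper name
dot4_interpolated and its signature are not part of this patch):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: sum += sample[i] * lerp(c0[i], c1[i], phase) over 4 lanes.
    // 'phase' is a Q31 fraction, as produced by vshl_n_s32(lerpPN, 16)
    // in the patch.
    static inline int32x4_t dot4_interpolated(int32x4_t sum,
                                              const int16_t *s,
                                              const int32_t *c0,
                                              const int32_t *c1,
                                              int32_t phase)
    {
        int32x4_t v0 = vld1q_s32(c0);                // coefs for phase index k
        int32x4_t v1 = vld1q_s32(c1);                // coefs for phase index k+1
        int32x4_t d  = vsubq_s32(v1, v0);            // interpolate (step1): delta
        d  = vqrdmulhq_lane_s32(d, vdup_n_s32(phase), 0); // (step2): delta * phase
        v0 = vaddq_s32(v0, d);                       // (step3): interpolated coefs
        int32x4_t x = vshll_n_s16(vld1_s16(s), 15);  // extend samples to 31 bits
        return vaddq_s32(sum, vqrdmulhq_s32(x, v0)); // multiply and accumulate
    }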

Change-Id: If5670218e1586e9dfd2b8d9c66a6880f3e4808ca

services/audioflinger/AudioResamplerSinc.cpp

index 207f26b..e50b192 100644
@@ -17,6 +17,7 @@
 #define LOG_TAG "AudioResamplerSinc"
 //#define LOG_NDEBUG 0
 
+#define __STDC_CONSTANT_MACROS
 #include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #define USE_INLINE_ASSEMBLY (false)
 #endif
 
-#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
-#define USE_NEON (true)
+#if defined(__aarch64__) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define USE_NEON
 #else
-#define USE_NEON (false)
+#undef USE_NEON
 #endif
 
+#define UNUSED(x) ((void)(x))
 
 namespace android {
 // ----------------------------------------------------------------------------
@@ -634,8 +637,8 @@ void AudioResamplerSinc::read(
 }
 
 template<int CHANNELS>
-void AudioResamplerSinc::filterCoefficient(
-        int32_t* out, uint32_t phase, const int16_t *samples, uint32_t vRL)
+void AudioResamplerSinc::filterCoefficient(int32_t* out, uint32_t phase,
+         const int16_t *samples, uint32_t vRL)
 {
     // NOTE: be very careful when modifying the code here. register
     // pressure is very high and a small change might cause the compiler
@@ -662,160 +665,171 @@ void AudioResamplerSinc::filterCoefficient(
 
     size_t count = offset;
 
-    if (!USE_NEON) {
-        int32_t l = 0;
-        int32_t r = 0;
-        for (size_t i=0 ; i<count ; i++) {
-            interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
-            sP -= CHANNELS;
-            interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
-            sN += CHANNELS;
-        }
-        out[0] += 2 * mulRL(1, l, vRL);
-        out[1] += 2 * mulRL(0, r, vRL);
-    } else if (CHANNELS == 1) {
+#ifndef USE_NEON
+    int32_t l = 0;
+    int32_t r = 0;
+    for (size_t i=0 ; i<count ; i++) {
+        interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
+        sP -= CHANNELS;
+        interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
+        sN += CHANNELS;
+    }
+    out[0] += 2 * mulRL(1, l, vRL);
+    out[1] += 2 * mulRL(0, r, vRL);
+#else
+    UNUSED(vRL);
+    if (CHANNELS == 1) {
         int32_t const* coefsP1 = coefsP + offset;
         int32_t const* coefsN1 = coefsN + offset;
         sP -= CHANNELS*3;
-        asm (
-            "vmov.32        d2[0], %[lerpP]          \n"    // load the positive phase
-            "vmov.32        d2[1], %[lerpN]          \n"    // load the negative phase
-            "veor           q0, q0, q0               \n"    // result, initialize to 0
-            "vshl.s32       d2, d2, #16              \n"    // convert to 32 bits
-
-            "1:                                      \n"
-            "vld1.16        { d4}, [%[sP]]           \n"    // load 4 16-bits stereo samples
-            "vld1.32        { q8}, [%[coefsP0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        { q9}, [%[coefsP1]:128]! \n"    // load 4 32-bits coefs for interpolation
-            "vld1.16        { d6}, [%[sN]]!          \n"    // load 4 16-bits stereo samples
-            "vld1.32        {q10}, [%[coefsN0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        {q11}, [%[coefsN1]:128]! \n"    // load 4 32-bits coefs for interpolation
-
-            "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
-
-            "vsub.s32        q9,  q9,  q8            \n"    // interpolate (step1) 1st set of coefs
-            "vsub.s32       q11, q11, q10            \n"    // interpolate (step1) 2nd set of coets
-            "vshll.s16      q12,  d4, #15            \n"    // extend samples to 31 bits
-
-            "vqrdmulh.s32    q9,  q9, d2[0]          \n"    // interpolate (step2) 1st set of coefs
-            "vqrdmulh.s32   q11, q11, d2[1]          \n"    // interpolate (step3) 2nd set of coefs
-            "vshll.s16      q14,  d6, #15            \n"    // extend samples to 31 bits
-
-            "vadd.s32        q8,  q8,  q9            \n"    // interpolate (step3) 1st set
-            "vadd.s32       q10, q10, q11            \n"    // interpolate (step4) 2nd set
-            "subs           %[count], %[count], #4   \n"    // update loop counter
-
-            "vqrdmulh.s32   q12, q12, q8             \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q14, q14, q10            \n"    // multiply samples by interpolated coef
-            "sub            %[sP], %[sP], #8         \n"    // move pointer to next set of samples
-
-            "vadd.s32       q0, q0, q12              \n"    // accumulate result
-            "vadd.s32       q0, q0, q14              \n"    // accumulate result
-
-            "bne            1b                       \n"    // loop
-
-            "vld1.s32       {d2}, [%[vLR]]           \n"    // load volumes
-            "vld1.s32       {d3}, %[out]             \n"    // load the output
-            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums
-            "vpadd.s32      d0, d0, d0               \n"    // together
-            "vdup.i32       d0, d0[0]                \n"    // interleave L,R channels
-            "vqrdmulh.s32   d0, d0, d2               \n"    // apply volume
-            "vadd.s32       d3, d3, d0               \n"    // accumulate result
-            "vst1.s32       {d3}, %[out]             \n"    // store result
-
-            : [out]     "=Uv" (out[0]),
-              [count]   "+r" (count),
-              [coefsP0] "+r" (coefsP),
-              [coefsP1] "+r" (coefsP1),
-              [coefsN0] "+r" (coefsN),
-              [coefsN1] "+r" (coefsN1),
-              [sP]      "+r" (sP),
-              [sN]      "+r" (sN)
-            : [lerpP]   "r" (lerpP),
-              [lerpN]   "r" (lerpN),
-              [vLR]     "r" (mVolumeSIMD)
-            : "cc", "memory",
-              "q0", "q1", "q2", "q3",
-              "q8", "q9", "q10", "q11",
-              "q12", "q14"
-        );
+
+        int32x4_t sum;
+        int32x2_t lerpPN;
+        lerpPN = vdup_n_s32(0);
+        lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0); // load the positive phase
+        lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1); // load the negative phase
+        lerpPN = vshl_n_s32(lerpPN, 16);                      // convert to 32 bits
+        sum = vdupq_n_s32(0);                                 // result, initialize to 0
+
+        int16x4_t sampleP, sampleN;
+        int32x4_t samplePExt, sampleNExt;
+        int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+        coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+        coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+        for (; count > 0; count -= 4) {
+            sampleP = vld1_s16(sP);          // load 4 16-bit samples
+            sampleN = vld1_s16(sN);          // load 4 16-bit samples
+            coefsPV0 = vld1q_s32(coefsP);    // load 4 32-bit coefs
+            coefsNV0 = vld1q_s32(coefsN);    // load 4 32-bit coefs
+            coefsPV1 = vld1q_s32(coefsP1);   // load 4 32-bit coefs for interpolation
+            coefsNV1 = vld1q_s32(coefsN1);   // load 4 32-bit coefs for interpolation
+            sP -= 4;
+            sN += 4;
+            coefsP += 4;
+            coefsN += 4;
+            coefsP1 += 4;
+            coefsN1 += 4;
+
+            sampleP = vrev64_s16(sampleP);   // reverse 2 frames of the positive side
+
+            // interpolate (step1)
+            coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+            coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+            samplePExt = vshll_n_s16(sampleP, 15);   // extend samples to 31 bits
+            // interpolate (step2)
+            coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+            coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+            sampleNExt = vshll_n_s16(sampleN, 15);   // extend samples to 31 bits
+            // interpolate (step3)
+            coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+            coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+            samplePExt = vqrdmulhq_s32(samplePExt, coefsPV0); // multiply samples by interpolated coef
+            sampleNExt = vqrdmulhq_s32(sampleNExt, coefsNV0); // multiply samples by interpolated coef
+            sum = vaddq_s32(sum, samplePExt);                 // accumulate result
+            sum = vaddq_s32(sum, sampleNExt);                 // accumulate result
+        }
+        int32x2_t volumesV, outV;
+        volumesV = vld1_s32(mVolumeSIMD);    // load volumes
+        outV = vld1_s32(out);                // load the output
+
+        // add all 4 partial sums
+        int32x2_t sumLow, sumHigh;
+        sumLow = vget_low_s32(sum);
+        sumHigh = vget_high_s32(sum);
+        sumLow = vpadd_s32(sumLow, sumHigh);
+        sumLow = vpadd_s32(sumLow, sumLow);
+
+        sumLow = vqrdmulh_s32(sumLow, volumesV); // apply volume
+        outV = vadd_s32(outV, sumLow);           // accumulate result
+        vst1_s32(out, outV);                     // store result
     } else if (CHANNELS == 2) {
         int32_t const* coefsP1 = coefsP + offset;
         int32_t const* coefsN1 = coefsN + offset;
         sP -= CHANNELS*3;
-        asm (
-            "vmov.32        d2[0], %[lerpP]          \n"    // load the positive phase
-            "vmov.32        d2[1], %[lerpN]          \n"    // load the negative phase
-            "veor           q0, q0, q0               \n"    // result, initialize to 0
-            "veor           q4, q4, q4               \n"    // result, initialize to 0
-            "vshl.s32       d2, d2, #16              \n"    // convert to 32 bits
-
-            "1:                                      \n"
-            "vld2.16        {d4,d5}, [%[sP]]         \n"    // load 4 16-bits stereo samples
-            "vld1.32        { q8}, [%[coefsP0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        { q9}, [%[coefsP1]:128]! \n"    // load 4 32-bits coefs for interpolation
-            "vld2.16        {d6,d7}, [%[sN]]!        \n"    // load 4 16-bits stereo samples
-            "vld1.32        {q10}, [%[coefsN0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        {q11}, [%[coefsN1]:128]! \n"    // load 4 32-bits coefs for interpolation
-
-            "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
-            "vrev64.16      d5, d5                   \n"    // reverse 2 frames of the positive side
-
-            "vsub.s32        q9,  q9,  q8            \n"    // interpolate (step1) 1st set of coefs
-            "vsub.s32       q11, q11, q10            \n"    // interpolate (step1) 2nd set of coets
-            "vshll.s16      q12,  d4, #15            \n"    // extend samples to 31 bits
-            "vshll.s16      q13,  d5, #15            \n"    // extend samples to 31 bits
-
-            "vqrdmulh.s32    q9,  q9, d2[0]          \n"    // interpolate (step2) 1st set of coefs
-            "vqrdmulh.s32   q11, q11, d2[1]          \n"    // interpolate (step3) 2nd set of coefs
-            "vshll.s16      q14,  d6, #15            \n"    // extend samples to 31 bits
-            "vshll.s16      q15,  d7, #15            \n"    // extend samples to 31 bits
-
-            "vadd.s32        q8,  q8,  q9            \n"    // interpolate (step3) 1st set
-            "vadd.s32       q10, q10, q11            \n"    // interpolate (step4) 2nd set
-            "subs           %[count], %[count], #4   \n"    // update loop counter
-
-            "vqrdmulh.s32   q12, q12, q8             \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q13, q13, q8             \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q14, q14, q10            \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q15, q15, q10            \n"    // multiply samples by interpolated coef
-            "sub            %[sP], %[sP], #16        \n"    // move pointer to next set of samples
-
-            "vadd.s32       q0, q0, q12              \n"    // accumulate result
-            "vadd.s32       q4, q4, q13              \n"    // accumulate result
-            "vadd.s32       q0, q0, q14              \n"    // accumulate result
-            "vadd.s32       q4, q4, q15              \n"    // accumulate result
-
-            "bne            1b                       \n"    // loop
-
-            "vld1.s32       {d2}, [%[vLR]]           \n"    // load volumes
-            "vld1.s32       {d3}, %[out]             \n"    // load the output
-            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums from q0
-            "vpadd.s32      d8, d8, d9               \n"    // add all 4 partial sums from q4
-            "vpadd.s32      d0, d0, d0               \n"    // together
-            "vpadd.s32      d8, d8, d8               \n"    // together
-            "vtrn.s32       d0, d8                   \n"    // interlace L,R channels
-            "vqrdmulh.s32   d0, d0, d2               \n"    // apply volume
-            "vadd.s32       d3, d3, d0               \n"    // accumulate result
-            "vst1.s32       {d3}, %[out]             \n"    // store result
-
-            : [out]     "=Uv" (out[0]),
-              [count]   "+r" (count),
-              [coefsP0] "+r" (coefsP),
-              [coefsP1] "+r" (coefsP1),
-              [coefsN0] "+r" (coefsN),
-              [coefsN1] "+r" (coefsN1),
-              [sP]      "+r" (sP),
-              [sN]      "+r" (sN)
-            : [lerpP]   "r" (lerpP),
-              [lerpN]   "r" (lerpN),
-              [vLR]     "r" (mVolumeSIMD)
-            : "cc", "memory",
-              "q0", "q1", "q2", "q3", "q4",
-              "q8", "q9", "q10", "q11",
-              "q12", "q13", "q14", "q15"
-        );
+
+        int32x4_t sum0, sum1;
+        int32x2_t lerpPN;
+
+        lerpPN = vdup_n_s32(0);
+        lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0); // load the positive phase
+        lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1); // load the negative phase
+        lerpPN = vshl_n_s32(lerpPN, 16);                      // convert to 32 bits
+        sum0 = vdupq_n_s32(0);                                // result, initialize to 0
+        sum1 = vdupq_n_s32(0);                                // result, initialize to 0
+
+        int16x4x2_t sampleP, sampleN;
+        int32x4x2_t samplePExt, sampleNExt;
+        int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+        coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+        coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+        for (; count > 0; count -= 4) {
+            sampleP = vld2_s16(sP);          // load and deinterleave 4 16-bit stereo frames
+            sampleN = vld2_s16(sN);          // load and deinterleave 4 16-bit stereo frames
+            coefsPV0 = vld1q_s32(coefsP);    // load 4 32-bit coefs
+            coefsNV0 = vld1q_s32(coefsN);    // load 4 32-bit coefs
+            coefsPV1 = vld1q_s32(coefsP1);   // load 4 32-bit coefs for interpolation
+            coefsNV1 = vld1q_s32(coefsN1);   // load 4 32-bit coefs for interpolation
+            sP -= 8;
+            sN += 8;
+            coefsP += 4;
+            coefsN += 4;
+            coefsP1 += 4;
+            coefsN1 += 4;
+
+            sampleP.val[0] = vrev64_s16(sampleP.val[0]); // reverse 2 frames of the positive side
+            sampleP.val[1] = vrev64_s16(sampleP.val[1]); // reverse 2 frames of the positive side
+
+            // interpolate (step1)
+            coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+            coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+            samplePExt.val[0] = vshll_n_s16(sampleP.val[0], 15); // extend samples to 31 bits
+            samplePExt.val[1] = vshll_n_s16(sampleP.val[1], 15); // extend samples to 31 bits
+            // interpolate (step2)
+            coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+            coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+            sampleNExt.val[0] = vshll_n_s16(sampleN.val[0], 15); // extend samples to 31 bits
+            sampleNExt.val[1] = vshll_n_s16(sampleN.val[1], 15); // extend samples to 31 bits
+            // interpolate (step3)
+            coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+            coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+            samplePExt.val[0] = vqrdmulhq_s32(samplePExt.val[0], coefsPV0); // multiply by interpolated coef
+            samplePExt.val[1] = vqrdmulhq_s32(samplePExt.val[1], coefsPV0); // multiply by interpolated coef
+            sampleNExt.val[0] = vqrdmulhq_s32(sampleNExt.val[0], coefsNV0); // multiply by interpolated coef
+            sampleNExt.val[1] = vqrdmulhq_s32(sampleNExt.val[1], coefsNV0); // multiply by interpolated coef
+            sum0 = vaddq_s32(sum0, samplePExt.val[0]);                      // accumulate result
+            sum1 = vaddq_s32(sum1, samplePExt.val[1]);                      // accumulate result
+            sum0 = vaddq_s32(sum0, sampleNExt.val[0]);                      // accumulate result
+            sum1 = vaddq_s32(sum1, sampleNExt.val[1]);                      // accumulate result
+        }
+        int32x2_t volumesV, outV;
+        volumesV = vld1_s32(mVolumeSIMD);    // load volumes
+        outV = vld1_s32(out);                // load the output
+
+        // add all 4 partial sums
+        int32x2_t sumLow0, sumHigh0, sumLow1, sumHigh1;
+        sumLow0 = vget_low_s32(sum0);
+        sumHigh0 = vget_high_s32(sum0);
+        sumLow1 = vget_low_s32(sum1);
+        sumHigh1 = vget_high_s32(sum1);
+        sumLow0 = vpadd_s32(sumLow0, sumHigh0);
+        sumLow0 = vpadd_s32(sumLow0, sumLow0);
+        sumLow1 = vpadd_s32(sumLow1, sumHigh1);
+        sumLow1 = vpadd_s32(sumLow1, sumLow1);
+
+        sumLow0 = vtrn_s32(sumLow0, sumLow1).val[0]; // interleave L,R channels
+        sumLow0 = vqrdmulh_s32(sumLow0, volumesV);   // apply volume
+        outV = vadd_s32(outV, sumLow0);              // accumulate result
+        vst1_s32(out, outV);                         // store result
     }
+#endif
 }
 
 template<int CHANNELS>
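
A side note on the alignment hints in the new code: the old assembly
annotated its coefficient loads with ":128" (128-bit aligned), and the
intrinsics version carries the same promise via the GCC/Clang builtin
__builtin_assume_aligned. A minimal sketch of the pattern (the helper
name load_coefs_aligned is illustrative, not from the patch):

    #include <arm_neon.h>
    #include <stdint.h>

    // Promise the compiler that 'p' is 16-byte aligned so it may emit
    // aligned NEON loads; behavior is undefined if p is not aligned.
    static inline int32x4_t load_coefs_aligned(const int32_t *p)
    {
        p = (const int32_t *)__builtin_assume_aligned(p, 16);
        return vld1q_s32(p);
    }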