AArch64: rewrite audioflinger's sinc resampler with NEON intrinsics.
author    Zhongwei Yao <zhongwei.yao@arm.com>
          Thu, 10 Apr 2014 16:23:42 +0000 (17:23 +0100)
committer Zheng Xu <zheng.xu@arm.com>
          Thu, 10 Apr 2014 17:28:46 +0000 (18:28 +0100)
Passes the conformance test on armv7 and aarch64; the performance test
was run on armv7. Compared with the original armv7 assembly version,
this version performs about the same. Performance data measured on a
Pandaboard running Android 4.4 (input: random wave file; unit: Mspl/s,
million samples per second; toolchain: gcc 4.8):

|                | original (assembly) | current (intrinsics) | C version |
|----------------+---------------------+----------------------+-----------|
| single channel | 6.17                | 7.14                 | 3.43      |
| double channel | 5.24                | 5.63                 | 3.50      |
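
The core per-iteration pattern is the same in the old assembly and the
new intrinsics: linearly interpolate between two adjacent coefficient
sets by the phase fraction, then multiply-accumulate the widened 16-bit
samples. Below is a minimal, self-contained sketch of that kernel for a
NEON-capable target; it is illustrative only (the helper name
dot4_interpolated and its signature are not part of this patch):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: sum += sample[i] * lerp(c0[i], c1[i], phase) over 4 lanes.
    // 'phase' is a Q31 fraction, as produced by vshl_n_s32(lerpPN, 16)
    // in the patch.
    static inline int32x4_t dot4_interpolated(int32x4_t sum,
                                              const int16_t *s,
                                              const int32_t *c0,
                                              const int32_t *c1,
                                              int32_t phase)
    {
        int32x4_t v0 = vld1q_s32(c0);                // coefs for phase index k
        int32x4_t v1 = vld1q_s32(c1);                // coefs for phase index k+1
        int32x4_t d  = vsubq_s32(v1, v0);            // interpolate (step1): delta
        d  = vqrdmulhq_lane_s32(d, vdup_n_s32(phase), 0); // (step2): delta * phase
        v0 = vaddq_s32(v0, d);                       // (step3): interpolated coefs
        int32x4_t x = vshll_n_s16(vld1_s16(s), 15);  // extend samples to 31 bits
        return vaddq_s32(sum, vqrdmulhq_s32(x, v0)); // multiply and accumulate
    }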

Change-Id: If5670218e1586e9dfd2b8d9c66a6880f3e4808ca

services/audioflinger/AudioResamplerSinc.cpp

index 207f26b..e50b192 100644
@@ -17,6 +17,7 @@
 #define LOG_TAG "AudioResamplerSinc"
 //#define LOG_NDEBUG 0
 
+#define __STDC_CONSTANT_MACROS
 #include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #define USE_INLINE_ASSEMBLY (false)
 #endif
 
-#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
-#define USE_NEON (true)
+#if defined(__aarch64__) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define USE_NEON
 #else
-#define USE_NEON (false)
+#undef USE_NEON
 #endif
 
+#define UNUSED(x) ((void)(x))
 
 namespace android {
 // ----------------------------------------------------------------------------
@@ -634,8 +637,8 @@ void AudioResamplerSinc::read(
 }
 
 template<int CHANNELS>
-void AudioResamplerSinc::filterCoefficient(
-        int32_t* out, uint32_t phase, const int16_t *samples, uint32_t vRL)
+void AudioResamplerSinc::filterCoefficient(int32_t* out, uint32_t phase,
+         const int16_t *samples, uint32_t vRL)
 {
     // NOTE: be very careful when modifying the code here. register
     // pressure is very high and a small change might cause the compiler
@@ -662,160 +665,171 @@ void AudioResamplerSinc::filterCoefficient(
 
     size_t count = offset;
 
-    if (!USE_NEON) {
-        int32_t l = 0;
-        int32_t r = 0;
-        for (size_t i=0 ; i<count ; i++) {
-            interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
-            sP -= CHANNELS;
-            interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
-            sN += CHANNELS;
-        }
-        out[0] += 2 * mulRL(1, l, vRL);
-        out[1] += 2 * mulRL(0, r, vRL);
-    } else if (CHANNELS == 1) {
+#ifndef USE_NEON
+    int32_t l = 0;
+    int32_t r = 0;
+    for (size_t i=0 ; i<count ; i++) {
+        interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
+        sP -= CHANNELS;
+        interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
+        sN += CHANNELS;
+    }
+    out[0] += 2 * mulRL(1, l, vRL);
+    out[1] += 2 * mulRL(0, r, vRL);
+#else
+    UNUSED(vRL);
+    if (CHANNELS == 1) {
         int32_t const* coefsP1 = coefsP + offset;
         int32_t const* coefsN1 = coefsN + offset;
         sP -= CHANNELS*3;
-        asm (
-            "vmov.32        d2[0], %[lerpP]          \n"    // load the positive phase
-            "vmov.32        d2[1], %[lerpN]          \n"    // load the negative phase
-            "veor           q0, q0, q0               \n"    // result, initialize to 0
-            "vshl.s32       d2, d2, #16              \n"    // convert to 32 bits
-
-            "1:                                      \n"
-            "vld1.16        { d4}, [%[sP]]           \n"    // load 4 16-bits stereo samples
-            "vld1.32        { q8}, [%[coefsP0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        { q9}, [%[coefsP1]:128]! \n"    // load 4 32-bits coefs for interpolation
-            "vld1.16        { d6}, [%[sN]]!          \n"    // load 4 16-bits stereo samples
-            "vld1.32        {q10}, [%[coefsN0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        {q11}, [%[coefsN1]:128]! \n"    // load 4 32-bits coefs for interpolation
-
-            "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
-
-            "vsub.s32        q9,  q9,  q8            \n"    // interpolate (step1) 1st set of coefs
-            "vsub.s32       q11, q11, q10            \n"    // interpolate (step1) 2nd set of coets
-            "vshll.s16      q12,  d4, #15            \n"    // extend samples to 31 bits
-
-            "vqrdmulh.s32    q9,  q9, d2[0]          \n"    // interpolate (step2) 1st set of coefs
-            "vqrdmulh.s32   q11, q11, d2[1]          \n"    // interpolate (step3) 2nd set of coefs
-            "vshll.s16      q14,  d6, #15            \n"    // extend samples to 31 bits
-
-            "vadd.s32        q8,  q8,  q9            \n"    // interpolate (step3) 1st set
-            "vadd.s32       q10, q10, q11            \n"    // interpolate (step4) 2nd set
-            "subs           %[count], %[count], #4   \n"    // update loop counter
-
-            "vqrdmulh.s32   q12, q12, q8             \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q14, q14, q10            \n"    // multiply samples by interpolated coef
-            "sub            %[sP], %[sP], #8         \n"    // move pointer to next set of samples
-
-            "vadd.s32       q0, q0, q12              \n"    // accumulate result
-            "vadd.s32       q0, q0, q14              \n"    // accumulate result
-
-            "bne            1b                       \n"    // loop
-
-            "vld1.s32       {d2}, [%[vLR]]           \n"    // load volumes
-            "vld1.s32       {d3}, %[out]             \n"    // load the output
-            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums
-            "vpadd.s32      d0, d0, d0               \n"    // together
-            "vdup.i32       d0, d0[0]                \n"    // interleave L,R channels
-            "vqrdmulh.s32   d0, d0, d2               \n"    // apply volume
-            "vadd.s32       d3, d3, d0               \n"    // accumulate result
-            "vst1.s32       {d3}, %[out]             \n"    // store result
-
-            : [out]     "=Uv" (out[0]),
-              [count]   "+r" (count),
-              [coefsP0] "+r" (coefsP),
-              [coefsP1] "+r" (coefsP1),
-              [coefsN0] "+r" (coefsN),
-              [coefsN1] "+r" (coefsN1),
-              [sP]      "+r" (sP),
-              [sN]      "+r" (sN)
-            : [lerpP]   "r" (lerpP),
-              [lerpN]   "r" (lerpN),
-              [vLR]     "r" (mVolumeSIMD)
-            : "cc", "memory",
-              "q0", "q1", "q2", "q3",
-              "q8", "q9", "q10", "q11",
-              "q12", "q14"
-        );
+
+        int32x4_t sum;
+        int32x2_t lerpPN;
+        lerpPN = vdup_n_s32(0);
+        lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0); // load the positive phase
+        lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1); // load the negative phase
+        lerpPN = vshl_n_s32(lerpPN, 16);                      // convert to 32 bits
+        sum = vdupq_n_s32(0);                                 // result, initialize to 0
+
+        int16x4_t sampleP, sampleN;
+        int32x4_t samplePExt, sampleNExt;
+        int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+        coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+        coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+        for (; count > 0; count -= 4) {
+            sampleP = vld1_s16(sP);          // load 4 16-bit samples
+            sampleN = vld1_s16(sN);          // load 4 16-bit samples
+            coefsPV0 = vld1q_s32(coefsP);    // load 4 32-bit coefs
+            coefsNV0 = vld1q_s32(coefsN);    // load 4 32-bit coefs
+            coefsPV1 = vld1q_s32(coefsP1);   // load 4 32-bit coefs for interpolation
+            coefsNV1 = vld1q_s32(coefsN1);   // load 4 32-bit coefs for interpolation
+            sP -= 4;
+            sN += 4;
+            coefsP += 4;
+            coefsN += 4;
+            coefsP1 += 4;
+            coefsN1 += 4;
+
+            sampleP = vrev64_s16(sampleP);   // reverse 2 frames of the positive side
+
+            // interpolate (step1)
+            coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+            coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+            samplePExt = vshll_n_s16(sampleP, 15);   // extend samples to 31 bits
+            // interpolate (step2)
+            coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+            coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+            sampleNExt = vshll_n_s16(sampleN, 15);   // extend samples to 31 bits
+            // interpolate (step3)
+            coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+            coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+            samplePExt = vqrdmulhq_s32(samplePExt, coefsPV0); // multiply samples by interpolated coef
+            sampleNExt = vqrdmulhq_s32(sampleNExt, coefsNV0); // multiply samples by interpolated coef
+            sum = vaddq_s32(sum, samplePExt);                 // accumulate result
+            sum = vaddq_s32(sum, sampleNExt);                 // accumulate result
+        }
+        int32x2_t volumesV, outV;
+        volumesV = vld1_s32(mVolumeSIMD);    // load volumes
+        outV = vld1_s32(out);                // load the output
+
+        // add all 4 partial sums
+        int32x2_t sumLow, sumHigh;
+        sumLow = vget_low_s32(sum);
+        sumHigh = vget_high_s32(sum);
+        sumLow = vpadd_s32(sumLow, sumHigh);
+        sumLow = vpadd_s32(sumLow, sumLow);
+
+        sumLow = vqrdmulh_s32(sumLow, volumesV); // apply volume
+        outV = vadd_s32(outV, sumLow);           // accumulate result
+        vst1_s32(out, outV);                     // store result
     } else if (CHANNELS == 2) {
         int32_t const* coefsP1 = coefsP + offset;
         int32_t const* coefsN1 = coefsN + offset;
         sP -= CHANNELS*3;
-        asm (
-            "vmov.32        d2[0], %[lerpP]          \n"    // load the positive phase
-            "vmov.32        d2[1], %[lerpN]          \n"    // load the negative phase
-            "veor           q0, q0, q0               \n"    // result, initialize to 0
-            "veor           q4, q4, q4               \n"    // result, initialize to 0
-            "vshl.s32       d2, d2, #16              \n"    // convert to 32 bits
-
-            "1:                                      \n"
-            "vld2.16        {d4,d5}, [%[sP]]         \n"    // load 4 16-bits stereo samples
-            "vld1.32        { q8}, [%[coefsP0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        { q9}, [%[coefsP1]:128]! \n"    // load 4 32-bits coefs for interpolation
-            "vld2.16        {d6,d7}, [%[sN]]!        \n"    // load 4 16-bits stereo samples
-            "vld1.32        {q10}, [%[coefsN0]:128]! \n"    // load 4 32-bits coefs
-            "vld1.32        {q11}, [%[coefsN1]:128]! \n"    // load 4 32-bits coefs for interpolation
-
-            "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
-            "vrev64.16      d5, d5                   \n"    // reverse 2 frames of the positive side
-
-            "vsub.s32        q9,  q9,  q8            \n"    // interpolate (step1) 1st set of coefs
-            "vsub.s32       q11, q11, q10            \n"    // interpolate (step1) 2nd set of coets
-            "vshll.s16      q12,  d4, #15            \n"    // extend samples to 31 bits
-            "vshll.s16      q13,  d5, #15            \n"    // extend samples to 31 bits
-
-            "vqrdmulh.s32    q9,  q9, d2[0]          \n"    // interpolate (step2) 1st set of coefs
-            "vqrdmulh.s32   q11, q11, d2[1]          \n"    // interpolate (step3) 2nd set of coefs
-            "vshll.s16      q14,  d6, #15            \n"    // extend samples to 31 bits
-            "vshll.s16      q15,  d7, #15            \n"    // extend samples to 31 bits
-
-            "vadd.s32        q8,  q8,  q9            \n"    // interpolate (step3) 1st set
-            "vadd.s32       q10, q10, q11            \n"    // interpolate (step4) 2nd set
-            "subs           %[count], %[count], #4   \n"    // update loop counter
-
-            "vqrdmulh.s32   q12, q12, q8             \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q13, q13, q8             \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q14, q14, q10            \n"    // multiply samples by interpolated coef
-            "vqrdmulh.s32   q15, q15, q10            \n"    // multiply samples by interpolated coef
-            "sub            %[sP], %[sP], #16        \n"    // move pointer to next set of samples
-
-            "vadd.s32       q0, q0, q12              \n"    // accumulate result
-            "vadd.s32       q4, q4, q13              \n"    // accumulate result
-            "vadd.s32       q0, q0, q14              \n"    // accumulate result
-            "vadd.s32       q4, q4, q15              \n"    // accumulate result
-
-            "bne            1b                       \n"    // loop
-
-            "vld1.s32       {d2}, [%[vLR]]           \n"    // load volumes
-            "vld1.s32       {d3}, %[out]             \n"    // load the output
-            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums from q0
-            "vpadd.s32      d8, d8, d9               \n"    // add all 4 partial sums from q4
-            "vpadd.s32      d0, d0, d0               \n"    // together
-            "vpadd.s32      d8, d8, d8               \n"    // together
-            "vtrn.s32       d0, d8                   \n"    // interlace L,R channels
-            "vqrdmulh.s32   d0, d0, d2               \n"    // apply volume
-            "vadd.s32       d3, d3, d0               \n"    // accumulate result
-            "vst1.s32       {d3}, %[out]             \n"    // store result
-
-            : [out]     "=Uv" (out[0]),
-              [count]   "+r" (count),
-              [coefsP0] "+r" (coefsP),
-              [coefsP1] "+r" (coefsP1),
-              [coefsN0] "+r" (coefsN),
-              [coefsN1] "+r" (coefsN1),
-              [sP]      "+r" (sP),
-              [sN]      "+r" (sN)
-            : [lerpP]   "r" (lerpP),
-              [lerpN]   "r" (lerpN),
-              [vLR]     "r" (mVolumeSIMD)
-            : "cc", "memory",
-              "q0", "q1", "q2", "q3", "q4",
-              "q8", "q9", "q10", "q11",
-              "q12", "q13", "q14", "q15"
-        );
+
+        int32x4_t sum0, sum1;
+        int32x2_t lerpPN;
+
+        lerpPN = vdup_n_s32(0);
+        lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0); // load the positive phase
+        lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1); // load the negative phase
+        lerpPN = vshl_n_s32(lerpPN, 16);                      // convert to 32 bits
+        sum0 = vdupq_n_s32(0);                                // result, initialize to 0
+        sum1 = vdupq_n_s32(0);                                // result, initialize to 0
+
+        int16x4x2_t sampleP, sampleN;
+        int32x4x2_t samplePExt, sampleNExt;
+        int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+        coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+        coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+        for (; count > 0; count -= 4) {
+            sampleP = vld2_s16(sP);          // load and deinterleave 4 16-bit stereo frames
+            sampleN = vld2_s16(sN);          // load and deinterleave 4 16-bit stereo frames
+            coefsPV0 = vld1q_s32(coefsP);    // load 4 32-bit coefs
+            coefsNV0 = vld1q_s32(coefsN);    // load 4 32-bit coefs
+            coefsPV1 = vld1q_s32(coefsP1);   // load 4 32-bit coefs for interpolation
+            coefsNV1 = vld1q_s32(coefsN1);   // load 4 32-bit coefs for interpolation
+            sP -= 8;
+            sN += 8;
+            coefsP += 4;
+            coefsN += 4;
+            coefsP1 += 4;
+            coefsN1 += 4;
+
+            sampleP.val[0] = vrev64_s16(sampleP.val[0]); // reverse 2 frames of the positive side
+            sampleP.val[1] = vrev64_s16(sampleP.val[1]); // reverse 2 frames of the positive side
+
+            // interpolate (step1)
+            coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+            coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+            samplePExt.val[0] = vshll_n_s16(sampleP.val[0], 15); // extend samples to 31 bits
+            samplePExt.val[1] = vshll_n_s16(sampleP.val[1], 15); // extend samples to 31 bits
+            // interpolate (step2)
+            coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+            coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+            sampleNExt.val[0] = vshll_n_s16(sampleN.val[0], 15); // extend samples to 31 bits
+            sampleNExt.val[1] = vshll_n_s16(sampleN.val[1], 15); // extend samples to 31 bits
+            // interpolate (step3)
+            coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+            coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+            samplePExt.val[0] = vqrdmulhq_s32(samplePExt.val[0], coefsPV0); // multiply by interpolated coef
+            samplePExt.val[1] = vqrdmulhq_s32(samplePExt.val[1], coefsPV0); // multiply by interpolated coef
+            sampleNExt.val[0] = vqrdmulhq_s32(sampleNExt.val[0], coefsNV0); // multiply by interpolated coef
+            sampleNExt.val[1] = vqrdmulhq_s32(sampleNExt.val[1], coefsNV0); // multiply by interpolated coef
+            sum0 = vaddq_s32(sum0, samplePExt.val[0]);                      // accumulate result
+            sum1 = vaddq_s32(sum1, samplePExt.val[1]);                      // accumulate result
+            sum0 = vaddq_s32(sum0, sampleNExt.val[0]);                      // accumulate result
+            sum1 = vaddq_s32(sum1, sampleNExt.val[1]);                      // accumulate result
+        }
+        int32x2_t volumesV, outV;
+        volumesV = vld1_s32(mVolumeSIMD);    // load volumes
+        outV = vld1_s32(out);                // load the output
+
+        // add all 4 partial sums
+        int32x2_t sumLow0, sumHigh0, sumLow1, sumHigh1;
+        sumLow0 = vget_low_s32(sum0);
+        sumHigh0 = vget_high_s32(sum0);
+        sumLow1 = vget_low_s32(sum1);
+        sumHigh1 = vget_high_s32(sum1);
+        sumLow0 = vpadd_s32(sumLow0, sumHigh0);
+        sumLow0 = vpadd_s32(sumLow0, sumLow0);
+        sumLow1 = vpadd_s32(sumLow1, sumHigh1);
+        sumLow1 = vpadd_s32(sumLow1, sumLow1);
+
+        sumLow0 = vtrn_s32(sumLow0, sumLow1).val[0]; // interleave L,R channels
+        sumLow0 = vqrdmulh_s32(sumLow0, volumesV);   // apply volume
+        outV = vadd_s32(outV, sumLow0);              // accumulate result
+        vst1_s32(out, outV);                         // store result
     }
+#endif
 }
 
 template<int CHANNELS>
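
A side note on the alignment hints in the new code: the old assembly
annotated its coefficient loads with ":128" (128-bit aligned), and the
intrinsics version carries the same promise via the GCC/Clang builtin
__builtin_assume_aligned. A minimal sketch of the pattern (the helper
name load_coefs_aligned is illustrative, not from the patch):

    #include <arm_neon.h>
    #include <stdint.h>

    // Promise the compiler that 'p' is 16-byte aligned so it may emit
    // aligned NEON loads; behavior is undefined if p is not aligned.
    static inline int32x4_t load_coefs_aligned(const int32_t *p)
    {
        p = (const int32_t *)__builtin_assume_aligned(p, 16);
        return vld1q_s32(p);
    }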