Allow unaligned vector pointers for power computation

author Andy Hung <hunga@google.com>

Mon, 31 Jul 2017 22:32:29 +0000 (15:32 -0700)

committer Andy Hung <hunga@google.com>

Tue, 1 Aug 2017 18:47:22 +0000 (18:47 +0000)
author Andy Hung <hunga@google.com>
Mon, 31 Jul 2017 22:32:29 +0000 (15:32 -0700)
committer Andy Hung <hunga@google.com>
Tue, 1 Aug 2017 18:47:22 +0000 (18:47 +0000)
diff --git a/audio_utils/power.cpp b/audio_utils/power.cpp

index e801da0..fea3112 100644 (file)
--- a/audio_utils/power.cpp
+++ b/audio_utils/power.cpp
@@ -18,6 +18,7 @@
  #define LOG_TAG "audio_utils_power"
  #include <log/log.h>
  
+#include <algorithm>
  #include <math.h>
  
  #include <audio_utils/power.h>
@@ -137,130 +138,101 @@ inline float energyMono(const void *amplitudes, size_t size)
  // fast float power computation for ARM processors that support NEON.
  #ifdef USE_NEON
  
-template <>
-inline float energyMono<AUDIO_FORMAT_PCM_FLOAT>(const void *amplitudes, size_t size)
-{
-    float32x4_t *famplitudes = (float32x4_t *)amplitudes;
-
-    // clear accumulator
-    float32x4_t accum = vdupq_n_f32(0);
-
-    // iterate over array getting sum of squares in 4 lanes.
-    size_t i;
-    for (i = 0; i < (size & ~3); i += 4) {
-        accum = vmlaq_f32(accum, *famplitudes, *famplitudes);
-        ++famplitudes;
-    }
-
-    // narrow 4 lanes of floats
-    float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume
-    accum2 = vpadd_f32(accum2, accum2); // combine to mono
+template <typename T>
+float32x4_t convertToFloatVectorAmplitude(T vamplitude) = delete;
  
-    // accumulate remainder
-    float value = vget_lane_f32(accum2, 0);
-    for (; i < size; ++i) {
-        const float amplitude = ((float *)amplitudes)[i];
-        value +=  amplitude * amplitude;
-    }
+template <>
+float32x4_t convertToFloatVectorAmplitude<float32x4_t>(float32x4_t vamplitude) {
+    return vamplitude;
+}
  
-    return value;
+template <>
+float32x4_t convertToFloatVectorAmplitude<int16x4_t>(int16x4_t vamplitude) {
+    const int32x4_t iamplitude = vmovl_s16(vamplitude); // expand s16 to s32 first
+    return vcvtq_f32_s32(iamplitude);
  }
  
  template <>
-inline float energyMono<AUDIO_FORMAT_PCM_16_BIT>(const void *amplitudes, size_t size)
+float32x4_t convertToFloatVectorAmplitude<int32x4_t>(int32x4_t vamplitude) {
+    return vcvtq_f32_s32(vamplitude);
+}
+
+template <typename Vector, typename Scalar>
+inline float energyMonoVector(const void *amplitudes, size_t size)
  {
-    int16x4_t *samplitudes = (int16x4_t *)amplitudes;
+    static_assert(sizeof(Vector) % sizeof(Scalar) == 0,
+             "Vector size must be a multiple of scalar size");
+    const size_t vectorLength = sizeof(Vector) / sizeof(Scalar); // typically 4 (a const)
+
+    // check pointer validity, must be aligned with scalar type.
+    const Scalar *samplitudes = reinterpret_cast<const Scalar *>(amplitudes);
+    LOG_ALWAYS_FATAL_IF((uintptr_t)samplitudes % alignof(Scalar) != 0,
+            "Non-element aligned address: %p %zu", samplitudes, alignof(Scalar));
+
+    float accumulator = 0;
+
+    // handle pointer unaligned to vector type.
+    while ((uintptr_t)samplitudes % alignof(Vector) != 0 /* compiler optimized */ && size > 0) {
+        const float amp = (float)*samplitudes++;
+        accumulator += amp * amp;
+        --size;
+    }
+
+    // samplitudes is now adjusted for proper vector alignment, cast to Vector *
+    const Vector *vamplitudes = reinterpret_cast<const Vector *>(samplitudes);
  
-    // clear accumulator
+    // clear vector accumulator
      float32x4_t accum = vdupq_n_f32(0);
  
-    // iterate over array getting sum of squares in 4 lanes.
+    // iterate over array getting sum of squares in vectorLength lanes.
      size_t i;
-    for (i = 0; i < (size & ~3); i += 4) {
-        // expand s16 to s32
-        int32x4_t amplitude = vmovl_s16(*samplitudes);
-        ++samplitudes;
-        // convert s32 to f32
-        float32x4_t famplitude = vcvtq_f32_s32(amplitude);
+    for (i = 0; i < size - size % vectorLength /* compiler optimized */; i += vectorLength) {
+        const float32x4_t famplitude = convertToFloatVectorAmplitude(*vamplitudes++);
          accum = vmlaq_f32(accum, famplitude, famplitude);
      }
  
-    // narrow 4 lanes of floats
+    // narrow vectorLength lanes of floats
      float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume
      accum2 = vpadd_f32(accum2, accum2); // combine to mono
  
-    // accumulate remainder
-    float value = vget_lane_f32(accum2, 0);
+    // accumulate vector
+    accumulator += vget_lane_f32(accum2, 0);
+
+    // accumulate any trailing elements too small for vector size
      for (; i < size; ++i) {
-        const float amplitude = (float)((int16_t *)amplitudes)[i];
-        value +=  amplitude * amplitude;
+        const float amp = (float)samplitudes[i];
+        accumulator += amp * amp;
      }
+    return accumulator;
+}
+
+template <>
+inline float energyMono<AUDIO_FORMAT_PCM_FLOAT>(const void *amplitudes, size_t size)
+{
+    return energyMonoVector<float32x4_t, float>(amplitudes, size);
+}
  
-    return value * normalizeEnergy<AUDIO_FORMAT_PCM_16_BIT>();
+template <>
+inline float energyMono<AUDIO_FORMAT_PCM_16_BIT>(const void *amplitudes, size_t size)
+{
+    return energyMonoVector<int16x4_t, int16_t>(amplitudes, size)
+            * normalizeEnergy<AUDIO_FORMAT_PCM_16_BIT>();
  }
  
  // fast int32_t power computation for PCM_32
  template <>
  inline float energyMono<AUDIO_FORMAT_PCM_32_BIT>(const void *amplitudes, size_t size)
  {
-    int32x4_t *samplitudes = (int32x4_t *)amplitudes;
-
-    // clear accumulator
-    float32x4_t accum = vdupq_n_f32(0);
-
-    // iterate over array getting sum of squares in 4 lanes.
-    size_t i;
-    for (i = 0; i < (size & ~3); i += 4) {
-        // convert s32 to f32
-        float32x4_t famplitude = vcvtq_f32_s32(*samplitudes);
-        ++samplitudes;
-        accum = vmlaq_f32(accum, famplitude, famplitude);
-    }
-
-    // narrow 4 lanes of floats
-    float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume
-    accum2 = vpadd_f32(accum2, accum2); // combine to mono
-
-    // accumulate remainder
-    float value = vget_lane_f32(accum2, 0);
-    for (; i < size; ++i) {
-        const float amplitude = (float)((int32_t *)amplitudes)[i];
-        value +=  amplitude * amplitude;
-    }
-
-    return value * normalizeEnergy<AUDIO_FORMAT_PCM_32_BIT>();
+    return energyMonoVector<int32x4_t, int32_t>(amplitudes, size)
+            * normalizeEnergy<AUDIO_FORMAT_PCM_32_BIT>();
  }
  
  // fast int32_t power computation for PCM_8_24 (essentially identical to PCM_32 above)
  template <>
  inline float energyMono<AUDIO_FORMAT_PCM_8_24_BIT>(const void *amplitudes, size_t size)
  {
-    int32x4_t *samplitudes = (int32x4_t *)amplitudes;
-
-    // clear accumulator
-    float32x4_t accum = vdupq_n_f32(0);
-
-    // iterate over array getting sum of squares in 4 lanes.
-    size_t i;
-    for (i = 0; i < (size & ~3); i += 4) {
-        // convert s32 to f32
-        float32x4_t famplitude = vcvtq_f32_s32(*samplitudes);
-        ++samplitudes;
-        accum = vmlaq_f32(accum, famplitude, famplitude);
-    }
-
-    // narrow 4 lanes of floats
-    float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume
-    accum2 = vpadd_f32(accum2, accum2); // combine to mono
-
-    // accumulate remainder
-    float value = vget_lane_f32(accum2, 0);
-    for (; i < size; ++i) {
-        const float amplitude = (float)((int32_t *)amplitudes)[i];
-        value +=  amplitude * amplitude;
-    }
-
-    return value * normalizeEnergy<AUDIO_FORMAT_PCM_8_24_BIT>();
+    return energyMonoVector<int32x4_t, int32_t>(amplitudes, size)
+            * normalizeEnergy<AUDIO_FORMAT_PCM_8_24_BIT>();
  }
  
  #endif // USE_NEON
diff --git a/audio_utils/tests/build_and_run_all_unit_tests.sh b/audio_utils/tests/build_and_run_all_unit_tests.sh

index 3656974..401847d 100755 (executable)
--- a/audio_utils/tests/build_and_run_all_unit_tests.sh
+++ b/audio_utils/tests/build_and_run_all_unit_tests.sh
@@ -20,5 +20,9 @@ adb root && adb wait-for-device remount
  echo "========================================"
  echo "testing primitives"
  adb push $OUT/system/lib/libaudioutils.so /system/lib
-adb push $OUT/data/nativetest/primitives_tests /system/bin
+adb push $OUT/data/nativetest/primitives_tests/primitives_tests /system/bin
  adb shell /system/bin/primitives_tests
+
+echo "testing power"
+adb push $OUT/data/nativetest/power_tests/power_tests /system/bin
+adb shell /system/bin/power_tests
diff --git a/audio_utils/tests/power_tests.cpp b/audio_utils/tests/power_tests.cpp

index b86dac3..5c3f8b0 100644 (file)
--- a/audio_utils/tests/power_tests.cpp
+++ b/audio_utils/tests/power_tests.cpp
@@ -66,18 +66,24 @@ void testFloatValue(float f_value, size_t length) {
          p24_ary[i] = p24_value;
      }
  
-    EXPECT_EQ(power,
-            audio_utils_compute_power_mono(f_ary, AUDIO_FORMAT_PCM_FLOAT, length));
-    EXPECT_EQ(power,
-            audio_utils_compute_power_mono(u8_ary, AUDIO_FORMAT_PCM_8_BIT, length));
-    EXPECT_EQ(power,
-            audio_utils_compute_power_mono(i16_ary, AUDIO_FORMAT_PCM_16_BIT, length));
-    EXPECT_EQ(power,
-            audio_utils_compute_power_mono(i32_ary, AUDIO_FORMAT_PCM_32_BIT, length));
-    EXPECT_EQ(power,
-            audio_utils_compute_power_mono(q8_23_ary, AUDIO_FORMAT_PCM_8_24_BIT, length));
-    EXPECT_EQ(power,
-            audio_utils_compute_power_mono(p24_ary, AUDIO_FORMAT_PCM_24_BIT_PACKED, length));
+    // check offset by 1, 2, 3 elements for unaligned NEON vector handling.
+    for (size_t i = 0; i < 3; ++i) {
+        if (i >= length) break;
+        EXPECT_EQ(power,
+                audio_utils_compute_power_mono(f_ary + i, AUDIO_FORMAT_PCM_FLOAT, length - i));
+        EXPECT_EQ(power,
+                audio_utils_compute_power_mono(u8_ary + i, AUDIO_FORMAT_PCM_8_BIT, length - i));
+        EXPECT_EQ(power,
+                audio_utils_compute_power_mono(i16_ary + i, AUDIO_FORMAT_PCM_16_BIT, length - i));
+        EXPECT_EQ(power,
+                audio_utils_compute_power_mono(i32_ary + i, AUDIO_FORMAT_PCM_32_BIT, length - i));
+        EXPECT_EQ(power,
+                audio_utils_compute_power_mono(
+                        q8_23_ary + i, AUDIO_FORMAT_PCM_8_24_BIT, length - i));
+        EXPECT_EQ(power,
+                audio_utils_compute_power_mono(
+                        p24_ary + i, AUDIO_FORMAT_PCM_24_BIT_PACKED, length - i));
+    }
  }
  
  void testFloatRamp(size_t length) {
author	Andy Hung <hunga@google.com>
	Mon, 31 Jul 2017 22:32:29 +0000 (15:32 -0700)
committer	Andy Hung <hunga@google.com>
	Tue, 1 Aug 2017 18:47:22 +0000 (18:47 +0000)
audio_utils/power.cpp		patch \| blob \| history
audio_utils/tests/build_and_run_all_unit_tests.sh		patch \| blob \| history
audio_utils/tests/power_tests.cpp		patch \| blob \| history