From: Andy Hung Date: Mon, 31 Jul 2017 22:32:29 +0000 (-0700) Subject: Allow unaligned vector pointers for power computation X-Git-Tag: android-x86-8.1-r3~5^2 X-Git-Url: http://git.osdn.net/view?p=android-x86%2Fsystem-media.git;a=commitdiff_plain;h=10b7f5a9aa37908172a4367363c40f481dbcfd7f Allow unaligned vector pointers for power computation Test: dedicated native test Bug: 64151823 Change-Id: I620ba9f673782e5769c499158c6fb866a103a9e4 --- diff --git a/audio_utils/power.cpp b/audio_utils/power.cpp index e801da05..fea3112d 100644 --- a/audio_utils/power.cpp +++ b/audio_utils/power.cpp @@ -18,6 +18,7 @@ #define LOG_TAG "audio_utils_power" #include +#include #include #include @@ -137,130 +138,101 @@ inline float energyMono(const void *amplitudes, size_t size) // fast float power computation for ARM processors that support NEON. #ifdef USE_NEON -template <> -inline float energyMono(const void *amplitudes, size_t size) -{ - float32x4_t *famplitudes = (float32x4_t *)amplitudes; - - // clear accumulator - float32x4_t accum = vdupq_n_f32(0); - - // iterate over array getting sum of squares in 4 lanes. - size_t i; - for (i = 0; i < (size & ~3); i += 4) { - accum = vmlaq_f32(accum, *famplitudes, *famplitudes); - ++famplitudes; - } - - // narrow 4 lanes of floats - float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume - accum2 = vpadd_f32(accum2, accum2); // combine to mono +template +float32x4_t convertToFloatVectorAmplitude(T vamplitude) = delete; - // accumulate remainder - float value = vget_lane_f32(accum2, 0); - for (; i < size; ++i) { - const float amplitude = ((float *)amplitudes)[i]; - value += amplitude * amplitude; - } +template <> +float32x4_t convertToFloatVectorAmplitude(float32x4_t vamplitude) { + return vamplitude; +} - return value; +template <> +float32x4_t convertToFloatVectorAmplitude(int16x4_t vamplitude) { + const int32x4_t iamplitude = vmovl_s16(vamplitude); // expand s16 to s32 first + return vcvtq_f32_s32(iamplitude); } template <> -inline float energyMono(const void *amplitudes, size_t size) +float32x4_t convertToFloatVectorAmplitude(int32x4_t vamplitude) { + return vcvtq_f32_s32(vamplitude); +} + +template +inline float energyMonoVector(const void *amplitudes, size_t size) { - int16x4_t *samplitudes = (int16x4_t *)amplitudes; + static_assert(sizeof(Vector) % sizeof(Scalar) == 0, + "Vector size must be a multiple of scalar size"); + const size_t vectorLength = sizeof(Vector) / sizeof(Scalar); // typically 4 (a const) + + // check pointer validity, must be aligned with scalar type. + const Scalar *samplitudes = reinterpret_cast(amplitudes); + LOG_ALWAYS_FATAL_IF((uintptr_t)samplitudes % alignof(Scalar) != 0, + "Non-element aligned address: %p %zu", samplitudes, alignof(Scalar)); + + float accumulator = 0; + + // handle pointer unaligned to vector type. + while ((uintptr_t)samplitudes % alignof(Vector) != 0 /* compiler optimized */ && size > 0) { + const float amp = (float)*samplitudes++; + accumulator += amp * amp; + --size; + } + + // samplitudes is now adjusted for proper vector alignment, cast to Vector * + const Vector *vamplitudes = reinterpret_cast(samplitudes); - // clear accumulator + // clear vector accumulator float32x4_t accum = vdupq_n_f32(0); - // iterate over array getting sum of squares in 4 lanes. + // iterate over array getting sum of squares in vectorLength lanes. size_t i; - for (i = 0; i < (size & ~3); i += 4) { - // expand s16 to s32 - int32x4_t amplitude = vmovl_s16(*samplitudes); - ++samplitudes; - // convert s32 to f32 - float32x4_t famplitude = vcvtq_f32_s32(amplitude); + for (i = 0; i < size - size % vectorLength /* compiler optimized */; i += vectorLength) { + const float32x4_t famplitude = convertToFloatVectorAmplitude(*vamplitudes++); accum = vmlaq_f32(accum, famplitude, famplitude); } - // narrow 4 lanes of floats + // narrow vectorLength lanes of floats float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume accum2 = vpadd_f32(accum2, accum2); // combine to mono - // accumulate remainder - float value = vget_lane_f32(accum2, 0); + // accumulate vector + accumulator += vget_lane_f32(accum2, 0); + + // accumulate any trailing elements too small for vector size for (; i < size; ++i) { - const float amplitude = (float)((int16_t *)amplitudes)[i]; - value += amplitude * amplitude; + const float amp = (float)samplitudes[i]; + accumulator += amp * amp; } + return accumulator; +} + +template <> +inline float energyMono(const void *amplitudes, size_t size) +{ + return energyMonoVector(amplitudes, size); +} - return value * normalizeEnergy(); +template <> +inline float energyMono(const void *amplitudes, size_t size) +{ + return energyMonoVector(amplitudes, size) + * normalizeEnergy(); } // fast int32_t power computation for PCM_32 template <> inline float energyMono(const void *amplitudes, size_t size) { - int32x4_t *samplitudes = (int32x4_t *)amplitudes; - - // clear accumulator - float32x4_t accum = vdupq_n_f32(0); - - // iterate over array getting sum of squares in 4 lanes. - size_t i; - for (i = 0; i < (size & ~3); i += 4) { - // convert s32 to f32 - float32x4_t famplitude = vcvtq_f32_s32(*samplitudes); - ++samplitudes; - accum = vmlaq_f32(accum, famplitude, famplitude); - } - - // narrow 4 lanes of floats - float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume - accum2 = vpadd_f32(accum2, accum2); // combine to mono - - // accumulate remainder - float value = vget_lane_f32(accum2, 0); - for (; i < size; ++i) { - const float amplitude = (float)((int32_t *)amplitudes)[i]; - value += amplitude * amplitude; - } - - return value * normalizeEnergy(); + return energyMonoVector(amplitudes, size) + * normalizeEnergy(); } // fast int32_t power computation for PCM_8_24 (essentially identical to PCM_32 above) template <> inline float energyMono(const void *amplitudes, size_t size) { - int32x4_t *samplitudes = (int32x4_t *)amplitudes; - - // clear accumulator - float32x4_t accum = vdupq_n_f32(0); - - // iterate over array getting sum of squares in 4 lanes. - size_t i; - for (i = 0; i < (size & ~3); i += 4) { - // convert s32 to f32 - float32x4_t famplitude = vcvtq_f32_s32(*samplitudes); - ++samplitudes; - accum = vmlaq_f32(accum, famplitude, famplitude); - } - - // narrow 4 lanes of floats - float32x2_t accum2 = vadd_f32(vget_low_f32(accum), vget_high_f32(accum)); // get stereo volume - accum2 = vpadd_f32(accum2, accum2); // combine to mono - - // accumulate remainder - float value = vget_lane_f32(accum2, 0); - for (; i < size; ++i) { - const float amplitude = (float)((int32_t *)amplitudes)[i]; - value += amplitude * amplitude; - } - - return value * normalizeEnergy(); + return energyMonoVector(amplitudes, size) + * normalizeEnergy(); } #endif // USE_NEON diff --git a/audio_utils/tests/build_and_run_all_unit_tests.sh b/audio_utils/tests/build_and_run_all_unit_tests.sh index 36569746..401847db 100755 --- a/audio_utils/tests/build_and_run_all_unit_tests.sh +++ b/audio_utils/tests/build_and_run_all_unit_tests.sh @@ -20,5 +20,9 @@ adb root && adb wait-for-device remount echo "========================================" echo "testing primitives" adb push $OUT/system/lib/libaudioutils.so /system/lib -adb push $OUT/data/nativetest/primitives_tests /system/bin +adb push $OUT/data/nativetest/primitives_tests/primitives_tests /system/bin adb shell /system/bin/primitives_tests + +echo "testing power" +adb push $OUT/data/nativetest/power_tests/power_tests /system/bin +adb shell /system/bin/power_tests diff --git a/audio_utils/tests/power_tests.cpp b/audio_utils/tests/power_tests.cpp index b86dac30..5c3f8b07 100644 --- a/audio_utils/tests/power_tests.cpp +++ b/audio_utils/tests/power_tests.cpp @@ -66,18 +66,24 @@ void testFloatValue(float f_value, size_t length) { p24_ary[i] = p24_value; } - EXPECT_EQ(power, - audio_utils_compute_power_mono(f_ary, AUDIO_FORMAT_PCM_FLOAT, length)); - EXPECT_EQ(power, - audio_utils_compute_power_mono(u8_ary, AUDIO_FORMAT_PCM_8_BIT, length)); - EXPECT_EQ(power, - audio_utils_compute_power_mono(i16_ary, AUDIO_FORMAT_PCM_16_BIT, length)); - EXPECT_EQ(power, - audio_utils_compute_power_mono(i32_ary, AUDIO_FORMAT_PCM_32_BIT, length)); - EXPECT_EQ(power, - audio_utils_compute_power_mono(q8_23_ary, AUDIO_FORMAT_PCM_8_24_BIT, length)); - EXPECT_EQ(power, - audio_utils_compute_power_mono(p24_ary, AUDIO_FORMAT_PCM_24_BIT_PACKED, length)); + // check offset by 1, 2, 3 elements for unaligned NEON vector handling. + for (size_t i = 0; i < 3; ++i) { + if (i >= length) break; + EXPECT_EQ(power, + audio_utils_compute_power_mono(f_ary + i, AUDIO_FORMAT_PCM_FLOAT, length - i)); + EXPECT_EQ(power, + audio_utils_compute_power_mono(u8_ary + i, AUDIO_FORMAT_PCM_8_BIT, length - i)); + EXPECT_EQ(power, + audio_utils_compute_power_mono(i16_ary + i, AUDIO_FORMAT_PCM_16_BIT, length - i)); + EXPECT_EQ(power, + audio_utils_compute_power_mono(i32_ary + i, AUDIO_FORMAT_PCM_32_BIT, length - i)); + EXPECT_EQ(power, + audio_utils_compute_power_mono( + q8_23_ary + i, AUDIO_FORMAT_PCM_8_24_BIT, length - i)); + EXPECT_EQ(power, + audio_utils_compute_power_mono( + p24_ary + i, AUDIO_FORMAT_PCM_24_BIT_PACKED, length - i)); + } } void testFloatRamp(size_t length) {