OSDN Git Service

[filter] Optimize LPF12_2 for AVX
authorStarg <starg@users.osdn.me>
Sun, 7 Mar 2021 06:41:45 +0000 (15:41 +0900)
committerStarg <starg@users.osdn.me>
Sun, 7 Mar 2021 06:41:45 +0000 (15:41 +0900)
timidity/filter.c
timidity/mix.c
timidity/thread_mix.c

index 4453832..6d95107 100644 (file)
@@ -1918,7 +1918,42 @@ static inline void recalc_filter_LPF12_2(FilterCoefficients *fc)
        }
 }
 
-#if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+// SIMD optimization (AVX): filters sp[] in place, loading 4 doubles per iteration and producing them as two 2-double halves; coefficients are read from dc[2..9] and the 2-double output history lives in db[2..3].
+static inline void buffer_filter_LPF12_2(FILTER_T* dc, FILTER_T* db, DATA_T* sp, int32 count)
+{
+       int32 i;
+       __m256d vcx0 = _mm256_broadcast_pd((__m128d *)(dc + 2)); // dc[2..3] duplicated into both 128-bit lanes
+       __m256d vcx1 = _mm256_broadcast_pd((__m128d *)(dc + 4)); // dc[4..5] duplicated into both 128-bit lanes
+       __m128d vcym2 = _mm_loadu_pd(dc + 6); // dc[6..7]: coefficient pair applied to vym2
+       __m128d vcym1 = _mm_loadu_pd(dc + 8); // dc[8..9]: coefficient pair applied to vym1
+       __m128d vy = _mm_loadu_pd(db + 2); // previous output pair saved by the last call (db[2..3])
+       __m128d vym2 = _mm_unpacklo_pd(vy, vy); // low element of vy in both slots
+       __m128d vym1 = _mm_unpackhi_pd(vy, vy); // high element of vy in both slots
+
+       for (i = 0; i < count; i += 4) // NOTE(review): no tail handling; touches sp[i..i+3], so count is presumably expected to be a multiple of 4 - confirm with callers
+       {
+               __m256d vin = _mm256_loadu_pd(sp + i); // x0 x1 x2 x3 (unaligned load)
+               __m256d vx0 = _mm256_unpacklo_pd(vin, vin); // x0 x0 | x2 x2
+               __m256d vx1 = _mm256_unpackhi_pd(vin, vin); // x1 x1 | x3 x3
+               __m256d vfma2x = MM256_FMA2_PD(vcx0, vx0, vcx1, vx1); // input-side terms for both halves; macro defined elsewhere, presumably a*b + c*d
+
+               __m128d vy0 = _mm_add_pd(_mm256_castpd256_pd128(vfma2x), MM_FMA2_PD(vcym2, vym2, vcym1, vym1)); // first half: low lane of input terms plus feedback from the previous output pair
+               _mm_storeu_pd(sp + i, vy0); // overwrite sp[i..i+1] with the result
+               vym2 = _mm_unpacklo_pd(vy0, vy0); // history must be refreshed before the second half, which depends on vy0
+               vym1 = _mm_unpackhi_pd(vy0, vy0);
+
+               __m128d vy1 = _mm_add_pd(_mm256_extractf128_pd(vfma2x, 1), MM_FMA2_PD(vcym2, vym2, vcym1, vym1)); // second half: high lane of input terms plus feedback from vy0
+               _mm_storeu_pd(sp + i + 2, vy1); // overwrite sp[i+2..i+3] with the result
+               vym2 = _mm_unpacklo_pd(vy1, vy1); // history carried into the next iteration
+               vym1 = _mm_unpackhi_pd(vy1, vy1);
+               vy = vy1; // remember the most recent output pair for the state write-back
+       }
+
+       _mm_storeu_pd(db + 2, vy); // persist the last output pair to db[2..3]; unchanged if the loop never ran
+}
+
+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
 // SIMD optimization (double * 2)
 static inline void buffer_filter_LPF12_2(FILTER_T *dc, FILTER_T *db, DATA_T *sp, int32 count)
 {
index 04e2598..e072ca0 100644 (file)
@@ -508,7 +508,9 @@ void mix_voice(DATA_T *buf, int v, int32 c)
        if (delay_cnt) {
                if(delay_cnt == c)
                        return;
-#if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+               delay_cnt &= ~(0x3); // clear the low 2 bits (multiple of 4) for the filter SIMD optimization (filter.c buffer_filter())
+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
                delay_cnt &= ~(0x1); // clear the low bit (multiple of 2) for the filter SIMD optimization (filter.c buffer_filter())
 #endif
                if (play_mode->encoding & PE_MONO)
index 87f68b7..f8d72a0 100644 (file)
@@ -406,7 +406,9 @@ void mix_voice_thread(DATA_T *buf, int v, int32 c, int thread)
        if (delay_cnt) {
                if(delay_cnt == c)
                        return;
-#if (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+               delay_cnt &= ~(0x3); // clear the low 2 bits (multiple of 4) for the filter SIMD optimization (filter.c buffer_filter())
+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
                delay_cnt &= ~(0x1); // clear the low bit (multiple of 2) for the filter SIMD optimization (filter.c buffer_filter())
 #endif
                if (play_mode->encoding & PE_MONO)