OSDN Git Service

Batch voice filters
author Starg <starg@users.osdn.me>
Sat, 20 Mar 2021 14:48:54 +0000 (23:48 +0900)
committer Starg <starg@users.osdn.me>
Sat, 20 Mar 2021 15:03:44 +0000 (00:03 +0900)
timidity/filter.c
timidity/filter.h
timidity/mix.c
timidity/mix.h
timidity/optcode.h
timidity/playmidi.c

index 6d95107..15fd0f5 100644 (file)
@@ -3921,9 +3921,293 @@ void resample_filter(int v, DATA_T *sp, int32 count)
 }
 
 
+#ifdef MIX_VOICE_BATCH
 
+#if MIX_VOICE_BATCH_SIZE != 8 // the build is rejected unless the batch size is exactly 8
+#error invalid MIX_VOICE_BATCH_SIZE
+#endif
+
+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE)
+static void sample_filter_LPF12_2_batch(int batch_size, FILTER_T **dcs, FILTER_T **dbs, DATA_T **sps, int32 *counts) // Filters sps[v] in place for each of batch_size voices, four voices per AVX vector.
+{ // dcs[v]: two coefficients (read only); dbs[v]: two state values (loaded, then written back); counts[v]: number of samples in sps[v].
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) { // one group of up to four voices per pass
+               if (i >= batch_size)
+                       break;
+
+               __m128i vcounts = _mm_set_epi32( // per-lane sample counts; lanes beyond batch_size count as 0
+                       i + 3 < batch_size ? counts[i + 3] : 0,
+                       i + 2 < batch_size ? counts[i + 2] : 0,
+                       i + 1 < batch_size ? counts[i + 1] : 0,
+                       counts[i]
+               );
+
+               __m128d vdb01_0 = _mm_loadu_pd(dbs[i]); // {db[0], db[1]} of voice i
+               __m128d vdb01_1 = i + 1 < batch_size ? _mm_loadu_pd(dbs[i + 1]) : _mm_setzero_pd(); // absent lanes start from zero state
+               __m128d vdb01_2 = i + 2 < batch_size ? _mm_loadu_pd(dbs[i + 2]) : _mm_setzero_pd();
+               __m128d vdb01_3 = i + 3 < batch_size ? _mm_loadu_pd(dbs[i + 3]) : _mm_setzero_pd();
+
+               __m256d vdb01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_0), vdb01_2, 1); // voice i in the low half, voice i+2 in the high half
+               __m256d vdb01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdb01_1), vdb01_3, 1); // voice i+1 in the low half, voice i+3 in the high half
+
+               __m256d vdb0 = _mm256_unpacklo_pd(vdb01_02, vdb01_13); // db[0] of voices i..i+3, one per lane
+               __m256d vdb1 = _mm256_unpackhi_pd(vdb01_02, vdb01_13); // db[1] of voices i..i+3, one per lane
+
+               __m128d vdc01_0 = _mm_loadu_pd(dcs[i]); // {dc[0], dc[1]} of voice i
+               __m128d vdc01_1 = i + 1 < batch_size ? _mm_loadu_pd(dcs[i + 1]) : _mm_setzero_pd();
+               __m128d vdc01_2 = i + 2 < batch_size ? _mm_loadu_pd(dcs[i + 2]) : _mm_setzero_pd();
+               __m128d vdc01_3 = i + 3 < batch_size ? _mm_loadu_pd(dcs[i + 3]) : _mm_setzero_pd();
+
+               __m256d vdc01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_0), vdc01_2, 1);
+               __m256d vdc01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(vdc01_1), vdc01_3, 1);
+
+               __m256d vdc0 = _mm256_unpacklo_pd(vdc01_02, vdc01_13); // dc[0] of voices i..i+3, one per lane
+               __m256d vdc1 = _mm256_unpackhi_pd(vdc01_02, vdc01_13); // dc[1] of voices i..i+3, one per lane
+
+               __m128i vcounts_halfmax = _mm_max_epi32(vcounts, _mm_shuffle_epi32(vcounts, (3 << 2) | 2)); // lanes 0/1 become max(c0, c2) / max(c1, c3)
+               int32 count_max = _mm_cvtsi128_si32(_mm_max_epi32(vcounts_halfmax, _mm_shuffle_epi32(vcounts_halfmax, 1))); // largest of the four counts
+
+               for (int32 j = 0; j < count_max; j += 4) { // four samples per voice per iteration, up to the longest voice in the group
+                       __m256d vsp0123_0 = j < counts[i] ? _mm256_loadu_pd(&sps[i][j]) : _mm256_setzero_pd(); // NOTE(review): a full 4-sample vector is loaded (and stored below) whenever j < count, even if fewer than 4 samples remain
+                       __m256d vsp0123_1 = i + 1 < batch_size && j < counts[i + 1] ? _mm256_loadu_pd(&sps[i + 1][j]) : _mm256_setzero_pd(); // absent or exhausted lanes are fed zeros
+                       __m256d vsp0123_2 = i + 2 < batch_size && j < counts[i + 2] ? _mm256_loadu_pd(&sps[i + 2][j]) : _mm256_setzero_pd();
+                       __m256d vsp0123_3 = i + 3 < batch_size && j < counts[i + 3] ? _mm256_loadu_pd(&sps[i + 3][j]) : _mm256_setzero_pd();
+
+                       __m256d vsp01_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (2 << 4) | 0); // 4x4 transpose, step 1: samples j, j+1 of voices i and i+2
+                       __m256d vsp01_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (2 << 4) | 0); // samples j, j+1 of voices i+1 and i+3
+                       __m256d vsp23_02 = _mm256_permute2f128_pd(vsp0123_0, vsp0123_2, (3 << 4) | 1); // samples j+2, j+3 of voices i and i+2
+                       __m256d vsp23_13 = _mm256_permute2f128_pd(vsp0123_1, vsp0123_3, (3 << 4) | 1); // samples j+2, j+3 of voices i+1 and i+3
+
+                       __m256d vsps[4]; // transpose step 2: vsps[k] holds sample j+k of voices i..i+3
+                       vsps[0] = _mm256_unpacklo_pd(vsp01_02, vsp01_13);
+                       vsps[1] = _mm256_unpackhi_pd(vsp01_02, vsp01_13);
+                       vsps[2] = _mm256_unpacklo_pd(vsp23_02, vsp23_13);
+                       vsps[3] = _mm256_unpackhi_pd(vsp23_02, vsp23_13);
+
+                       for (int k = 0; k < 4; k++) { // sequential in k: each sample uses the state left by the previous one
+                               vdb1 = MM256_FMA_PD(_mm256_sub_pd(vsps[k], vdb0), vdc1, vdb1); // presumably db1 += (x - db0) * dc1, assuming MM256_FMA_PD(a, b, c) is a * b + c — TODO confirm
+                               vdb0 = _mm256_add_pd(vdb0, vdb1); // db0 += db1
+                               vdb1 = _mm256_mul_pd(vdb1, vdc0); // db1 *= dc0
+                               vsps[k] = vdb0; // the filtered output is the updated db0
+                       }
+
+                       vsp01_02 = _mm256_unpacklo_pd(vsps[0], vsps[1]); // transpose back to one vector of four consecutive samples per voice
+                       vsp01_13 = _mm256_unpackhi_pd(vsps[0], vsps[1]);
+                       vsp23_02 = _mm256_unpacklo_pd(vsps[2], vsps[3]);
+                       vsp23_13 = _mm256_unpackhi_pd(vsps[2], vsps[3]);
+
+                       vsp0123_0 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (2 << 4) | 0);
+                       vsp0123_1 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (2 << 4) | 0);
+                       vsp0123_2 = _mm256_permute2f128_pd(vsp01_02, vsp23_02, (3 << 4) | 1);
+                       vsp0123_3 = _mm256_permute2f128_pd(vsp01_13, vsp23_13, (3 << 4) | 1);
+
+                       if (j < counts[i]) // write back only for voices that still had samples at this offset
+                               _mm256_storeu_pd(&sps[i][j], vsp0123_0);
+
+                       if (i + 1 < batch_size && j < counts[i + 1])
+                               _mm256_storeu_pd(&sps[i + 1][j], vsp0123_1);
+
+                       if (i + 2 < batch_size && j < counts[i + 2])
+                               _mm256_storeu_pd(&sps[i + 2][j], vsp0123_2);
+
+                       if (i + 3 < batch_size && j < counts[i + 3])
+                               _mm256_storeu_pd(&sps[i + 3][j], vsp0123_3);
+               }
+
+               vdb01_02 = _mm256_unpacklo_pd(vdb0, vdb1); // re-interleave to {db[0], db[1]} per voice: voices i and i+2
+               vdb01_13 = _mm256_unpackhi_pd(vdb0, vdb1); // voices i+1 and i+3
+
+               _mm_storeu_pd(dbs[i], _mm256_castpd256_pd128(vdb01_02)); // NOTE(review): a voice shorter than count_max has had its state advanced by the zero-input iterations above before it is stored here
+
+               if (i + 1 < batch_size)
+                       _mm_storeu_pd(dbs[i + 1], _mm256_castpd256_pd128(vdb01_13));
+
+               if (i + 2 < batch_size)
+                       _mm_storeu_pd(dbs[i + 2], _mm256_extractf128_pd(vdb01_02, 1));
+
+               if (i + 3 < batch_size)
+                       _mm_storeu_pd(dbs[i + 3], _mm256_extractf128_pd(vdb01_13, 1));
+       }
+}
+
+static void recalc_filter_LPF12_2_batch(int batch_size, FilterCoefficients **fcs) // For each group of four filters: if any freq/reso_dB left its stored range[], updates range[] and recomputes dc[0..1].
+{ // fcs[v] is read (range, freq, reso_dB, div_flt_rate) and written (range, dc) for v < batch_size.
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i += 4) { // one group of up to four filters per pass
+               if (i >= batch_size)
+                       break;
+
+               __m256d vfcrange0123_0 = _mm256_loadu_pd(fcs[i]->range); // range[0..3] of filter i
+               __m256d vfcrange0123_1 = i + 1 < batch_size ? _mm256_loadu_pd(fcs[i + 1]->range) : vfcrange0123_0; // absent lanes duplicate filter i, so they never trigger on their own
+               __m256d vfcrange0123_2 = i + 2 < batch_size ? _mm256_loadu_pd(fcs[i + 2]->range) : vfcrange0123_0;
+               __m256d vfcrange0123_3 = i + 3 < batch_size ? _mm256_loadu_pd(fcs[i + 3]->range) : vfcrange0123_0;
+
+               __m256d vfcrange01_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (2 << 4) | 0); // 4x4 transpose, step 1
+               __m256d vfcrange01_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (2 << 4) | 0);
+               __m256d vfcrange23_02 = _mm256_permute2f128_pd(vfcrange0123_0, vfcrange0123_2, (3 << 4) | 1);
+               __m256d vfcrange23_13 = _mm256_permute2f128_pd(vfcrange0123_1, vfcrange0123_3, (3 << 4) | 1);
+
+               __m256d vfcrange0 = _mm256_unpacklo_pd(vfcrange01_02, vfcrange01_13); // range[0] of the four filters (compared below as the lower bound for freq)
+               __m256d vfcrange1 = _mm256_unpackhi_pd(vfcrange01_02, vfcrange01_13); // range[1]: upper bound for freq
+               __m256d vfcrange2 = _mm256_unpacklo_pd(vfcrange23_02, vfcrange23_13); // range[2]: lower bound for reso_dB
+               __m256d vfcrange3 = _mm256_unpackhi_pd(vfcrange23_02, vfcrange23_13); // range[3]: upper bound for reso_dB
+
+               __m256d vfcfreq = _mm256_set_pd( // freq of the four filters; absent lanes reuse filter i
+                       i + 3 < batch_size ? fcs[i + 3]->freq : fcs[i]->freq,
+                       i + 2 < batch_size ? fcs[i + 2]->freq : fcs[i]->freq,
+                       i + 1 < batch_size ? fcs[i + 1]->freq : fcs[i]->freq,
+                       fcs[i]->freq
+               );
+
+               __m256d vfcreso_DB = _mm256_set_pd( // reso_dB of the four filters; absent lanes reuse filter i
+                       i + 3 < batch_size ? fcs[i + 3]->reso_dB : fcs[i]->reso_dB,
+                       i + 2 < batch_size ? fcs[i + 2]->reso_dB : fcs[i]->reso_dB,
+                       i + 1 < batch_size ? fcs[i + 1]->reso_dB : fcs[i]->reso_dB,
+                       fcs[i]->reso_dB
+               );
+
+               __m256d vmask = _mm256_or_pd( // lane is set when freq is outside [range0, range1] or reso_dB is outside [range2, range3]
+                       _mm256_or_pd(_mm256_cmp_pd(vfcfreq, vfcrange0, _CMP_LT_OS), _mm256_cmp_pd(vfcfreq, vfcrange1, _CMP_GT_OS)),
+                       _mm256_or_pd(_mm256_cmp_pd(vfcreso_DB, vfcrange2, _CMP_LT_OS), _mm256_cmp_pd(vfcreso_DB, vfcrange3, _CMP_GT_OS))
+               );
+
+               if (!_mm256_testz_pd(vmask, vmask)) { // taken when at least one lane of the mask is set
+                       __m256d v1mmargin = _mm256_set1_pd(1.0 - ext_filter_margin);
+                       __m256d v1pmargin = _mm256_set1_pd(1.0 + ext_filter_margin);
+
+                       vfcrange0 = _mm256_blendv_pd(vfcrange0, _mm256_mul_pd(vfcfreq, v1mmargin), vmask); // masked lanes get a new window around the current value; other lanes keep theirs
+                       vfcrange1 = _mm256_blendv_pd(vfcrange1, _mm256_mul_pd(vfcfreq, v1pmargin), vmask);
+                       vfcrange2 = _mm256_blendv_pd(vfcrange2, _mm256_mul_pd(vfcreso_DB, v1mmargin), vmask);
+                       vfcrange3 = _mm256_blendv_pd(vfcrange3, _mm256_mul_pd(vfcreso_DB, v1pmargin), vmask);
+
+                       vfcrange01_02 = _mm256_unpacklo_pd(vfcrange0, vfcrange1); // transpose back to one range[0..3] vector per filter
+                       vfcrange01_13 = _mm256_unpackhi_pd(vfcrange0, vfcrange1);
+                       vfcrange23_02 = _mm256_unpacklo_pd(vfcrange2, vfcrange3);
+                       vfcrange23_13 = _mm256_unpackhi_pd(vfcrange2, vfcrange3);
+
+                       vfcrange0123_0 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (2 << 4) | 0);
+                       vfcrange0123_1 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (2 << 4) | 0);
+                       vfcrange0123_2 = _mm256_permute2f128_pd(vfcrange01_02, vfcrange23_02, (3 << 4) | 1);
+                       vfcrange0123_3 = _mm256_permute2f128_pd(vfcrange01_13, vfcrange23_13, (3 << 4) | 1);
+
+                       _mm256_storeu_pd(fcs[i]->range, vfcrange0123_0); // only filters that really exist are written
+
+                       if (i + 1 < batch_size)
+                               _mm256_storeu_pd(fcs[i + 1]->range, vfcrange0123_1);
+
+                       if (i + 2 < batch_size)
+                               _mm256_storeu_pd(fcs[i + 2]->range, vfcrange0123_2);
+
+                       if (i + 3 < batch_size)
+                               _mm256_storeu_pd(fcs[i + 3]->range, vfcrange0123_3);
+
+                       __m256d vfcdiv_flt_rate = _mm256_set_pd( // div_flt_rate of the four filters; absent lanes reuse filter i
+                               i + 3 < batch_size ? fcs[i + 3]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 2 < batch_size ? fcs[i + 2]->div_flt_rate : fcs[i]->div_flt_rate,
+                               i + 1 < batch_size ? fcs[i + 1]->div_flt_rate : fcs[i]->div_flt_rate,
+                               fcs[i]->div_flt_rate
+                       );
+
+                       __m256d vf = _mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(M_PI2), vfcfreq), vfcdiv_flt_rate); // f = M_PI2 * freq * div_flt_rate
+
+                       __m256d vreso_db_cf_p = _mm256_set_pd( // p = RESO_DB_CF_P(reso_dB), evaluated per lane in scalar code
+                               i + 3 < batch_size ? RESO_DB_CF_P(fcs[i + 3]->reso_dB) : RESO_DB_CF_P(fcs[i]->reso_dB),
+                               i + 2 < batch_size ? RESO_DB_CF_P(fcs[i + 2]->reso_dB) : RESO_DB_CF_P(fcs[i]->reso_dB),
+                               i + 1 < batch_size ? RESO_DB_CF_P(fcs[i + 1]->reso_dB) : RESO_DB_CF_P(fcs[i]->reso_dB),
+                               RESO_DB_CF_P(fcs[i]->reso_dB)
+                       );
+
+                       __m256d v1 = _mm256_set1_pd(1.0);
+                       __m256d v2 = _mm256_set1_pd(2.0);
+                       __m256d v0_5 = _mm256_set1_pd(0.5);
+
+                       __m256d vq = _mm256_sub_pd(v1, _mm256_div_pd(vf, MM256_FMA_PD(v2, _mm256_add_pd(vreso_db_cf_p, _mm256_div_pd(v0_5, _mm256_add_pd(v1, vf))), _mm256_sub_pd(vf, v2)))); // presumably q = 1 - f / (2 * (p + 0.5 / (1 + f)) + (f - 2)), assuming MM256_FMA_PD(a, b, c) is a * b + c — TODO confirm
+                       __m256d vc0 = _mm256_mul_pd(vq, vq); // c0 = q * q
+#ifdef USE_SVML
+                       __m256d vcosf = _mm256_cos_pd(vf); // vector cosine when USE_SVML is defined
+#else
+                       ALIGN FLOAT_T af[4]; // otherwise spill f to memory and call cos() once per lane
+                       _mm256_storeu_pd(af, vf);
+                       __m256d vcosf = _mm256_set_pd(cos(af[3]), cos(af[2]), cos(af[1]), cos(af[0]));
+#endif
+                       __m256d vc1 = _mm256_sub_pd(_mm256_add_pd(vc0, v1), _mm256_mul_pd(_mm256_mul_pd(v2, vcosf), vq)); // c1 = c0 + 1 - 2 * cos(f) * q
+
+                       __m256d vdc02 = _mm256_unpacklo_pd(vc0, vc1); // {c0, c1} of filters i and i+2
+                       __m256d vdc13 = _mm256_unpackhi_pd(vc0, vc1); // {c0, c1} of filters i+1 and i+3
+                       _mm_storeu_pd(fcs[i]->dc, _mm256_castpd256_pd128(vdc02)); // NOTE(review): dc is rewritten for every present filter in the group, including lanes whose mask was clear
+
+                       if (i + 1 < batch_size)
+                               _mm_storeu_pd(fcs[i + 1]->dc, _mm256_castpd256_pd128(vdc13));
+                       if (i + 2 < batch_size)
+                               _mm_storeu_pd(fcs[i + 2]->dc, _mm256_extractf128_pd(vdc02, 1));
+                       if (i + 3 < batch_size)
+                               _mm_storeu_pd(fcs[i + 3]->dc, _mm256_extractf128_pd(vdc13, 1));
+               }
+       }
+}
+#endif
+
+void buffer_filter_batch(int batch_size, FilterCoefficients **fcs, DATA_T **sps, int32 *counts) // Applies one filter stage to a batch: fcs[v] filters sps[v][0..counts[v]) in place for v < batch_size.
+{ // NOTE(review): batch_size is not range-checked here; callers must pass 1..MIX_VOICE_BATCH_SIZE, since fcs[0] is dereferenced and dcs/dbs hold MIX_VOICE_BATCH_SIZE entries.
+#ifdef _DEBUG
+       for (int i = 0; i < batch_size; i++) { // debug builds only: every slot must carry a filter
+               if (!fcs[i]) {
+                       ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: filter not initialized");
+                       return;
+               }
+       }
+
+       for (int i = 1; i < batch_size; i++) { // debug builds only: the whole batch must share one filter type
+               if (fcs[0]->type != fcs[i]->type) {
+                       ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: filter type mismatch");
+                       return;
+               }
+       }
+#endif
+       if (fcs[0]->type == FILTER_NONE) // the first filter's type stands for the whole batch
+               return;
+
+       FILTER_T *dcs[MIX_VOICE_BATCH_SIZE]; // per-voice pointers to the coefficient pair
+       FILTER_T *dbs[MIX_VOICE_BATCH_SIZE]; // per-voice pointers to the state pair
+
+       for (int i = 0; i < batch_size; i++) {
+               dcs[i] = fcs[i]->dc; // dc is an array member (it is handed straight to _mm_storeu_pd in the recalc kernel), so it decays to FILTER_T * without '&'
+               dbs[i] = &fcs[i]->db[FILTER_FB_L];
+       }
+
+       switch (fcs[0]->type) {
+       case FILTER_LPF12_2:
+               recalc_filter_LPF12_2_batch(batch_size, fcs); // refresh coefficients first, then run the samples through
+               sample_filter_LPF12_2_batch(batch_size, dcs, dbs, sps, counts);
+               break;
+
+       //case FILTER_HPF12_2:
+       //      break;
+
+       default: // any other type is reported and the samples are left unfiltered
+               ctl->cmsg(CMSG_ERROR, VERB_NORMAL, "buffer_filter_batch(): error: unsupported filter type");
+               break;
+       }
+}
+
+void voice_filter_batch(int batch_size, int *vs, DATA_T **sps, int32 *counts) // Runs both filter stages (fc, then fc2) of the voices vs[0..batch_size) over their sample buffers.
+{ // vs[v]: index into voice[]; sps[v] / counts[v]: that voice's sample buffer and length, passed through to buffer_filter_batch.
+       if (batch_size <= 0) // nothing queued
+               return;
+
+       FilterCoefficients *fcs[MIX_VOICE_BATCH_SIZE];
+
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++) // first stage: each voice's fc; unused slots are NULL
+               fcs[i] = (i < batch_size ? &voice[vs[i]].fc : NULL);
+
+       buffer_filter_batch(batch_size, fcs, sps, counts);
+
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++) // second stage: each voice's fc2, applied to the output of the first
+               fcs[i] = (i < batch_size ? &voice[vs[i]].fc2 : NULL);
+
+       buffer_filter_batch(batch_size, fcs, sps, counts);
+}
 
 
+#endif // MIX_VOICE_BATCH
 
 
 /*************** antialiasing ********************/
index cab46d1..0cc17ee 100644 (file)
@@ -32,7 +32,7 @@
 
 #include "sysdep.h"
 
-
+#include "mix.h"
 
 enum{
        FILTER_NONE =0,
@@ -174,7 +174,9 @@ extern void set_resample_filter_ext_rate(FilterCoefficients *fc, FLOAT_T freq);
 extern void set_resample_filter_freq(FilterCoefficients *fc, FLOAT_T freq);
 extern void resample_filter(int v, DATA_T *sp, int32 count);
 
-
+#ifdef MIX_VOICE_BATCH
+extern void voice_filter_batch(int batch_size, int *vs, DATA_T **sps, int32 *counts); // batched counterpart for voice filtering: vs[] are voice ids, sps[]/counts[] their buffers and lengths
+#endif // MIX_VOICE_BATCH
 
 
 /*************** antialiasing ********************/
index c245f05..1290284 100644 (file)
@@ -86,15 +86,23 @@ const int32 max_amp_value = MAX_AMP_VALUE;
 #endif // DATA_T_INT32
 
 
+void mix_mystery_signal(DATA_T *sp, DATA_T *lp, int v, int count); // forward declaration so the batch flush code in this file can call it
 void mix_voice(DATA_T *, int, int32);
 
-
 #if 0 // dim voice buffer
 static ALIGN DATA_T voice_buffer[AUDIO_BUFFER_SIZE * RESAMPLE_OVER_SAMPLING_MAX];
 #else // malloc voice buffer
 static DATA_T *voice_buffer = NULL;
 #endif // malloc voice buffer
 
+#ifdef MIX_VOICE_BATCH
+static int voice_buffer_batch_current_count; // number of voices currently queued in the batch
+static int voice_buffer_batch_v[MIX_VOICE_BATCH_SIZE]; // voice id
+static int32 voice_buffer_batch_count[MIX_VOICE_BATCH_SIZE]; // sample count of each queued voice
+static DATA_T *voice_buffer_batch_data[MIX_VOICE_BATCH_SIZE]; // per-slot buffer holding a copy of the queued voice's samples
+static DATA_T *voice_buffer_batch_dest[MIX_VOICE_BATCH_SIZE]; // buffer each queued voice is mixed into when the batch is flushed
+#endif
+
 
 /*************** mix.c initialize uninitialize *****************/
 
@@ -124,12 +132,42 @@ void init_mix_c(void)
                voice_buffer = NULL;
        }
        voice_buffer = (DATA_T *)aligned_malloc(byte, ALIGN_SIZE);
+
+#ifdef MIX_VOICE_BATCH
+       voice_buffer_batch_current_count = 0;
+       memset(voice_buffer_batch_v, 0, sizeof(voice_buffer_batch_v));
+       memset(voice_buffer_batch_count, 0, sizeof(voice_buffer_batch_count));
+       memset(voice_buffer_batch_dest, 0, sizeof(voice_buffer_batch_dest));
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++) {
+               if (voice_buffer_batch_data[i]) {
+                       aligned_free(voice_buffer_batch_data[i]);
+                       voice_buffer_batch_data[i] = NULL;
+               }
+               voice_buffer_batch_data[i] = (DATA_T *)aligned_malloc(byte, ALIGN_SIZE);
+               memset(voice_buffer_batch_data[i], 0, byte);
+       }
+#endif
 #else
        if(voice_buffer){
                safe_free(voice_buffer);
                voice_buffer = NULL;
        }
        voice_buffer = (DATA_T *)safe_malloc(byte);
+
+#ifdef MIX_VOICE_BATCH
+       voice_buffer_batch_current_count = 0;
+       memset(voice_buffer_batch_v, 0, sizeof(voice_buffer_batch_v));
+       memset(voice_buffer_batch_count, 0, sizeof(voice_buffer_batch_count));
+       memset(voice_buffer_batch_dest, 0, sizeof(voice_buffer_batch_dest));
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++) {
+               if (voice_buffer_batch_data[i]) {
+                       safe_free(voice_buffer_batch_data[i]);
+                       voice_buffer_batch_data[i] = NULL;
+               }
+               voice_buffer_batch_data[i] = (DATA_T *)safe_malloc(byte);
+               memset(voice_buffer_batch_data[i], 0, byte);
+       }
+#endif
 #endif
 #endif // malloc voice_buffer
        memset(voice_buffer, 0, byte);
@@ -149,11 +187,35 @@ void free_mix_c(void)
                aligned_free(voice_buffer);
                voice_buffer = NULL;
        }
+#ifdef MIX_VOICE_BATCH
+       voice_buffer_batch_current_count = 0;
+       memset(voice_buffer_batch_v, 0, sizeof(voice_buffer_batch_v));
+       memset(voice_buffer_batch_count, 0, sizeof(voice_buffer_batch_count));
+       memset(voice_buffer_batch_dest, 0, sizeof(voice_buffer_batch_dest));
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++) {
+               if (voice_buffer_batch_data[i]) {
+                       aligned_free(voice_buffer_batch_data[i]);
+                       voice_buffer_batch_data[i] = NULL;
+               }
+       }
+#endif
 #else
        if(voice_buffer){
                safe_free(voice_buffer);
                voice_buffer = NULL;
        }
+#ifdef MIX_VOICE_BATCH
+       voice_buffer_batch_current_count = 0;
+       memset(voice_buffer_batch_v, 0, sizeof(voice_buffer_batch_v));
+       memset(voice_buffer_batch_count, 0, sizeof(voice_buffer_batch_count));
+       memset(voice_buffer_batch_dest, 0, sizeof(voice_buffer_batch_dest));
+       for (int i = 0; i < MIX_VOICE_BATCH_SIZE; i++) {
+               if (voice_buffer_batch_data[i]) {
+                       safe_free(voice_buffer_batch_data[i]);
+                       voice_buffer_batch_data[i] = NULL;
+               }
+       }
+#endif
 #endif
 #endif
 #ifdef MULTI_THREAD_COMPUTE
@@ -165,6 +227,59 @@ void free_mix_c(void)
 
 /****************  ****************/
 
+#ifdef MIX_VOICE_BATCH
+
+void mix_voice_flush_batch(void) // Filters every queued voice as one batch, mixes each into its destination buffer, then empties the queue.
+{
+       voice_filter_batch(voice_buffer_batch_current_count, voice_buffer_batch_v, voice_buffer_batch_data, voice_buffer_batch_count); // returns immediately when nothing is queued
+
+       for (int i= 0; i < voice_buffer_batch_current_count; i++) {
+               Voice *vp = voice + voice_buffer_batch_v[i];
+               mix_mystery_signal(voice_buffer_batch_data[i], voice_buffer_batch_dest[i], voice_buffer_batch_v[i], voice_buffer_batch_count[i]); // mix the filtered copy into the destination recorded at enqueue time
+               if (vp->finish_voice >= 2 || vp->finish_voice && !vp->sample->keep_voice) // same end-of-voice test that mix_voice applies on its non-batched path
+                       free_voice(voice_buffer_batch_v[i]);
+       }
+
+       voice_buffer_batch_current_count = 0; // queue is empty again
+}
+
+static int mix_voice_batch_is_filter_type_supported(int8 type) // Returns 1 for FILTER_NONE and FILTER_LPF12_2, 0 for every other filter type.
+{
+       switch (type) {
+       case FILTER_NONE:
+       case FILTER_LPF12_2:
+               return 1;
+
+       default:
+               return 0;
+       }
+}
+
+static int mix_voice_try_enqueue_batch(DATA_T *sp, DATA_T *lp, int v, int32 c) // Queues voice v (c samples at sp, to be mixed into lp) for batched filtering; returns 1 if queued, 0 if its filter types are unsupported.
+{
+       Voice *vp = voice + v;
+
+       if (!(mix_voice_batch_is_filter_type_supported(vp->fc.type) && mix_voice_batch_is_filter_type_supported(vp->fc2.type))) // both stages must be batchable, otherwise the caller filters the voice itself
+               return 0;
+
+       Voice *vp0 = voice + voice_buffer_batch_v[0]; // first queued voice; only dereferenced below when the queue is non-empty
+
+       if (voice_buffer_batch_current_count > 0 && !(vp->fc.type == vp0->fc.type && vp->fc2.type == vp0->fc2.type)) // a batch only holds voices with the same fc/fc2 type pair
+               mix_voice_flush_batch();
+
+       voice_buffer_batch_v[voice_buffer_batch_current_count] = v;
+       voice_buffer_batch_count[voice_buffer_batch_current_count] = c;
+       memcpy(voice_buffer_batch_data[voice_buffer_batch_current_count], sp, c * sizeof(DATA_T)); // the flush works on this copy, not on sp
+       voice_buffer_batch_dest[voice_buffer_batch_current_count] = lp;
+       voice_buffer_batch_current_count++;
+
+       if (voice_buffer_batch_current_count >= MIX_VOICE_BATCH_SIZE) // queue is full: process it now
+               mix_voice_flush_batch();
+
+       return 1;
+}
+
+#endif // MIX_VOICE_BATCH
 
 static void compute_portament(int v, int32 c)
 {
@@ -574,6 +689,10 @@ void mix_voice(DATA_T *buf, int v, int32 c)
 #ifdef VOICE_EFFECT
        voice_effect(v, sp, c);
 #endif
+#ifdef MIX_VOICE_BATCH
+       if(mix_voice_try_enqueue_batch(sp, buf, v, c))
+               return;
+#endif
        voice_filter(v, sp, c);
        mix_mystery_signal(sp, buf, v, c);      
        if(vp->finish_voice >= 2 || vp->finish_voice && !vp->sample->keep_voice)
index 74434ad..fb4e92e 100644 (file)
 #ifndef ___MIX_H_
 #define ___MIX_H_
 
+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE) && defined(FLOAT_T_DOUBLE) // batching is enabled only for this intrinsic level with double DATA_T and FLOAT_T
+#define MIX_VOICE_BATCH
+#define MIX_VOICE_BATCH_SIZE   8
+#endif
+
 ///r
 extern void mix_voice(DATA_T *, int, int32);
 
+#ifdef MIX_VOICE_BATCH
+extern void mix_voice_flush_batch(void); // filters and mixes all voices still queued in the batch, then empties it
+#endif // MIX_VOICE_BATCH
+
+
 extern int recompute_envelope(int);
 extern int apply_envelope_to_amp(int);
 extern int recompute_modulation_envelope(int);
index 4bce36b..5557c20 100644 (file)
@@ -289,6 +289,10 @@ enum{
 #define USE_X86_AMD_EXT_INTRIN  0
 #endif
 
+#if (defined(_MSC_VER) && _MSC_VER >= 1920) || defined(__INTEL_COMPILER) // _MSC_VER >= 1920 (Visual Studio 2019 or later) or the Intel compiler
+#define USE_SVML
+#endif
+
 #if defined(USE_AVX512)
 #define USE_X86_EXT_ASM     10  // F, CD, VL, DQ, BW
 #elif defined(USE_AVX2)
index bc70173..5f07db5 100644 (file)
@@ -11825,6 +11825,10 @@ static void do_compute_data_midi(int32 count)
                }
        }
 
+#ifdef MIX_VOICE_BATCH
+       mix_voice_flush_batch();
+#endif
+
        while(uv > 0 && voice[uv - 1].status == VOICE_FREE)     {uv--;}
        upper_voices = uv;