From 7f8dcb1a8da86f96725e8c2e1c78f89d8023dde8 Mon Sep 17 00:00:00 2001 From: Starg Date: Thu, 1 Apr 2021 22:28:25 +0900 Subject: [PATCH] [filter] Avoid storel and storeh --- timidity/filter.c | 56 +++++++++++++++++++++++++++--------------------------- timidity/optcode.h | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/timidity/filter.c b/timidity/filter.c index b8aa904e..a695da6c 100644 --- a/timidity/filter.c +++ b/timidity/filter.c @@ -4115,41 +4115,41 @@ static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T __m512d vdb0123_37 = _mm512_permutex2var_pd(vdb01_1357, _mm512_set_epi64(15, 14, 7, 6, 11, 10, 3, 2), vdb23_1357); _mm256_storeu_pd(&dbs[i][0], _mm512_castpd512_pd256(vdb0123_04)); - _mm_storel_pd(&dbs[i][4], _mm512_castpd512_pd128(vdb4)); + dbs[i][4] = MM512_EXTRACT_F64(vdb4, 0); if (i + 1 < batch_size) { _mm256_storeu_pd(&dbs[i + 1][0], _mm512_castpd512_pd256(vdb0123_15)); - _mm_storeh_pd(&dbs[i + 1][4], _mm512_castpd512_pd128(vdb4)); + dbs[i + 1][4] = MM512_EXTRACT_F64(vdb4, 1); } if (i + 2 < batch_size) { _mm256_storeu_pd(&dbs[i + 2][0], _mm512_castpd512_pd256(vdb0123_26)); - _mm_storel_pd(&dbs[i + 2][4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb4), 1)); + dbs[i + 2][4] = MM512_EXTRACT_F64(vdb4, 2); } if (i + 3 < batch_size) { _mm256_storeu_pd(&dbs[i + 3][0], _mm512_castpd512_pd256(vdb0123_37)); - _mm_storeh_pd(&dbs[i + 3][4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdb4), 1)); + dbs[i + 3][4] = MM512_EXTRACT_F64(vdb4, 3); } if (i + 4 < batch_size) { _mm256_storeu_pd(&dbs[i + 4][0], _mm512_extractf64x4_pd(vdb0123_04, 1)); - _mm_storel_pd(&dbs[i + 4][4], _mm512_extractf64x2_pd(vdb4, 2)); + dbs[i + 4][4] = MM512_EXTRACT_F64(vdb4, 4); } if (i + 5 < batch_size) { _mm256_storeu_pd(&dbs[i + 5][0], _mm512_extractf64x4_pd(vdb0123_15, 1)); - _mm_storeh_pd(&dbs[i + 5][4], _mm512_extractf64x2_pd(vdb4, 2)); + dbs[i + 5][4] = MM512_EXTRACT_F64(vdb4, 5); } if (i + 6 < batch_size) { _mm256_storeu_pd(&dbs[i + 6][0], _mm512_extractf64x4_pd(vdb0123_26, 1)); - _mm_storel_pd(&dbs[i + 6][4], _mm512_extractf64x2_pd(vdb4, 3)); + dbs[i + 6][4] = MM512_EXTRACT_F64(vdb4, 6); } if (i + 7 < batch_size) { _mm256_storeu_pd(&dbs[i + 7][0], _mm512_extractf64x4_pd(vdb0123_37, 1)); - _mm_storeh_pd(&dbs[i + 7][4], _mm512_extractf64x2_pd(vdb4, 3)); + dbs[i + 7][4] = MM512_EXTRACT_F64(vdb4, 7); } } } @@ -4290,21 +4290,21 @@ static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T vdb0123_3 = _mm256_permute2f128_pd(vdb01_13, vdb23_13, (3 << 4) | 1); _mm256_storeu_pd(&dbs[i][0], vdb0123_0); - _mm_storel_pd(&dbs[i][4], _mm256_castpd256_pd128(vdb4)); + dbs[i][4] = MM256_EXTRACT_F64(vdb4, 0); if (i + 1 < batch_size) { _mm256_storeu_pd(&dbs[i + 1][0], vdb0123_1); - _mm_storeh_pd(&dbs[i + 1][4], _mm256_castpd256_pd128(vdb4)); + dbs[i + 1][4] = MM256_EXTRACT_F64(vdb4, 1); } if (i + 2 < batch_size) { _mm256_storeu_pd(&dbs[i + 2][0], vdb0123_2); - _mm_storel_pd(&dbs[i + 2][4], _mm256_extractf128_pd(vdb4, 1)); + dbs[i + 2][4] = MM256_EXTRACT_F64(vdb4, 2); } if (i + 3 < batch_size) { _mm256_storeu_pd(&dbs[i + 3][0], vdb0123_3); - _mm_storeh_pd(&dbs[i + 3][4], _mm256_extractf128_pd(vdb4, 1)); + dbs[i + 3][4] = MM256_EXTRACT_F64(vdb4, 3); } } } @@ -4399,12 +4399,12 @@ static void sample_filter_LPF_BW_batch(int batch_size, FILTER_T **dcs, FILTER_T _mm_storeu_pd(&dbs[i][0], vdb01_0); _mm_storeu_pd(&dbs[i][2], vdb23_0); - _mm_storel_pd(&dbs[i][4], vdb4); + dbs[i][4] = MM_EXTRACT_F64(vdb4, 0); if (i + 1 < batch_size) { _mm_storeu_pd(&dbs[i + 1][0], vdb01_1); _mm_storeu_pd(&dbs[i + 1][2], vdb23_1); - _mm_storeh_pd(&dbs[i + 1][4], vdb4); + dbs[i + 1][4] = MM_EXTRACT_F64(vdb4, 1); } } } @@ -4579,42 +4579,42 @@ static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs) if (imask & 1) { _mm256_storeu_pd(&fcs[i]->dc[0], _mm512_castpd512_pd256(vdc0123_04)); - _mm_storel_pd(&fcs[i]->dc[4], _mm512_castpd512_pd128(vdc4)); + fcs[i]->dc[4] = MM512_EXTRACT_F64(vdc4, 0); } if (imask & (1 << 1)) { _mm256_storeu_pd(&fcs[i + 1]->dc[0], _mm512_castpd512_pd256(vdc0123_15)); - _mm_storeh_pd(&fcs[i + 1]->dc[4], _mm512_castpd512_pd128(vdc4)); + fcs[i + 1]->dc[4] = MM512_EXTRACT_F64(vdc4, 1); } if (imask & (1 << 2)) { _mm256_storeu_pd(&fcs[i + 2]->dc[0], _mm512_castpd512_pd256(vdc0123_26)); - _mm_storel_pd(&fcs[i + 2]->dc[4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc4), 1)); + fcs[i + 2]->dc[4] = MM512_EXTRACT_F64(vdc4, 2); } if (imask & (1 << 3)) { _mm256_storeu_pd(&fcs[i + 3]->dc[0], _mm512_castpd512_pd256(vdc0123_37)); - _mm_storeh_pd(&fcs[i + 3]->dc[4], _mm256_extractf128_pd(_mm512_castpd512_pd256(vdc4), 1)); + fcs[i + 3]->dc[4] = MM512_EXTRACT_F64(vdc4, 3); } if (imask & (1 << 4)) { _mm256_storeu_pd(&fcs[i + 4]->dc[0], _mm512_extractf64x4_pd(vdc0123_04, 1)); - _mm_storel_pd(&fcs[i + 4]->dc[4], _mm512_extractf64x2_pd(vdc4, 2)); + fcs[i + 4]->dc[4] = MM512_EXTRACT_F64(vdc4, 4); } if (imask & (1 << 5)) { _mm256_storeu_pd(&fcs[i + 5]->dc[0], _mm512_extractf64x4_pd(vdc0123_15, 1)); - _mm_storeh_pd(&fcs[i + 5]->dc[4], _mm512_extractf64x2_pd(vdc4, 2)); + fcs[i + 5]->dc[4] = MM512_EXTRACT_F64(vdc4, 5); } if (imask & (1 << 6)) { _mm256_storeu_pd(&fcs[i + 6]->dc[0], _mm512_extractf64x4_pd(vdc0123_26, 1)); - _mm_storel_pd(&fcs[i + 6]->dc[4], _mm512_extractf64x2_pd(vdc4, 3)); + fcs[i + 6]->dc[4] = MM512_EXTRACT_F64(vdc4, 6); } if (imask & (1 << 7)) { _mm256_storeu_pd(&fcs[i + 7]->dc[0], _mm512_extractf64x4_pd(vdc0123_37, 1)); - _mm_storeh_pd(&fcs[i + 7]->dc[4], _mm512_extractf64x2_pd(vdc4, 3)); + fcs[i + 7]->dc[4] = MM512_EXTRACT_F64(vdc4, 7); } } } @@ -4749,22 +4749,22 @@ static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs) if (imask & 1) { _mm256_storeu_pd(&fcs[i]->dc[0], vdc0123_0); - _mm_storel_pd(&fcs[i]->dc[4], _mm256_castpd256_pd128(vdc4)); + fcs[i]->dc[4] = MM256_EXTRACT_F64(vdc4, 0); } if (imask & (1 << 1)) { _mm256_storeu_pd(&fcs[i + 1]->dc[0], vdc0123_1); - _mm_storeh_pd(&fcs[i + 1]->dc[4], _mm256_castpd256_pd128(vdc4)); + fcs[i + 1]->dc[4] = MM256_EXTRACT_F64(vdc4, 1); } if (imask & (1 << 2)) { _mm256_storeu_pd(&fcs[i + 2]->dc[0], vdc0123_2); - _mm_storel_pd(&fcs[i + 2]->dc[4], _mm256_extractf128_pd(vdc4, 1)); + fcs[i + 2]->dc[4] = MM256_EXTRACT_F64(vdc4, 2); } if (imask & (1 << 3)) { _mm256_storeu_pd(&fcs[i + 3]->dc[0], vdc0123_3); - _mm_storeh_pd(&fcs[i + 3]->dc[4], _mm256_extractf128_pd(vdc4, 1)); + fcs[i + 3]->dc[4] = MM256_EXTRACT_F64(vdc4, 3); } } } @@ -4875,13 +4875,13 @@ static void recalc_filter_LPF_BW_batch(int batch_size, FilterCoefficients **fcs) if (imask & 1) { _mm_storeu_pd(&fcs[i]->dc[0], vdc01_0); _mm_storeu_pd(&fcs[i]->dc[2], vdc23_0); - _mm_storel_pd(&fcs[i]->dc[4], vdc4); + fcs[i]->dc[4] = MM_EXTRACT_F64(vdc4, 0); } if (imask & (1 << 1)) { _mm_storeu_pd(&fcs[i + 1]->dc[0], vdc01_1); _mm_storeu_pd(&fcs[i + 1]->dc[2], vdc23_1); - _mm_storeh_pd(&fcs[i + 1]->dc[4], vdc4); + fcs[i + 1]->dc[4] = MM_EXTRACT_F64(vdc4, 1); } } } diff --git a/timidity/optcode.h b/timidity/optcode.h index ca6443f8..65cffdd3 100644 --- a/timidity/optcode.h +++ b/timidity/optcode.h @@ -943,7 +943,7 @@ LSU : Unalignment (use loadu/storeu #define MM256_EXTRACT_I32(reg,idx) _mm256_extract_epi32(reg,idx) #define MM512_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_permute_ps(_mm512_extractf32x4_ps(reg, idx >> 2), idx & 3))) #define MM512_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_permute_pd(_mm512_extractf64x2_pd(reg, idx >> 1), idx & 1)) -#define MM512_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_bsrli_si128(_mm512_extracti32x4_epi32(reg, idx >> 2), (idx & 3) * 4)) +#define MM512_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(_mm512_extracti32x4_epi32(reg, idx >> 2), idx & 3)) #else #define MM_EXTRACT_F32(reg,idx) reg.m128_f32[idx] #define MM_EXTRACT_F64(reg,idx) reg.m128d_f64[idx] -- 2.11.0