
[optcode] Change transpose macros to accept arrays
author    Starg <starg@users.osdn.me>
          Tue, 4 May 2021 17:33:45 +0000 (02:33 +0900)
committer Starg <starg@users.osdn.me>
          Tue, 4 May 2021 17:33:45 +0000 (02:33 +0900)
timidity/optcode.h

index 44d512d..a35e434 100644
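
The diff below replaces each macro's per-vector parameters with a single input array and a single output array. For example, a call that previously spelled out every lane, e.g.

    MM512_TRANSPOSE8X2_PD(vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vo0, vo1);

now passes arrays instead (vi and vo here are hypothetical caller variables):

    __m128d vi[8];
    __m512d vo[2];
    MM512_TRANSPOSE8X2_PD(vi, vo);
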
@@ -994,96 +994,93 @@ LSU : Unalignment (use loadu/storeu
 
 #if (USE_X86_EXT_INTRIN >= 10)
 
-#define MM512_TRANSPOSE8X2_PD(vin01_0, vin01_1, vin01_2, vin01_3, vin01_4, vin01_5, vin01_6, vin01_7, vout0_01234567, vout1_01234567)  do { \
+#define MM512_TRANSPOSE8X2_PD(vin, vout)  do { \
                __m256d v01_02, v01_13, v01_46, v01_57; \
                __m512d v01_0246, v01_1357; \
                \
-               v01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin01_0)), (vin01_2), 1); \
-               v01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin01_1)), (vin01_3), 1); \
-               v01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin01_4)), (vin01_6), 1); \
-               v01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin01_5)), (vin01_7), 1); \
+               v01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[0]), (vin)[2], 1); \
+               v01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[1]), (vin)[3], 1); \
+               v01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[4]), (vin)[6], 1); \
+               v01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[5]), (vin)[7], 1); \
                \
 		v01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(v01_02), v01_46, 1); \
 		v01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(v01_13), v01_57, 1); \
                \
-               (vout0_01234567) = MM512_UNPACKLO_PD(v01_0246, v01_1357); \
-               (vout1_01234567) = MM512_UNPACKHI_PD(v01_0246, v01_1357); \
+               (vout)[0] = MM512_UNPACKLO_PD(v01_0246, v01_1357); \
+               (vout)[1] = MM512_UNPACKHI_PD(v01_0246, v01_1357); \
        } while(0)
 
-#define MM512_TRANSPOSE2X8_PD(vin01234567_0, vin01234567_1, vout0_01, vout1_01, vout2_01, vout3_01, vout4_01, vout5_01, vout6_01, vout7_01)  do { \
+#define MM512_TRANSPOSE2X8_PD(vin, vout)  do { \
                __m512d v0246_01, v1357_01; \
-               v0246_01 = MM512_UNPACKLO_PD((vin01234567_0), (vin01234567_1)); \
-               v1357_01 = MM512_UNPACKHI_PD((vin01234567_0), (vin01234567_1)); \
+               v0246_01 = MM512_UNPACKLO_PD((vin)[0], (vin)[1]); \
+               v1357_01 = MM512_UNPACKHI_PD((vin)[0], (vin)[1]); \
                \
-               (vout0_01) = _mm512_castpd512_pd128(v0246_01); \
-               (vout1_01) = _mm512_castpd512_pd128(v1357_01); \
-               (vout2_01) = _mm256_extractf128_pd(_mm512_castpd512_pd256(v0246_01), 1); \
-               (vout3_01) = _mm256_extractf128_pd(_mm512_castpd512_pd256(v1357_01), 1); \
-               (vout4_01) = _mm512_extractf64x2_pd(v0246_01, 2); \
-               (vout5_01) = _mm512_extractf64x2_pd(v1357_01, 2); \
-               (vout6_01) = _mm512_extractf64x2_pd(v0246_01, 3); \
-               (vout7_01) = _mm512_extractf64x2_pd(v1357_01, 3); \
+               (vout)[0] = _mm512_castpd512_pd128(v0246_01); \
+               (vout)[1] = _mm512_castpd512_pd128(v1357_01); \
+               (vout)[2] = _mm256_extractf128_pd(_mm512_castpd512_pd256(v0246_01), 1); \
+               (vout)[3] = _mm256_extractf128_pd(_mm512_castpd512_pd256(v1357_01), 1); \
+               (vout)[4] = _mm512_extractf64x2_pd(v0246_01, 2); \
+               (vout)[5] = _mm512_extractf64x2_pd(v1357_01, 2); \
+               (vout)[6] = _mm512_extractf64x2_pd(v0246_01, 3); \
+               (vout)[7] = _mm512_extractf64x2_pd(v1357_01, 3); \
        } while (0)
 
-#define MM512_TRANSPOSE8X4_PD(vin0123_0, vin0123_1, vin0123_2, vin0123_3, vin0123_4, vin0123_5, vin0123_6, vin0123_7, \
-       vout0_01234567, vout1_01234567, vout2_01234567, vout3_01234567) do { \
+#define MM512_TRANSPOSE8X4_PD(vin, vout) do { \
                __m512d v0123_02, v0123_13, v0123_46, v0123_57; \
                __m512d v01_0246, v01_1357, v23_0246, v23_1357; \
                \
-               v0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin0123_0)), (vin0123_2), 1); \
-               v0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin0123_1)), (vin0123_3), 1); \
-               v0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin0123_4)), (vin0123_6), 1); \
-               v0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin0123_5)), (vin0123_7), 1); \
+               v0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[0]), (vin)[2], 1); \
+               v0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[1]), (vin)[3], 1); \
+               v0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[4]), (vin)[6], 1); \
+               v0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[5]), (vin)[7], 1); \
                \
                v01_0246 = _mm512_shuffle_f64x2(v0123_02, v0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
                v01_1357 = _mm512_shuffle_f64x2(v0123_13, v0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
                v23_0246 = _mm512_shuffle_f64x2(v0123_02, v0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
                v23_1357 = _mm512_shuffle_f64x2(v0123_13, v0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
                \
-               (vout0_01234567) = MM512_UNPACKLO_PD(v01_0246, v01_1357); \
-               (vout1_01234567) = MM512_UNPACKHI_PD(v01_0246, v01_1357); \
-               (vout2_01234567) = MM512_UNPACKLO_PD(v23_0246, v23_1357); \
-               (vout3_01234567) = MM512_UNPACKHI_PD(v23_0246, v23_1357); \
+               (vout)[0] = MM512_UNPACKLO_PD(v01_0246, v01_1357); \
+               (vout)[1] = MM512_UNPACKHI_PD(v01_0246, v01_1357); \
+               (vout)[2] = MM512_UNPACKLO_PD(v23_0246, v23_1357); \
+               (vout)[3] = MM512_UNPACKHI_PD(v23_0246, v23_1357); \
        } while (0)
 
-#define MM512_TRANSPOSE4X8_PD(vin01234567_0, vin01234567_1, vin01234567_2, vin01234567_3, \
-       vout0_0123, vout1_0123, vout2_0123, vout3_0123, vout4_0123, vout5_0123, vout6_0123, vout7_0123) do { \
+#define MM512_TRANSPOSE4X8_PD(vin, vout) do { \
                __m512d v0246_01, v1357_01, v0246_23, v1357_23; \
                __m512d v04_0123, v15_0123, v26_0123, v37_0123; \
                \
-               v0246_01 = MM512_UNPACKLO_PD((vin01234567_0), (vin01234567_1)); \
-               v1357_01 = MM512_UNPACKHI_PD((vin01234567_0), (vin01234567_1)); \
-               v0246_23 = MM512_UNPACKLO_PD((vin01234567_2), (vin01234567_3)); \
-               v1357_23 = MM512_UNPACKHI_PD((vin01234567_2), (vin01234567_3)); \
+               v0246_01 = MM512_UNPACKLO_PD((vin)[0], (vin)[1]); \
+               v1357_01 = MM512_UNPACKHI_PD((vin)[0], (vin)[1]); \
+               v0246_23 = MM512_UNPACKLO_PD((vin)[2], (vin)[3]); \
+               v1357_23 = MM512_UNPACKHI_PD((vin)[2], (vin)[3]); \
                \
                v04_0123 = _mm512_mask_permutex_pd(v0246_01, 0xCC, v0246_23, (1 << 6) | (0 << 4)); \
                v15_0123 = _mm512_mask_permutex_pd(v1357_01, 0xCC, v1357_23, (1 << 6) | (0 << 4)); \
                v26_0123 = _mm512_mask_permutex_pd(v0246_23, 0x33, v0246_01, (3 << 2) | 2); \
                v37_0123 = _mm512_mask_permutex_pd(v1357_23, 0x33, v1357_01, (3 << 2) | 2); \
                \
-               (vout0_0123) = _mm512_castpd512_pd256(v04_0123); \
-               (vout1_0123) = _mm512_castpd512_pd256(v15_0123); \
-               (vout2_0123) = _mm512_castpd512_pd256(v26_0123); \
-               (vout3_0123) = _mm512_castpd512_pd256(v37_0123); \
-               (vout4_0123) = _mm512_extractf64x4_pd(v04_0123, 1); \
-               (vout5_0123) = _mm512_extractf64x4_pd(v15_0123, 1); \
-               (vout6_0123) = _mm512_extractf64x4_pd(v26_0123, 1); \
-               (vout7_0123) = _mm512_extractf64x4_pd(v37_0123, 1); \
+               (vout)[0] = _mm512_castpd512_pd256(v04_0123); \
+               (vout)[1] = _mm512_castpd512_pd256(v15_0123); \
+               (vout)[2] = _mm512_castpd512_pd256(v26_0123); \
+               (vout)[3] = _mm512_castpd512_pd256(v37_0123); \
+               (vout)[4] = _mm512_extractf64x4_pd(v04_0123, 1); \
+               (vout)[5] = _mm512_extractf64x4_pd(v15_0123, 1); \
+               (vout)[6] = _mm512_extractf64x4_pd(v26_0123, 1); \
+               (vout)[7] = _mm512_extractf64x4_pd(v37_0123, 1); \
        } while(0)
 
-#define MM512_TRANSPOSE8X8_PD(vin01234567_0, vin01234567_1, vin01234567_2, vin01234567_3, vin01234567_4, vin01234567_5, vin01234567_6, vin01234567_7, \
-       vout0_01234567, vout1_01234567, vout2_01234567, vout3_01234567, vout4_01234567, vout5_01234567, vout6_01234567, vout7_01234567) do { \
+#define MM512_TRANSPOSE8X8_PD(vin, vout) do { \
                __m512d v0246_01, v1357_01, v0246_23, v1357_23, v0246_45, v1357_45, v0246_67, v1357_67; \
                __m512d v04_0123, v26_0123, v15_0123, v37_0123, v04_4567, v26_4567, v15_4567, v37_4567; \
                \
-               v0246_01 = MM512_UNPACKLO_PD((vin01234567_0), (vin01234567_1)); \
-               v1357_01 = MM512_UNPACKHI_PD((vin01234567_0), (vin01234567_1)); \
-               v0246_23 = MM512_UNPACKLO_PD((vin01234567_2), (vin01234567_3)); \
-               v1357_23 = MM512_UNPACKHI_PD((vin01234567_2), (vin01234567_3)); \
-               v0246_45 = MM512_UNPACKLO_PD((vin01234567_4), (vin01234567_5)); \
-               v1357_45 = MM512_UNPACKHI_PD((vin01234567_4), (vin01234567_5)); \
-               v0246_67 = MM512_UNPACKLO_PD((vin01234567_6), (vin01234567_7)); \
-               v1357_67 = MM512_UNPACKHI_PD((vin01234567_6), (vin01234567_7)); \
+               v0246_01 = MM512_UNPACKLO_PD((vin)[0], (vin)[1]); \
+               v1357_01 = MM512_UNPACKHI_PD((vin)[0], (vin)[1]); \
+               v0246_23 = MM512_UNPACKLO_PD((vin)[2], (vin)[3]); \
+               v1357_23 = MM512_UNPACKHI_PD((vin)[2], (vin)[3]); \
+               v0246_45 = MM512_UNPACKLO_PD((vin)[4], (vin)[5]); \
+               v1357_45 = MM512_UNPACKHI_PD((vin)[4], (vin)[5]); \
+               v0246_67 = MM512_UNPACKLO_PD((vin)[6], (vin)[7]); \
+               v1357_67 = MM512_UNPACKHI_PD((vin)[6], (vin)[7]); \
                \
                v04_0123 = _mm512_shuffle_f64x2(v0246_01, v0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
                v26_0123 = _mm512_shuffle_f64x2(v0246_01, v0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
@@ -1094,74 +1091,74 @@ LSU : Unalignment (use loadu/storeu
                v15_4567 = _mm512_shuffle_f64x2(v1357_45, v1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
                v37_4567 = _mm512_shuffle_f64x2(v1357_45, v1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
                \
-               (vout0_01234567) = _mm512_shuffle_f64x2(v04_0123, v04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
-               (vout4_01234567) = _mm512_shuffle_f64x2(v04_0123, v04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
-               (vout1_01234567) = _mm512_shuffle_f64x2(v15_0123, v15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
-               (vout5_01234567) = _mm512_shuffle_f64x2(v15_0123, v15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
-               (vout2_01234567) = _mm512_shuffle_f64x2(v26_0123, v26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
-               (vout6_01234567) = _mm512_shuffle_f64x2(v26_0123, v26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
-               (vout3_01234567) = _mm512_shuffle_f64x2(v37_0123, v37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
-               (vout7_01234567) = _mm512_shuffle_f64x2(v37_0123, v37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
+               (vout)[0] = _mm512_shuffle_f64x2(v04_0123, v04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
+               (vout)[4] = _mm512_shuffle_f64x2(v04_0123, v04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
+               (vout)[1] = _mm512_shuffle_f64x2(v15_0123, v15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
+               (vout)[5] = _mm512_shuffle_f64x2(v15_0123, v15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
+               (vout)[2] = _mm512_shuffle_f64x2(v26_0123, v26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
+               (vout)[6] = _mm512_shuffle_f64x2(v26_0123, v26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
+               (vout)[3] = _mm512_shuffle_f64x2(v37_0123, v37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
+               (vout)[7] = _mm512_shuffle_f64x2(v37_0123, v37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
        } while(0)
 
 #endif // (USE_X86_EXT_INTRIN >= 10)
 
 #if (USE_X86_EXT_INTRIN >= 8)
 
-#define MM256_TRANSPOSE4X2_PD(vin01_0, vin01_1, vin01_2, vin01_3, vout0_0123, vout1_0123) do { \
+#define MM256_TRANSPOSE4X2_PD(vin, vout) do { \
                __m256d v01_02, v01_13; \
-               v01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin01_0)), (vin01_2), 1); \
-               v01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin01_1)), (vin01_3), 1); \
+               v01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[0]), (vin)[2], 1); \
+               v01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[1]), (vin)[3], 1); \
                \
-               (vout0_0123) = MM256_UNPACKLO_PD(v01_02, v01_13); \
-               (vout1_0123) = MM256_UNPACKHI_PD(v01_02, v01_13); \
+               (vout)[0] = MM256_UNPACKLO_PD(v01_02, v01_13); \
+               (vout)[1] = MM256_UNPACKHI_PD(v01_02, v01_13); \
        } while (0)
 
-#define MM256_TRANSPOSE2X4_PD(vin0123_0, vin0123_1, vout0_01, vout1_01, vout2_01, vout3_01) do { \
+#define MM256_TRANSPOSE2X4_PD(vin, vout) do { \
                __m256d v02_01, v13_01; \
-               v02_01 = MM256_UNPACKLO_PD((vin0123_0), (vin0123_1)); \
-               v13_01 = MM256_UNPACKHI_PD((vin0123_0), (vin0123_1)); \
+               v02_01 = MM256_UNPACKLO_PD((vin)[0], (vin)[1]); \
+               v13_01 = MM256_UNPACKHI_PD((vin)[0], (vin)[1]); \
                \
-               (vout0_01) = _mm256_castpd256_pd128(v02_01); \
-               (vout1_01) = _mm256_castpd256_pd128(v13_01); \
-               (vout2_01) = _mm256_extractf128_pd(v02_01, 1); \
-               (vout3_01) = _mm256_extractf128_pd(v13_01, 1); \
+               (vout)[0] = _mm256_castpd256_pd128(v02_01); \
+               (vout)[1] = _mm256_castpd256_pd128(v13_01); \
+               (vout)[2] = _mm256_extractf128_pd(v02_01, 1); \
+               (vout)[3] = _mm256_extractf128_pd(v13_01, 1); \
        } while(0)
 
-#define MM256_TRANSPOSE4X4_PD(vin0123_0, vin0123_1, vin0123_2, vin0123_3, vout0_0123, vout1_0123, vout2_0123, vout3_0123) do { \
+#define MM256_TRANSPOSE4X4_PD(vin, vout) do { \
                __m256d v01_02, v01_13, v23_02, v23_13; \
-               v01_02 = _mm256_insertf128_pd((vin0123_0), _mm256_castpd256_pd128((vin0123_2)), 1); \
-               v01_13 = _mm256_insertf128_pd((vin0123_1), _mm256_castpd256_pd128((vin0123_3)), 1); \
-               v23_02 = _mm256_permute2f128_pd((vin0123_0), (vin0123_2), (3 << 4) | 1); \
-               v23_13 = _mm256_permute2f128_pd((vin0123_1), (vin0123_3), (3 << 4) | 1); \
+               v01_02 = _mm256_insertf128_pd((vin)[0], _mm256_castpd256_pd128((vin)[2]), 1); \
+               v01_13 = _mm256_insertf128_pd((vin)[1], _mm256_castpd256_pd128((vin)[3]), 1); \
+               v23_02 = _mm256_permute2f128_pd((vin)[0], (vin)[2], (3 << 4) | 1); \
+               v23_13 = _mm256_permute2f128_pd((vin)[1], (vin)[3], (3 << 4) | 1); \
                \
-               (vout0_0123) = MM256_UNPACKLO_PD(v01_02, v01_13); \
-               (vout1_0123) = MM256_UNPACKHI_PD(v01_02, v01_13); \
-               (vout2_0123) = MM256_UNPACKLO_PD(v23_02, v23_13); \
-               (vout3_0123) = MM256_UNPACKHI_PD(v23_02, v23_13); \
+               (vout)[0] = MM256_UNPACKLO_PD(v01_02, v01_13); \
+               (vout)[1] = MM256_UNPACKHI_PD(v01_02, v01_13); \
+               (vout)[2] = MM256_UNPACKLO_PD(v23_02, v23_13); \
+               (vout)[3] = MM256_UNPACKHI_PD(v23_02, v23_13); \
        } while(0)
 
 #endif // (USE_X86_EXT_INTRIN >= 8)
 
 #if (USE_X86_EXT_INTRIN >= 3)
 
-#define MM_TRANSPOSE2X2_PD(vin01_0, vin01_1, vout0_01, vout1_01) do { \
+#define MM_TRANSPOSE2X2_PD(vin, vout) do { \
                __m128d v0_01, v1_01; \
-               v0_01 = MM_UNPACKLO_PD((vin01_0), (vin01_1)); \
-               v1_01 = MM_UNPACKHI_PD((vin01_0), (vin01_1)); \
+               v0_01 = MM_UNPACKLO_PD((vin)[0], (vin)[1]); \
+               v1_01 = MM_UNPACKHI_PD((vin)[0], (vin)[1]); \
                \
-               (vout0_01) = v0_01; \
-               (vout1_01) = v1_01; \
+               (vout)[0] = v0_01; \
+               (vout)[1] = v1_01; \
        } while(0)
 
-#define MM_TRANSPOSE4X2_PD(vin01_0, vin01_1, vin01_2, vin01_3, vout0_01, vout0_23, vout1_01, vout1_23) do { \
-               MM_TRANSPOSE2X2_PD(vin01_0, vin01_1, vout0_01, vout1_01); \
-               MM_TRANSPOSE2X2_PD(vin01_2, vin01_3, vout0_23, vout1_23); \
+#define MM_TRANSPOSE4X2_PD(vin, vout01, vout23) do { \
+		MM_TRANSPOSE2X2_PD(vin, vout01); \
+		MM_TRANSPOSE2X2_PD(&(vin)[2], vout23); \
        } while(0)
 
-#define MM_TRANSPOSE2X4_PD(vin01_0, vin23_0, vin01_1, vin23_1, vout0_01, vout1_01, vout2_01, vout3_01) do { \
-               MM_TRANSPOSE2X2_PD(vin01_0, vin01_1, vout0_01, vout1_01); \
-               MM_TRANSPOSE2X2_PD(vin23_0, vin23_1, vout2_01, vout3_01); \
+#define MM_TRANSPOSE2X4_PD(vin01, vin23, vout) do { \
+		MM_TRANSPOSE2X2_PD(vin01, vout); \
+		MM_TRANSPOSE2X2_PD(vin23, &(vout)[2]); \
        } while(0)
 
 #endif // (USE_X86_EXT_INTRIN >= 3)
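
For reference, a minimal usage sketch of the new array form, assuming optcode.h is included in an AVX-512 build (USE_X86_EXT_INTRIN >= 10); the function and buffer names are hypothetical, not part of this commit:

    #include <immintrin.h>
    #include "optcode.h"  /* assumed to provide the macros above */

    /* Transpose an 8x2 block of doubles: eight 2-element rows in,
     * two 8-element columns out. */
    static void transpose8x2(const double *in, double *out)
    {
        __m128d vin[8];
        __m512d vout[2];
        int i;

        for (i = 0; i < 8; i++)
            vin[i] = _mm_loadu_pd(in + i * 2);  /* row i = in[2i], in[2i+1] */

        MM512_TRANSPOSE8X2_PD(vin, vout);       /* vout[c] = column c across rows 0..7 */

        _mm512_storeu_pd(out,     vout[0]);
        _mm512_storeu_pd(out + 8, vout[1]);
    }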