#if (USE_X86_EXT_INTRIN >= 10)
/* Transpose an 8x2 matrix of doubles.
 * vin:  array of 8 __m128d (one 2-element row each).
 * vout: array of 2 __m512d (one 8-element row each).
 * vout must not alias vin.
 */
#define MM512_TRANSPOSE8X2_PD(vin, vout) do { \
	__m256d v01_02, v01_13, v01_46, v01_57; \
	__m512d v01_0246, v01_1357; \
	\
	/* pair the 128-bit rows into 256-bit vectors: {row0,row2}, {row1,row3}, ... */ \
	v01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[0]), (vin)[2], 1); \
	v01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[1]), (vin)[3], 1); \
	v01_46 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[4]), (vin)[6], 1); \
	v01_57 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[5]), (vin)[7], 1); \
	\
	/* fix: combine the LOCALS above (the old code referenced nonexistent
	 * vin01_* identifiers left over from the parameter-list refactor) */ \
	v01_0246 = _mm512_insertf64x4(_mm512_castpd256_pd512(v01_02), v01_46, 1); \
	v01_1357 = _mm512_insertf64x4(_mm512_castpd256_pd512(v01_13), v01_57, 1); \
	\
	/* interleave even/odd rows: lane k of the result is {row2k[c], row2k+1[c]} */ \
	(vout)[0] = MM512_UNPACKLO_PD(v01_0246, v01_1357); \
	(vout)[1] = MM512_UNPACKHI_PD(v01_0246, v01_1357); \
} while(0)
/* Transpose a 2x8 matrix of doubles (inverse of MM512_TRANSPOSE8X2_PD).
 * vin:  array of 2 __m512d (one 8-element row each).
 * vout: array of 8 __m128d (one 2-element row each).
 * vout must not alias vin.
 */
#define MM512_TRANSPOSE2X8_PD(vin, vout) do { \
	__m512d v0246_01, v1357_01; \
	/* per-128-bit-lane interleave: lane k of v0246_01 holds column 2k of both rows */ \
	v0246_01 = MM512_UNPACKLO_PD((vin)[0], (vin)[1]); \
	v1357_01 = MM512_UNPACKHI_PD((vin)[0], (vin)[1]); \
	\
	/* scatter the four 128-bit lanes of each vector to the output rows */ \
	(vout)[0] = _mm512_castpd512_pd128(v0246_01); \
	(vout)[1] = _mm512_castpd512_pd128(v1357_01); \
	(vout)[2] = _mm256_extractf128_pd(_mm512_castpd512_pd256(v0246_01), 1); \
	(vout)[3] = _mm256_extractf128_pd(_mm512_castpd512_pd256(v1357_01), 1); \
	(vout)[4] = _mm512_extractf64x2_pd(v0246_01, 2); \
	(vout)[5] = _mm512_extractf64x2_pd(v1357_01, 2); \
	(vout)[6] = _mm512_extractf64x2_pd(v0246_01, 3); \
	(vout)[7] = _mm512_extractf64x2_pd(v1357_01, 3); \
} while (0)
/* Transpose an 8x4 matrix of doubles.
 * vin:  array of 8 __m256d (one 4-element row each).
 * vout: array of 4 __m512d (one 8-element row each).
 * vout must not alias vin.
 */
#define MM512_TRANSPOSE8X4_PD(vin, vout) do { \
	__m512d v0123_02, v0123_13, v0123_46, v0123_57; \
	__m512d v01_0246, v01_1357, v23_0246, v23_1357; \
	\
	/* pair the 256-bit rows into 512-bit vectors: {row0,row2}, {row1,row3}, ... */ \
	v0123_02 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[0]), (vin)[2], 1); \
	v0123_13 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[1]), (vin)[3], 1); \
	v0123_46 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[4]), (vin)[6], 1); \
	v0123_57 = _mm512_insertf64x4(_mm512_castpd256_pd512((vin)[5]), (vin)[7], 1); \
	\
	/* gather matching 128-bit lanes: columns {0,1} of all even rows, etc. */ \
	v01_0246 = _mm512_shuffle_f64x2(v0123_02, v0123_46, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	v01_1357 = _mm512_shuffle_f64x2(v0123_13, v0123_57, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	v23_0246 = _mm512_shuffle_f64x2(v0123_02, v0123_46, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	v23_1357 = _mm512_shuffle_f64x2(v0123_13, v0123_57, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	\
	/* final per-lane interleave of even-row and odd-row elements */ \
	(vout)[0] = MM512_UNPACKLO_PD(v01_0246, v01_1357); \
	(vout)[1] = MM512_UNPACKHI_PD(v01_0246, v01_1357); \
	(vout)[2] = MM512_UNPACKLO_PD(v23_0246, v23_1357); \
	(vout)[3] = MM512_UNPACKHI_PD(v23_0246, v23_1357); \
} while (0)
/* Transpose a 4x8 matrix of doubles (inverse of MM512_TRANSPOSE8X4_PD).
 * vin:  array of 4 __m512d (one 8-element row each).
 * vout: array of 8 __m256d (one 4-element row each).
 * vout must not alias vin.
 */
#define MM512_TRANSPOSE4X8_PD(vin, vout) do { \
	__m512d v0246_01, v1357_01, v0246_23, v1357_23; \
	__m512d v04_0123, v15_0123, v26_0123, v37_0123; \
	\
	/* per-128-bit-lane interleave of row pairs */ \
	v0246_01 = MM512_UNPACKLO_PD((vin)[0], (vin)[1]); \
	v1357_01 = MM512_UNPACKHI_PD((vin)[0], (vin)[1]); \
	v0246_23 = MM512_UNPACKLO_PD((vin)[2], (vin)[3]); \
	v1357_23 = MM512_UNPACKHI_PD((vin)[2], (vin)[3]); \
	\
	/* merge row pairs: mask 0xCC takes the upper element pair of each 256-bit
	 * half from the permuted source, 0x33 the lower pair */ \
	v04_0123 = _mm512_mask_permutex_pd(v0246_01, 0xCC, v0246_23, (1 << 6) | (0 << 4)); \
	v15_0123 = _mm512_mask_permutex_pd(v1357_01, 0xCC, v1357_23, (1 << 6) | (0 << 4)); \
	v26_0123 = _mm512_mask_permutex_pd(v0246_23, 0x33, v0246_01, (3 << 2) | 2); \
	v37_0123 = _mm512_mask_permutex_pd(v1357_23, 0x33, v1357_01, (3 << 2) | 2); \
	\
	/* low 256 bits hold output rows 0-3, high 256 bits rows 4-7 */ \
	(vout)[0] = _mm512_castpd512_pd256(v04_0123); \
	(vout)[1] = _mm512_castpd512_pd256(v15_0123); \
	(vout)[2] = _mm512_castpd512_pd256(v26_0123); \
	(vout)[3] = _mm512_castpd512_pd256(v37_0123); \
	(vout)[4] = _mm512_extractf64x4_pd(v04_0123, 1); \
	(vout)[5] = _mm512_extractf64x4_pd(v15_0123, 1); \
	(vout)[6] = _mm512_extractf64x4_pd(v26_0123, 1); \
	(vout)[7] = _mm512_extractf64x4_pd(v37_0123, 1); \
} while(0)
/* Transpose an 8x8 matrix of doubles.
 * vin:  array of 8 __m512d (one 8-element row each).
 * vout: array of 8 __m512d holding the transposed rows.
 * vout must not alias vin.
 */
#define MM512_TRANSPOSE8X8_PD(vin, vout) do { \
	__m512d v0246_01, v1357_01, v0246_23, v1357_23, v0246_45, v1357_45, v0246_67, v1357_67; \
	__m512d v04_0123, v26_0123, v15_0123, v37_0123, v04_4567, v26_4567, v15_4567, v37_4567; \
	\
	/* stage 1: per-128-bit-lane interleave of adjacent row pairs */ \
	v0246_01 = MM512_UNPACKLO_PD((vin)[0], (vin)[1]); \
	v1357_01 = MM512_UNPACKHI_PD((vin)[0], (vin)[1]); \
	v0246_23 = MM512_UNPACKLO_PD((vin)[2], (vin)[3]); \
	v1357_23 = MM512_UNPACKHI_PD((vin)[2], (vin)[3]); \
	v0246_45 = MM512_UNPACKLO_PD((vin)[4], (vin)[5]); \
	v1357_45 = MM512_UNPACKHI_PD((vin)[4], (vin)[5]); \
	v0246_67 = MM512_UNPACKLO_PD((vin)[6], (vin)[7]); \
	v1357_67 = MM512_UNPACKHI_PD((vin)[6], (vin)[7]); \
	\
	/* stage 2: gather matching 128-bit lanes across row-pair groups.
	 * fix: the previous version computed only 4 of these 8 intermediates,
	 * leaving v15_0123, v37_0123, v04_4567 and v26_4567 uninitialized. */ \
	v04_0123 = _mm512_shuffle_f64x2(v0246_01, v0246_23, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	v26_0123 = _mm512_shuffle_f64x2(v0246_01, v0246_23, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	v15_0123 = _mm512_shuffle_f64x2(v1357_01, v1357_23, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	v37_0123 = _mm512_shuffle_f64x2(v1357_01, v1357_23, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	v04_4567 = _mm512_shuffle_f64x2(v0246_45, v0246_67, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	v26_4567 = _mm512_shuffle_f64x2(v0246_45, v0246_67, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	v15_4567 = _mm512_shuffle_f64x2(v1357_45, v1357_67, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	v37_4567 = _mm512_shuffle_f64x2(v1357_45, v1357_67, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	\
	/* stage 3: combine the row-0123 and row-4567 halves into full columns */ \
	(vout)[0] = _mm512_shuffle_f64x2(v04_0123, v04_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	(vout)[4] = _mm512_shuffle_f64x2(v04_0123, v04_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	(vout)[1] = _mm512_shuffle_f64x2(v15_0123, v15_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	(vout)[5] = _mm512_shuffle_f64x2(v15_0123, v15_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	(vout)[2] = _mm512_shuffle_f64x2(v26_0123, v26_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	(vout)[6] = _mm512_shuffle_f64x2(v26_0123, v26_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
	(vout)[3] = _mm512_shuffle_f64x2(v37_0123, v37_4567, (2 << 6) | (0 << 4) | (2 << 2) | 0); \
	(vout)[7] = _mm512_shuffle_f64x2(v37_0123, v37_4567, (3 << 6) | (1 << 4) | (3 << 2) | 1); \
} while(0)
#endif // (USE_X86_EXT_INTRIN >= 10)
#if (USE_X86_EXT_INTRIN >= 8)
/* Transpose a 4x2 matrix of doubles.
 * vin:  array of 4 __m128d (one 2-element row each).
 * vout: array of 2 __m256d (one 4-element row each).
 * vout must not alias vin.
 */
#define MM256_TRANSPOSE4X2_PD(vin, vout) do { \
	__m256d v01_02, v01_13; \
	/* pair the rows: {row0,row2} and {row1,row3} */ \
	v01_02 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[0]), (vin)[2], 1); \
	v01_13 = _mm256_insertf128_pd(_mm256_castpd128_pd256((vin)[1]), (vin)[3], 1); \
	\
	/* per-lane interleave yields column 0 and column 1 of all four rows */ \
	(vout)[0] = MM256_UNPACKLO_PD(v01_02, v01_13); \
	(vout)[1] = MM256_UNPACKHI_PD(v01_02, v01_13); \
} while (0)
/* Transpose a 2x4 matrix of doubles (inverse of MM256_TRANSPOSE4X2_PD).
 * vin:  array of 2 __m256d (one 4-element row each).
 * vout: array of 4 __m128d (one 2-element row each).
 * vout must not alias vin.
 */
#define MM256_TRANSPOSE2X4_PD(vin, vout) do { \
	__m256d v02_01, v13_01; \
	/* per-128-bit-lane interleave: lane k holds column 2k resp. 2k+1 of both rows */ \
	v02_01 = MM256_UNPACKLO_PD((vin)[0], (vin)[1]); \
	v13_01 = MM256_UNPACKHI_PD((vin)[0], (vin)[1]); \
	\
	(vout)[0] = _mm256_castpd256_pd128(v02_01); \
	(vout)[1] = _mm256_castpd256_pd128(v13_01); \
	(vout)[2] = _mm256_extractf128_pd(v02_01, 1); \
	(vout)[3] = _mm256_extractf128_pd(v13_01, 1); \
} while(0)
/* Transpose a 4x4 matrix of doubles.
 * vin:  array of 4 __m256d (one 4-element row each).
 * vout: array of 4 __m256d holding the transposed rows.
 * vout must not alias vin.
 */
#define MM256_TRANSPOSE4X4_PD(vin, vout) do { \
	__m256d v01_02, v01_13, v23_02, v23_13; \
	/* swap 128-bit halves across row pairs: low lanes then high lanes */ \
	v01_02 = _mm256_insertf128_pd((vin)[0], _mm256_castpd256_pd128((vin)[2]), 1); \
	v01_13 = _mm256_insertf128_pd((vin)[1], _mm256_castpd256_pd128((vin)[3]), 1); \
	v23_02 = _mm256_permute2f128_pd((vin)[0], (vin)[2], (3 << 4) | 1); \
	v23_13 = _mm256_permute2f128_pd((vin)[1], (vin)[3], (3 << 4) | 1); \
	\
	/* per-lane interleave finishes the transpose */ \
	(vout)[0] = MM256_UNPACKLO_PD(v01_02, v01_13); \
	(vout)[1] = MM256_UNPACKHI_PD(v01_02, v01_13); \
	(vout)[2] = MM256_UNPACKLO_PD(v23_02, v23_13); \
	(vout)[3] = MM256_UNPACKHI_PD(v23_02, v23_13); \
} while(0)
#endif // (USE_X86_EXT_INTRIN >= 8)
#if (USE_X86_EXT_INTRIN >= 3)
/* Transpose a 2x2 matrix of doubles.
 * vin:  array of 2 __m128d (one 2-element row each).
 * vout: array of 2 __m128d holding the transposed rows.
 * Both inputs are read into temporaries before either output is written,
 * so vout may alias vin.
 */
#define MM_TRANSPOSE2X2_PD(vin, vout) do { \
	__m128d v0_01, v1_01; \
	v0_01 = MM_UNPACKLO_PD((vin)[0], (vin)[1]); \
	v1_01 = MM_UNPACKHI_PD((vin)[0], (vin)[1]); \
	\
	(vout)[0] = v0_01; \
	(vout)[1] = v1_01; \
} while(0)
/* Transpose a 4x2 matrix of doubles as two independent 2x2 transposes.
 * vin:    array of 4 __m128d (one 2-element row each).
 * vout01: array of 2 __m128d receiving columns 0-1 of rows 0-1 transposed.
 * vout23: array of 2 __m128d receiving columns 0-1 of rows 2-3 transposed.
 */
#define MM_TRANSPOSE4X2_PD(vin, vout01, vout23) do { \
	MM_TRANSPOSE2X2_PD(vin, vout01); \
	MM_TRANSPOSE2X2_PD(&(vin)[2], vout23); \
} while(0)
/* Transpose a 2x4 matrix of doubles as two independent 2x2 transposes.
 * vin01: array of 2 __m128d (columns 0-1 of rows 0 and 1).
 * vin23: array of 2 __m128d (columns 2-3 of rows 0 and 1).
 * vout:  array of 4 __m128d receiving the four transposed rows.
 * fix: the previous version passed three arguments (including stale
 * pre-refactor names vout0_01/vout2_01) to the two-argument
 * MM_TRANSPOSE2X2_PD macro, which could not expand.
 */
#define MM_TRANSPOSE2X4_PD(vin01, vin23, vout) do { \
	MM_TRANSPOSE2X2_PD(vin01, vout); \
	MM_TRANSPOSE2X2_PD(vin23, &(vout)[2]); \
} while(0)
#endif // (USE_X86_EXT_INTRIN >= 3)