From 49f31255be201c5fe03debfd729b3011e65619df Mon Sep 17 00:00:00 2001
From: Adam Nemet
Date: Thu, 11 Sep 2014 16:51:10 +0000
Subject: [PATCH] [AVX512] Fix miscompile for unpack

r189189 implemented AVX512 unpack by essentially performing a 256-bit
unpack between the low and the high 256 bits of src1 into the low part
of the destination and another unpack of the low and high 256 bits of
src2 into the high part of the destination.

I don't think that's how unpack works.  AVX512 unpack simply has more
128-bit lanes, but other than that it works the same way as AVX.  So in
each 128-bit lane, we're always interleaving certain parts of both
operands rather than different parts of one of the operands.

E.g. for this:

  __v16sf a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  __v16sf b = { 16, 17, 18, 19, 20, 21, 22, 23,
                24, 25, 26, 27, 28, 29, 30, 31 };
  __v16sf c = __builtin_shufflevector(a, b, 0, 8, 1, 9, 4, 12, 5, 13,
                                      16, 24, 17, 25, 20, 28, 21, 29);

we generated punpcklps (notice how the elements of a and b are not
interleaved in the shuffle).  In turn, c was set to this:

  0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29

Obviously, since element i of the concatenation of a and b is simply the
value i, c should just have matched the shuffle mask:

  0 8 1 9 4 12 5 13 16 24 17 25 20 28 21 29
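To make the per-lane semantics concrete, here is a scalar model of what
a 512-bit low unpack (vunpcklps-style) actually computes.  This is an
illustration only, not part of the patch, and the helper name is
invented:

  /* Illustrative sketch, not from the patch: 512 bits = 4 independent
     128-bit lanes of 4 floats each; within each lane, interleave the
     low two elements of a and b. */
  static void unpacklo_ps_512(const float a[16], const float b[16],
                              float c[16]) {
    for (int lane = 0; lane < 4; ++lane) {
      int base = 4 * lane;
      c[base + 0] = a[base + 0];
      c[base + 1] = b[base + 0];
      c[base + 2] = a[base + 1];
      c[base + 3] = b[base + 1];
    }
  }

With the a and b above this yields exactly the interleaved sequence
0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 shown earlier, which is why
selecting punpcklps for the non-interleaved mask was a miscompile.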
I mostly reverted this change and made sure the original AVX code worked
for 512-bit vectors as well.

Also updated the tests because they matched the logic from the code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217602 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 93 +++++++++++++++----------------------
 test/CodeGen/X86/avx512-shuffle.ll | 20 ++++--
 2 files changed, 53 insertions(+), 60 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0448d5c43da..f04a7810a13 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4259,43 +4259,34 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
 
   assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckl");
 
-  // AVX defines UNPCK* to operate independently on 128-bit lanes.
-  unsigned NumLanes;
-  unsigned NumOf256BitLanes;
   unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector()) {
-    if (NumElts != 4 && NumElts != 8 &&
-        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
-    NumLanes = 2;
-    NumOf256BitLanes = 1;
-  } else if (VT.is512BitVector()) {
-    assert(VT.getScalarType().getSizeInBits() >= 32 &&
-           "Unsupported vector type for unpckh");
-    NumLanes = 2;
-    NumOf256BitLanes = 2;
-  } else {
-    NumLanes = 1;
-    NumOf256BitLanes = 1;
-  }
 
-  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
-  unsigned NumLaneElts = NumEltsInStride/NumLanes;
+  assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
+         "Unsupported vector type for unpckh");
 
-  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
-    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
-      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-        int BitI = Mask[l256*NumEltsInStride+l+i];
-        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
-        if (!isUndefOrEqual(BitI, j+l256*NumElts))
-          return false;
-        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+      int BitI = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) {
+        if (!isUndefOrEqual(BitI1, NumElts))
           return false;
-        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
+      } else {
+        if (!isUndefOrEqual(BitI1, j + NumElts))
           return false;
       }
     }
   }
+
   return true;
 }
@@ -4306,39 +4297,29 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
   assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckh");
 
-  // AVX defines UNPCK* to operate independently on 128-bit lanes.
-  unsigned NumLanes;
-  unsigned NumOf256BitLanes;
   unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector()) {
-    if (NumElts != 4 && NumElts != 8 &&
-        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
-    NumLanes = 2;
-    NumOf256BitLanes = 1;
-  } else if (VT.is512BitVector()) {
-    assert(VT.getScalarType().getSizeInBits() >= 32 &&
-           "Unsupported vector type for unpckh");
-    NumLanes = 2;
-    NumOf256BitLanes = 2;
-  } else {
-    NumLanes = 1;
-    NumOf256BitLanes = 1;
-  }
 
-  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
-  unsigned NumLaneElts = NumEltsInStride/NumLanes;
+  assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
+         "Unsupported vector type for unpckh");
 
-  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
-    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
-      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-        int BitI = Mask[l256*NumEltsInStride+l+i];
-        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
-        if (!isUndefOrEqual(BitI, j+l256*NumElts))
-          return false;
-        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+      int BitI = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) {
+        if (isUndefOrEqual(BitI1, NumElts))
          return false;
-        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
+      } else {
+        if (!isUndefOrEqual(BitI1, j+NumElts))
          return false;
      }
    }
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 964971e6f62..9d1e3441ce4 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -255,7 +255,10 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
 ; CHECK: vpunpckhdq %zmm
 ; CHECK: ret
 define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 2, i32 18,
+                    i32 3, i32 19, i32 6, i32 22, i32 7, i32 23,
+                    i32 10, i32 26, i32 11, i32 27, i32 14, i32 30,
+                    i32 15, i32 31>
  ret <16 x i32> %b
 }
 
@@ -263,7 +266,10 @@ define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
 ; CHECK: vpunpckldq %zmm
 ; CHECK: ret
 define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 0, i32 16,
+                    i32 1, i32 17, i32 4, i32 20, i32 5, i32 21,
+                    i32 8, i32 24, i32 9, i32 25, i32 12, i32 28,
+                    i32 13, i32 29>
  ret <16 x i32> %b
 }
 
@@ -271,7 +277,10 @@ define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
 ; CHECK: vpunpckhqdq %zmm
 ; CHECK: ret
 define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
- %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32> <i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
+ %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32> <i32 1, i32 9,
+                    i32 3, i32 11,
+                    i32 5, i32 13,
+                    i32 7, i32 15>
  ret <8 x i64> %b
 }
 
@@ -279,7 +288,10 @@ define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
 ; CHECK: vunpcklps %zmm
 ; CHECK: ret
 define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 0, i32 16,
+                    i32 1, i32 17, i32 4, i32 20, i32 5, i32 21,
+                    i32 8, i32 24, i32 9, i32 25, i32 12, i32 28,
+                    i32 13, i32 29>
  ret <16 x float> %b
 }
-- 
2.11.0