From a341a8070a5b389eacb11269c261e97c91103f79 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Wed, 6 Aug 2014 10:16:36 +0000 Subject: [PATCH] [x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 116 ++++++++++++++++++++---------- test/CodeGen/X86/vector-shuffle-128-v8.ll | 63 ++++++++-------- 2 files changed, 108 insertions(+), 71 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7bf58e20dac..ab87d172297 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7386,9 +7386,10 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. - auto fixInPlaceInputs = [&PSHUFDMask]( - ArrayRef InPlaceInputs, MutableArrayRef SourceHalfMask, - MutableArrayRef HalfMask, int HalfOffset) { + auto fixInPlaceInputs = + [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, + MutableArrayRef SourceHalfMask, + MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { @@ -7397,6 +7398,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. + for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = @@ -7408,10 +7417,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; - if (!HToLInputs.empty()) - fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); - if (!LToHInputs.empty()) - fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. @@ -7420,7 +7427,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, - int SourceOffset, int DestOffset) { + MutableArrayRef FinalSourceHalfMask, int SourceOffset, + int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; @@ -7498,18 +7506,68 @@ static SDValue lowerV8I16SingleInputVectorShuffle( } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { - int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; - assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && - "Not all dwords can be clobbered!"); - SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; - SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + // We have two non-adjacent or clobbered inputs we need to extract from + // the source half. To do this, we need to map them into some adjacent + // dword slot in the source mask. + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, + IncomingInputs[1] - SourceOffset}; + + // If there is a free slot in the source half mask adjacent to one of + // the inputs, place the other input in it. We use (Index XOR 1) to + // compute an adjacent index. + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && + SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + InputsFixed[1] = InputsFixed[0] ^ 1; + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && + SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; + InputsFixed[0] = InputsFixed[1] ^ 1; + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + // The two inputs are in the same DWord but it is clobbered and the + // adjacent DWord isn't used at all. Move both inputs to the free + // slot. + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; + } else { + // The only way we hit this point is if there is no clobbering + // (because there are no off-half inputs to this half) and there is no + // free slot adjacent to one of the inputs. In this case, we have to + // swap an input with a non-input. + for (int i = 0; i < 4; ++i) + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + "We can't handle any clobbers here!"); + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && + "Cannot have adjacent inputs here!"); + + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; + + // We also have to update the final source mask in this case because + // it may need to undo the above swap. + for (int &M : FinalSourceHalfMask) + if (M == (InputsFixed[0] ^ 1)) + M = InputsFixed[1]; + else if (M == InputsFixed[1]) + M = InputsFixed[0] ^ 1; + + InputsFixed[1] = InputsFixed[0] ^ 1; + } + + // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) - M = SourceDWordBase + SourceOffset; + M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) - M = SourceDWordBase + 1 + SourceOffset; - IncomingInputs[0] = SourceDWordBase + SourceOffset; - IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + M = InputsFixed[1] + SourceOffset; + + IncomingInputs[0] = InputsFixed[0] + SourceOffset; + IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); @@ -7524,9 +7582,9 @@ static SDValue lowerV8I16SingleInputVectorShuffle( if (M == Input) M = FreeDWord * 2 + Input % 2; }; - moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); - moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their @@ -19391,26 +19449,6 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, // Other-half shuffles are no-ops. continue; - - case X86ISD::PSHUFD: { - // We can only handle pshufd if the half we are combining either stays in - // its half, or switches to the other half. Bail if one of these isn't - // true. - SmallVector VMask = getPSHUFShuffleMask(V); - int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; - if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || - (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) - return false; - - // Map the mask through the pshufd and keep walking up the chain. - for (int i = 0; i < 4; ++i) - Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; - - // Switch halves if the pshufd does. - CombineOpcode = - VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; - continue; - } } // Break out of the loop if we break out of the switch. break; diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index ffe537fc351..b10c3b6c3d2 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -173,9 +173,9 @@ define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @shuffle_v8i16_26405173 ; SSE2: # BB#0: ; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4] ; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7] ; SSE2-NEXT: retq ; @@ -190,9 +190,9 @@ define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @shuffle_v8i16_20645173 ; SSE2: # BB#0: ; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4] ; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7] ; SSE2-NEXT: retq ; @@ -207,9 +207,9 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @shuffle_v8i16_26401375 ; SSE2: # BB#0: ; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4] ; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2] -; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: @shuffle_v8i16_26401375 @@ -404,9 +404,8 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @shuffle_v8i16_45630127 ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3] -; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,3,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: @shuffle_v8i16_45630127 @@ -652,8 +651,9 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @shuffle_v8i16_XXX1X579 ; SSE2: # BB#0: -; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1] @@ -663,36 +663,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; ; SSSE3-LABEL: @shuffle_v8i16_XXX1X579 ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+}},2,3,10,11,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}] ; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7] ; SSSE3-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { -; FIXME-SSE2-LABEL: @shuffle_v8i16_XX4X8acX -; FIXME-SSE2: # BB#0: -; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3] -; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7] -; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3] -; FIXME-SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,0,2,4,5,6,7] -; FIXME-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1] -; FIXME-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7] -; FIXME-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,7] -; FIXME-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v8i16_XX4X8acX +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,2,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,4,7] +; SSE2-NEXT: retq ; -; FIXME-SSSE3-LABEL: @shuffle_v8i16_XX4X8acX -; FIXME-SSSE3: # BB#0: -; FIXME-SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3] -; FIXME-SSSE3-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,5,7,6,7] -; FIXME-SSSE3-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3] -; FIXME-SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; FIXME-SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},0,1,{{[0-9]+,[0-9]+}},2,3,6,7,10,11{{[0-9]+,[0-9]+}}] -; FIXME-SSSE3-NEXT: retq +; SSSE3-LABEL: @shuffle_v8i16_XX4X8acX +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*}} # [[X:xmm[0-9]+]] = xmm0[2,1,2,3] +; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},2,3,{{[0-9]+,[0-9]+}},0,1,4,5,8,9,{{[0-9]+,[0-9]+}}] +; SSSE3-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } -- 2.11.0