From 1029c7003f0785216ecd1a52838f67d24465f521 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Sat, 2 Aug 2014 10:39:15 +0000
Subject: [PATCH] [x86] Largely complete the use of PSHUFB in the new vector
 shuffle lowering with a small addition to it and adding PSHUFB combining.

There is one obvious place in the new vector shuffle lowering where we should
form PSHUFBs directly: when without them we will unpack a vector of i8s across
two different registers and do a potentially 4-way blend as i16s only to
re-pack them into i8s afterward. This is the crazy expensive fallback path for
i8 shuffles and we can just directly use pshufb here as it will always be
cheaper (the unpack and pack are two instructions, so even a single shuffle
between them hits our three instruction limit for forming PSHUFB).

However, this doesn't generate very good code in many cases, and it leaves a
bunch of common patterns not using PSHUFB. So this patch also adds support for
extracting a shuffle mask from PSHUFB in the X86 lowering code, and uses it to
handle PSHUFBs in the recursive shuffle combining. This allows us to combine
through them, combine multiple ones together, and generally produce
sufficiently high quality code.

Extracting the PSHUFB mask is annoyingly complex because it could be either
pre-legalization or post-legalization. At least this doesn't have to deal with
re-materialized constants. =] I've added decode routines to handle the
different patterns that show up at this level and we dispatch through them as
appropriate.

The two primary test cases are updated. For the v16 test case there is still a
lot of room for improvement. Since I was going through it systematically I
left behind a bunch of FIXME lines that I'm hoping to turn into ALL lines by
the end of this.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214628 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/Utils/X86ShuffleDecode.cpp  |  20 +-
 lib/Target/X86/Utils/X86ShuffleDecode.h    |   7 +
 lib/Target/X86/X86ISelLowering.cpp         | 125 ++++++++++--
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 314 ++++++++++++++++++-----------
 test/CodeGen/X86/vector-shuffle-128-v8.ll  |   3 +-
 5 files changed, 340 insertions(+), 129 deletions(-)

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 83ee12b450c..863da74efbc 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -208,7 +208,6 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
   }
 }
 
-/// \brief Decode PSHUFB masks stored in an LLVM Constant.
 void DecodePSHUFBMask(const ConstantDataSequential *C,
                       SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
@@ -240,6 +239,25 @@ void DecodePSHUFBMask(const ConstantDataSequential *C,
   }
 }
 
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  for (int i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    // For AVX vectors with 32 bytes the base of the shuffle is the half of
+    // the vector we're inside.
+    int Base = i < 16 ? 0 : 16;
+    // If the high bit (7) of the byte is set, the element is zeroed.
+    if (M & (1 << 7))
+      ShuffleMask.push_back(SM_SentinelZero);
+    else {
+      int Index = Base + M;
+      assert((Index >= 0 && (unsigned)Index < RawMask.size()) &&
+             "Out of bounds shuffle index for pshufb instruction!");
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
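Note: as a standalone sketch of the decode semantics above (not part of the
patch; the helper name and sentinel constant are hypothetical stand-ins for
LLVM's SM_SentinelZero machinery), each PSHUFB control byte either zeroes its
destination lane or selects a source byte from within the same 128-bit half
of the vector:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for LLVM's SM_SentinelZero.
    constexpr int kSentinelZero = -2;

    // Decode the control byte M for destination lane i of an NumBytes-wide
    // vector (16 for SSE, 32 for AVX2), mirroring DecodePSHUFBMask above.
    int decodePshufbByte(int i, uint8_t M, int NumBytes) {
      int Base = i < 16 ? 0 : 16; // PSHUFB never crosses a 128-bit half.
      if (M & 0x80)               // Bit 7 set: this lane becomes zero.
        return kSentinelZero;
      // The hardware consults only the low four bits within a half; like the
      // decode routine above, we assume the constant is already in range.
      int Index = Base + M;
      assert(Index < NumBytes && "Out of bounds shuffle index for pshufb!");
      return Index;
    }

For example, a control byte of 0x05 in lane 20 of a 32-byte vector decodes to
source index 21 (16 + 5), while 0x80 in any lane decodes to a zeroed lane.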
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 3ac0afe1e34..dd93c096b34 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -16,6 +16,7 @@
 #define X86_SHUFFLE_DECODE_H
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
 
 //===----------------------------------------------------------------------===//
 // Vector Mask Decoding
@@ -60,9 +61,15 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
 /// different datatypes and vector widths.
 void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode a PSHUFB mask from an IR-level vector constant.
 void DecodePSHUFBMask(const ConstantDataSequential *C,
                       SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
                           SmallVectorImpl<int> &ShuffleMask);
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1b65e741533..eb722daef7f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3408,6 +3408,7 @@ static bool MayFoldIntoStore(SDValue Op) {
 static bool isTargetShuffle(unsigned Opcode) {
   switch(Opcode) {
   default: return false;
+  case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
@@ -5186,6 +5187,67 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
     break;
+  case X86ISD::PSHUFB: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    while (MaskNode->getOpcode() == ISD::BITCAST)
+      MaskNode = MaskNode->getOperand(0);
+
+    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+      // If we have a build-vector, then things are easy.
+      EVT VT = MaskNode.getValueType();
+      assert(VT.isVector() &&
+             "Can't produce a non-vector with a build_vector!");
+      if (!VT.isInteger())
+        return false;
+
+      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
+
+      SmallVector<uint64_t, 32> RawMask;
+      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
+        auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
+        if (!CN)
+          return false;
+        APInt MaskElement = CN->getAPIntValue();
+
+        // We now have to decode the element which could be any integer size
+        // and extract each byte of it.
+        for (int j = 0; j < NumBytesPerElement; ++j) {
+          // Note that this is x86 and so always little endian: the low byte
+          // is the first byte of the mask.
+          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
+          MaskElement = MaskElement.lshr(8);
+        }
+      }
+      DecodePSHUFBMask(RawMask, Mask);
+      break;
+    }
+
+    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+    if (!MaskLoad)
+      return false;
+
+    SDValue Ptr = MaskLoad->getBasePtr();
+    if (Ptr->getOpcode() == X86ISD::Wrapper)
+      Ptr = Ptr->getOperand(0);
+
+    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+      return false;
+
+    if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
+      // FIXME: Support AVX-512 here.
+      if (!C->getType()->isVectorTy() ||
+          (C->getNumElements() != 16 && C->getNumElements() != 32))
+        return false;
+
+      assert(C->getType()->isVectorTy() && "Expected a vector constant.");
+      DecodePSHUFBMask(C, Mask);
+      break;
+    }
+
+    return false;
+  }
   case X86ISD::VPERMI:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -7843,6 +7905,37 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
   }
 
+  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+  // with PSHUFB. It is important to do this before we attempt to generate any
+  // blends but after all of the single-input lowerings. If the single input
+  // lowerings can find an instruction sequence that is faster than a PSHUFB,
+  // we want to preserve that and we can DAG combine any longer sequences into
+  // a PSHUFB in the end. But once we start blending from multiple inputs,
+  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+  // and there are *very* few patterns that would actually be faster than the
+  // PSHUFB approach because of its ability to zero lanes.
+  //
+  // FIXME: The only exceptions to the above are blends which are exact
+  // interleavings with direct instructions supporting them. We currently
+  // don't handle those well here.
+  if (Subtarget->hasSSSE3()) {
+    SDValue V1Mask[16];
+    SDValue V2Mask[16];
+    for (int i = 0; i < 16; ++i)
+      if (Mask[i] == -1) {
+        V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+      } else {
+        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
+        V2Mask[i] =
+            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+      }
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+  }
+
   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
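Note: the blend construction above is easiest to see on plain arrays. A rough
standalone sketch (not part of the patch; the helper name is hypothetical) of
how the two PSHUFB control masks are derived from a two-input v16i8 shuffle
mask:

    #include <cstdint>

    // Entries 0-15 of Mask pick bytes from V1, entries 16-31 from V2, and -1
    // is undef. Each source shuffles only the lanes it owns and zeroes the
    // rest (0x80), so OR-ing the two PSHUFB results yields the blend.
    void buildPshufbBlendMasks(const int Mask[16], uint8_t V1Mask[16],
                               uint8_t V2Mask[16]) {
      for (int i = 0; i < 16; ++i) {
        if (Mask[i] < 0) {
          V1Mask[i] = V2Mask[i] = 0x80; // Undef lane: zero in both sources.
        } else if (Mask[i] < 16) {
          V1Mask[i] = (uint8_t)Mask[i]; // Lane comes from V1...
          V2Mask[i] = 0x80;             // ...so V2 contributes zero here.
        } else {
          V1Mask[i] = 0x80;
          V2Mask[i] = (uint8_t)(Mask[i] - 16);
        }
      }
    }

This zero-lane trick is why no separate blend instruction is needed: the two
shuffled registers have disjoint nonzero lanes and a plain OR merges them.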
@@ -18712,7 +18805,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
 /// for this operation, or into a PSHUFB instruction which is a fully general
 /// instruction but should only be used to replace chains over a certain depth.
 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
-                                   int Depth, SelectionDAG &DAG,
+                                   int Depth, bool HasPSHUFB, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
@@ -18794,15 +18887,16 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
     }
   }
 
-  // Bail if we have fewer than 3 shuffle instructions in the chain.
-  if (Depth < 3)
+  // Don't try to re-form single instruction chains under any circumstances
+  // now that we've done encoding canonicalization for them.
+  if (Depth < 2)
     return false;
 
-  // If we have 3 or more shuffle instructions, we can replace them with
-  // a single PSHUFB instruction profitably. Intel's manuals suggest only using
-  // PSHUFB if doing so replacing 5 instructions, but in practice PSHUFB tends
-  // to be *very* fast so we're more aggressive.
-  if (Subtarget->hasSSSE3()) {
+  // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
+  // can replace them with a single PSHUFB instruction profitably. Intel's
+  // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
+  // in practice PSHUFB tends to be *very* fast so we're more aggressive.
+  if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
     SmallVector<SDValue, 16> PSHUFBMask;
     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
     int Ratio = 16 / Mask.size();
@@ -18861,7 +18955,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
 /// combining in this recursive walk.
 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
                                           ArrayRef<int> IncomingMask, int Depth,
-                                          SelectionDAG &DAG,
+                                          bool HasPSHUFB, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget *Subtarget) {
   // Bound the depth of our recursive combine because this is ultimately
@@ -18923,12 +19017,14 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
 
   // See if we can recurse into the operand to combine more things.
   switch (Op.getOpcode()) {
+  case X86ISD::PSHUFB:
+    HasPSHUFB = true;
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
     if (Op.getOperand(0).hasOneUse() &&
         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                      DAG, DCI, Subtarget))
+                                      HasPSHUFB, DAG, DCI, Subtarget))
       return true;
     break;
 
@@ -18938,7 +19034,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     // We can't check for single use, we have to check that this shuffle is the only user.
     if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                      DAG, DCI, Subtarget))
+                                      HasPSHUFB, DAG, DCI, Subtarget))
       return true;
     break;
   }
@@ -18962,7 +19058,8 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     Mask.swap(NewMask);
   }
 
-  return combineX86ShuffleChain(Op, Root, Mask, Depth, DAG, DCI, Subtarget);
+  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
+                                Subtarget);
 }
 
 /// \brief Get the PSHUF-style mask from PSHUF node.
@@ -19391,7 +19488,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
-                                      /*Depth*/ 1, DAG, DCI, Subtarget))
+                                      /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+                                      DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
   }
 
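Note: the heart of the recursive walk is composing the mask accumulated so far
with the mask of the shuffle node feeding it. A minimal standalone sketch of
that composition for same-width masks (not part of the patch; the function
name is hypothetical, and the real code also handles masks of different
granularity via a widening ratio):

    #include <vector>

    // Lane i of the combined shuffle reads whatever lane OpMask says
    // IncomingMask[i] reads; sentinel values (< 0) propagate unchanged.
    std::vector<int> composeShuffleMasks(const std::vector<int> &IncomingMask,
                                         const std::vector<int> &OpMask) {
      std::vector<int> Result(IncomingMask.size());
      for (size_t i = 0; i < IncomingMask.size(); ++i)
        Result[i] = IncomingMask[i] < 0 ? IncomingMask[i]
                                        : OpMask[IncomingMask[i]];
      return Result;
    }

For example, composing IncomingMask = {1,0,3,2} with OpMask = {2,3,0,1} gives
{3,2,1,0}, collapsing two shuffles into one.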
@@ -22431,6 +22529,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::UNPCKL:
   case X86ISD::MOVHLPS:
   case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4da7e42caab..49620e93d6f 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1,196 +1,284 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSSE3
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; FIXME: # BB#0:
+; FIXME-NEXT: punpcklbw %xmm0, %xmm0
+; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
+; ALL: # BB#0:
+; ALL-NEXT: punpcklbw %xmm0, %xmm0
+; ALL-NEXT: punpcklwd %xmm0, %xmm0
+; ALL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
+; ALL: # BB#0:
+; ALL-NEXT: punpcklbw %xmm0, %xmm0
+; ALL-NEXT: punpckhwd %xmm0, %xmm0
+; ALL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
+; ALL: # BB#0:
+; ALL-NEXT: punpcklbw %xmm0, %xmm0
+; ALL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; FIXME-LABEL: @shuffle_v16i8_0101010101010101
+; FIXME: # BB#0:
+; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: @shuffle_v16i8_0101010101010101
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_0101010101010101
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
-; CHECK-SSE2: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
+; ALL: punpcklbw %xmm1, %xmm0
+; ALL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw %xmm1, %xmm1
+; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: punpcklbw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
+; SSSE3: # BB#0:
+; SSSE3-NEXT: punpcklbw %xmm1, %xmm1
+; SSSE3-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSSE3-NEXT: punpcklbw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw %xmm1, %xmm2
+; SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
-; CHECK-SSE2: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw %xmm2, %xmm1
+; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
-; CHECK-SSE2: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
-; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpcklbw %xmm2, %xmm3
+; SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw %xmm2, %xmm4
+; SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
+; SSE2-NEXT: punpckhbw %xmm2, %xmm1
+; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; ALL-LABEL: @zext_to_v8i16_shuffle
+; ALL: pxor %xmm1, %xmm1
+; ALL-NEXT: punpcklbw %xmm1, %xmm0
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; ALL-LABEL: @zext_to_v4i32_shuffle
+; ALL: pxor %xmm1, %xmm1
+; ALL-NEXT: punpcklbw %xmm1, %xmm0
+; ALL-NEXT: punpcklbw %xmm1, %xmm0
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; FIXME-LABEL: @trunc_v4i32_shuffle
+; FIXME: # BB#0:
+; FIXME-NEXT: pand
+; FIXME-NEXT: packuswb %xmm0, %xmm0
+; FIXME-NEXT: packuswb %xmm0, %xmm0
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: @trunc_v4i32_shuffle
+; SSE2: # BB#0:
+; SSE2-NEXT: pand
+; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @trunc_v4i32_shuffle
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i8> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 575dc25f2ec..9a719c8c7ee 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -214,8 +214,7 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_26401375
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,14,15,10,11]
-; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
 ; SSSE3-NEXT: retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
   ret <8 x i16> %shuffle
-- 
2.11.0