From a14491aa36fe817f29d2479ccd94070d1035deef Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 16 Aug 2016 10:03:23 +0000
Subject: [PATCH] [X86][SSE] Add support for combining target shuffles to
 PALIGNR byte rotations

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278787 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp                 | 77 +++++++++++++++-------
 test/CodeGen/X86/vector-shuffle-256-v16.ll         |  8 +--
 test/CodeGen/X86/vector-shuffle-256-v4.ll          |  6 +-
 test/CodeGen/X86/vector-shuffle-combining-ssse3.ll | 14 ++++
 4 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bc5e1e6b72e..04b068f834c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7747,13 +7747,8 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
 /// elements, and takes the low elements as the result. Note that while this is
 /// specified as a *right shift* because x86 is little-endian, it is a *left
 /// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
-                                              SDValue V1, SDValue V2,
-                                              ArrayRef<int> Mask,
-                                              const X86Subtarget &Subtarget,
-                                              SelectionDAG &DAG) {
-  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
-
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+                                          ArrayRef<int> Mask) {
   int NumElts = Mask.size();
   int NumLanes = VT.getSizeInBits() / 128;
   int NumLaneElts = NumElts / NumLanes;
@@ -7769,20 +7764,28 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
   SDValue Lo, Hi;
   for (int l = 0; l < NumElts; l += NumLaneElts) {
     for (int i = 0; i < NumLaneElts; ++i) {
-      if (Mask[l + i] < 0)
+      int M = Mask[l + i];
+
+      if (M == SM_SentinelUndef)
         continue;
+      if (M == SM_SentinelZero)
+        return -1;
+
+      assert(0 <= M && M < (2*NumElts) && "Unexpected mask index.");
+
       // Get the mod-Size index and lane correct it.
-      int LaneIdx = (Mask[l + i] % NumElts) - l;
+      int LaneIdx = (M % NumElts) - l;
+
       // Make sure it was in this lane.
       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
-        return SDValue();
+        return -1;
 
       // Determine where a rotated vector would have started.
       int StartIdx = i - LaneIdx;
       if (StartIdx == 0)
         // The identity rotation isn't interesting, stop.
-        return SDValue();
+        return -1;
 
       // If we found the tail of a vector the rotation must be the missing
       // front. If we found the head of a vector, it must be how much of the
@@ -7793,10 +7796,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
         Rotation = CandidateRotation;
       else if (Rotation != CandidateRotation)
         // The rotations don't match, so we can't match this mask.
-        return SDValue();
+        return -1;
 
       // Compute which value this mask is pointing at.
-      SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+      SDValue MaskV = M < NumElts ? V1 : V2;
 
       // Compute which of the two target values this index should be assigned
       // to. This reflects whether the high elements are remaining or the low
@@ -7810,7 +7813,7 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
       else if (TargetV != MaskV)
         // This may be a rotation, but it pulls from the inputs in some
         // unsupported interleaving.
-        return SDValue();
+        return -1;
     }
   }
 
@@ -7822,15 +7825,32 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
   else if (!Hi)
     Hi = Lo;
 
-  // Cast the inputs to i8 vector of correct length to match PALIGNR or
-  // PSLLDQ/PSRLDQ.
-  MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
-  Lo = DAG.getBitcast(ByteVT, Lo);
-  Hi = DAG.getBitcast(ByteVT, Hi);
+  V1 = Lo;
+  V2 = Hi;
 
   // The actual rotate instruction rotates bytes, so we need to scale the
   // rotation based on how many bytes are in the vector lane.
   int Scale = 16 / NumLaneElts;
+  return Rotation * Scale;
+}
+
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+                                              SDValue V1, SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              const X86Subtarget &Subtarget,
+                                              SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+  SDValue Lo = V1, Hi = V2;
+  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+  if (ByteRotation <= 0)
+    return SDValue();
+
+  // Cast the inputs to i8 vector of correct length to match PALIGNR or
+  // PSLLDQ/PSRLDQ.
+  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+  Lo = DAG.getBitcast(ByteVT, Lo);
+  Hi = DAG.getBitcast(ByteVT, Hi);
 
   // SSSE3 targets can use the palignr instruction.
   if (Subtarget.hasSSSE3()) {
@@ -7838,7 +7858,7 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
            "512-bit PALIGNR requires BWI instructions");
     return DAG.getBitcast(
         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
-                        DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
+                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
   }
 
   assert(VT.is128BitVector() &&
@@ -7849,8 +7869,8 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
          "SSE2 rotate lowering only needed for v16i8!");
 
   // Default SSE2 implementation
-  int LoByteShift = 16 - Rotation * Scale;
-  int HiByteShift = Rotation * Scale;
+  int LoByteShift = 16 - ByteRotation;
+  int HiByteShift = ByteRotation;
 
   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
@@ -25198,6 +25218,19 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                             unsigned &Shuffle, MVT &ShuffleVT,
                                             unsigned &PermuteImm) {
   unsigned NumMaskElts = Mask.size();
+  bool FloatDomain = MaskVT.isFloatingPoint();
+
+  // Attempt to match against PALIGNR byte rotate.
+  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+    if (0 < ByteRotation) {
+      Shuffle = X86ISD::PALIGNR;
+      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
+      PermuteImm = ByteRotation;
+      return true;
+    }
+  }
 
   // Attempt to blend with zero.
   if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dce66d8449e..c7d9ca86102 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -2971,9 +2971,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -3215,9 +3213,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
-; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 22b6e28fae1..f3b6891349c 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -576,7 +576,7 @@ define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0112:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -748,7 +748,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0412:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
@@ -775,7 +775,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_4012:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index d0ad07d6d07..3ef956f1c61 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -184,6 +184,20 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
   ret <16 x i8> %2
 }
 
+define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_palignr:
+; SSE:       # BB#0:
+; SSE-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufb_as_palignr:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX-NEXT:    retq
+  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0>)
+  ret <16 x i8> %res0
+}
+
 define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_pslldq:
 ; SSE:       # BB#0:
-- 
2.11.0
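
Reviewer note (not part of the commit; text after the signature marker above is ignored by git am): the following is a minimal, self-contained C++ sketch of the matching idea behind matchVectorShuffleAsByteRotate, reduced to a single 128-bit lane and omitting the Lo/Hi input-tracking and zeroable-element handling that the real function performs. The name matchByteRotate and the std::vector-based interface are illustrative assumptions, not LLVM API.

#include <cstdio>
#include <vector>

// Illustrative single-lane byte-rotation matcher. Mask entries in
// [0, NumElts) select from the first input, [NumElts, 2*NumElts) from the
// second, and negative entries are undef. Returns the PALIGNR byte
// immediate, or -1 if the mask is not a (non-identity) byte rotation.
static int matchByteRotate(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  int Rotation = 0; // 0 means "not yet determined".
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef elements are compatible with any rotation.

    // Where would a rotated vector containing this element have started?
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // The identity rotation isn't interesting.

    // The tail of a source vector implies the rotation is the missing
    // front; a head implies how much of that vector is in the result.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // Inconsistent rotation amounts: not a rotation.
  }
  if (Rotation == 0)
    return -1; // All-undef mask: nothing to match.

  // PALIGNR rotates bytes, so scale the element rotation to bytes
  // (16 bytes per 128-bit lane).
  return Rotation * (16 / NumElts);
}

int main() {
  // A v16i8 shuffle taking elements <1,2,...,15,16> is a one-byte
  // rotation, i.e. palignr with immediate 1.
  std::vector<int> Mask;
  for (int i = 1; i <= 16; ++i)
    Mask.push_back(i);
  std::printf("byte rotation = %d\n", matchByteRotate(Mask)); // prints 1
  return 0;
}

This mirrors why matchBinaryPermuteVectorShuffle in the patch can feed the returned byte amount straight into PermuteImm for an X86ISD::PALIGNR node: the strictly-positive check (0 < ByteRotation) rejects both the identity rotation and the -1 failure value in one comparison.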