From 28f299b62df4f9f5ee2d4cf19826ff420dc68dd5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 15 Feb 2015 13:19:52 +0000
Subject: [PATCH] [X86][AVX2] vpslldq/vpsrldq byte shifts for AVX2

This patch refactors the existing lowerVectorShuffleAsByteShift function to
add support for 256-bit vectors on AVX2 targets. It also fixes a TableGen
issue that prevented the lowering of the 256-bit vpslldq/vpsrldq
instructions.

Differential Revision: http://reviews.llvm.org/D7596

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229311 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 135 ++++++++++++++++-------------
 lib/Target/X86/X86InstrSSE.td              |   6 ++
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  34 ++++++++
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  34 ++++++++
 test/CodeGen/X86/vector-shuffle-256-v4.ll  |  30 +++++++
 test/CodeGen/X86/vector-shuffle-256-v8.ll  |  31 +++++++
 6 files changed, 208 insertions(+), 62 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b63911b006f..e539c390c98 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7834,12 +7834,10 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
 
 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
 ///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
 /// byte-shift instructions. The mask must consist of a shifted sequential
 /// shuffle from one of the input vectors and zeroable elements for the
 /// remaining 'shifted in' elements.
-///
-/// Note that this only handles 128-bit vector widths currently.
 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
                                              SDValue V2, ArrayRef<int> Mask,
                                              SelectionDAG &DAG) {
@@ -7847,63 +7845,56 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
 
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
-  int Size = Mask.size();
-  int Scale = 16 / Size;
-
-  for (int Shift = 1; Shift < Size; Shift++) {
-    int ByteShift = Shift * Scale;
-
-    // PSRLDQ : (little-endian) right byte shift
-    // [ 5, 6, 7, zz, zz, zz, zz, zz]
-    // [ -1, 5, 6, 7, zz, zz, zz, zz]
-    // [ 1, 2, -1, -1, -1, -1, zz, zz]
-    bool ZeroableRight = true;
-    for (int i = Size - Shift; i < Size; i++) {
-      ZeroableRight &= Zeroable[i];
-    }
-
-    if (ZeroableRight) {
-      bool ValidShiftRight1 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
-      bool ValidShiftRight2 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
-      if (ValidShiftRight1 || ValidShiftRight2) {
-        // Cast the inputs to v2i64 to match PSRLDQ.
-        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+  int Scale = 16 / NumLaneElts;
+  MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
+
+  // PSLLDQ : (little-endian) left byte shift
+  // [ zz,  0,  1,  2,  3,  4,  5,  6]
+  // [ zz, zz, -1, -1,  2,  3,  4, -1]
+  // [ zz, zz, zz, zz, zz, zz, -1,  1]
+  // PSRLDQ : (little-endian) right byte shift
+  // [  5,  6,  7, zz, zz, zz, zz, zz]
+  // [ -1,  5,  6,  7, zz, zz, zz, zz]
+  // [  1,  2, -1, -1, -1, -1, zz, zz]
+  auto MatchByteShift = [&](int Shift) -> SDValue {
+    bool MatchLeft = true, MatchRight = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      for (int i = 0; i < Shift; ++i)
+        MatchLeft &= Zeroable[l + i];
+      for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
+        MatchRight &= Zeroable[l + i];
     }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
 
-    // PSLLDQ : (little-endian) left byte shift
-    // [ zz, 0, 1, 2, 3, 4, 5, 6]
-    // [ zz, zz, -1, -1, 2, 3, 4, -1]
-    // [ zz, zz, zz, zz, zz, zz, -1, 1]
-    bool ZeroableLeft = true;
-    for (int i = 0; i < Shift; i++) {
-      ZeroableLeft &= Zeroable[i];
-    }
-
-    if (ZeroableLeft) {
-      bool ValidShiftLeft1 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
-      bool ValidShiftLeft2 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
-      if (ValidShiftLeft1 || ValidShiftLeft2) {
-        // Cast the inputs to v2i64 to match PSLLDQ.
-        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+    bool MatchV1 = true, MatchV2 = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      unsigned Pos = MatchLeft ? Shift + l : l;
+      unsigned Low = MatchLeft ? l : Shift + l;
+      unsigned Len = NumLaneElts - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
     }
-  }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    int ByteShift = Shift * Scale;
+    unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(Op, DL, ShiftVT, V,
+                    DAG.getConstant(ByteShift * 8, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
 
+  for (int Shift = 1; Shift < NumLaneElts; ++Shift)
+    if (SDValue S = MatchByteShift(Shift))
+      return S;
+
+  // No match.
   return SDValue();
 }
 
@@ -10674,12 +10665,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   }
-
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
   }
 
   // AVX2 provides a direct instruction for permuting a single input across
@@ -10688,6 +10673,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
@@ -10863,6 +10859,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v8i32, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10951,6 +10952,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v16i16, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11034,6 +11040,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v32i8, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d6849767705..dc54fc5c9ca 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4296,6 +4296,12 @@ let Predicates = [HasAVX2] in {
             (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zeros.
+  def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
+            (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
+            (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
 }
 
 let Predicates = [UseSSE2] in {
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 32d54c31db5..caec89baa96 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1349,6 +1349,40 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
+  ret <16 x i16> %shuffle
+}
+
 ;
 ; Shuffle to logical bit shifts
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 4c8c02ed033..07e464d4120 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1627,6 +1627,40 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
 ;
 ; Shuffle to logical bit shifts
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index e0ded7817c9..1707f9270a8 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -745,6 +745,36 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
   ret <4 x i64> %shuffle
 }
 
+define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_z4z6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_z4z6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_5zuz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_5zuz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
+  ret <4 x i64> %shuffle
+}
+
 define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; ALL-LABEL: stress_test1:
 ; ALL:       retq
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b9e8acf62a9..ce099659aaa 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1815,6 +1815,37 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x float> @splat_mem_v8f32_2(float* %p) {
 ; ALL-LABEL: splat_mem_v8f32_2:
 ; ALL:       # BB#0:
-- 
2.11.0
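
The refactor above hinges on one architectural detail: the 256-bit VPSLLDQ/VPSRLDQ
shift each 128-bit lane of the YMM register independently, and bytes never cross
the lane boundary. That is why MatchByteShift validates the zeroable and
sequential-run constraints once per lane rather than once over the whole mask.
The following standalone C++ sketch models that behaviour; it is illustrative
only and not part of the patch (the vpslldq256/vpsrldq256 helper names are
hypothetical), with the two examples mirroring the shuffle_v32i8 vpslldq test
and shuffle_v4i64_5zuz above.

#include <array>
#include <cstdint>
#include <cstdio>

using YMM = std::array<uint8_t, 32>; // one 256-bit register = two 16-byte lanes

// Model of VPSLLDQ ymm, imm8: within each lane, bytes move toward the high
// end; zeros are shifted in at the low end. Nothing crosses the lane boundary.
static YMM vpslldq256(const YMM &V, unsigned Imm) {
  YMM R{}; // value-initialized, so shifted-in bytes are already zero
  for (unsigned Lane = 0; Lane != 32; Lane += 16)
    for (unsigned i = Imm; i < 16; ++i)
      R[Lane + i] = V[Lane + i - Imm];
  return R;
}

// Model of VPSRLDQ ymm, imm8: the mirror image, moving bytes toward the low
// end of each lane.
static YMM vpsrldq256(const YMM &V, unsigned Imm) {
  YMM R{};
  for (unsigned Lane = 0; Lane != 32; Lane += 16)
    for (unsigned i = 0; i + Imm < 16; ++i)
      R[Lane + i] = V[Lane + i + Imm];
  return R;
}

int main() {
  YMM V;
  for (unsigned i = 0; i != 32; ++i)
    V[i] = uint8_t(i + 1); // bytes 1..32, so shifted-in zeros stand out

  // Imm = 8 mirrors shuffle_v4i64_5zuz: viewed as v4i64, each lane's high
  // element drops into the low slot and the high slot becomes zero.
  YMM R = vpsrldq256(V, 8);
  for (unsigned i = 0; i != 32; ++i)
    std::printf("%3u%c", R[i], i == 31 ? '\n' : ' ');

  // Imm = 15 mirrors the v32i8 vpslldq test: only byte 0 of each lane
  // survives, landing in byte 15 of that lane.
  YMM L = vpslldq256(V, 15);
  for (unsigned i = 0; i != 32; ++i)
    std::printf("%3u%c", L[i], i == 31 ? '\n' : ' ');
  return 0;
}

The same lane split explains the AVX1 codegen in the tests above: with no
256-bit integer byte shift available, the vector is cut apart with
vextractf128, each half is shifted as an XMM register, and the halves are
rejoined with vinsertf128, while on AVX2 a single lane-wise vpslldq/vpsrldq
suffices.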