From 878eaf48c11e4702a2cb5662e608cea2540a208c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 24 Oct 2015 22:45:04 +0000 Subject: [PATCH] [X86][SSE] Use lowerVectorShuffleWithUNPCK instead of custom matches. Most 128-bit and 256-bit shuffles were manually matching UNPCK patterns - use lowerVectorShuffleWithUNPCK to be more thorough. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251211 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 144 ++++++++--------------------- test/CodeGen/X86/vec_int_to_fp.ll | 2 +- test/CodeGen/X86/vector-sext.ll | 4 +- test/CodeGen/X86/vector-shuffle-256-v16.ll | 40 ++++---- test/CodeGen/X86/vector-zext.ll | 2 +- 5 files changed, 64 insertions(+), 128 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9fcff312e66..83c733a523d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8229,10 +8229,9 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, @@ -8324,10 +8323,9 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8532,14 +8530,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); @@ -8616,14 +8609,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -9207,10 +9195,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1); - if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, @@ -9254,10 +9241,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -9498,16 +9484,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 0, 16, 1, 17, 2, 18, 3, 19, - // High half. - 4, 20, 5, 21, 6, 22, 7, 23})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 8, 24, 9, 25, 10, 26, 11, 27, - // High half. - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -10236,16 +10215,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) @@ -10336,14 +10309,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -10401,14 +10369,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the @@ -10501,14 +10464,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; } // Try to use shift instructions. @@ -10575,18 +10533,9 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10675,22 +10624,9 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 0f483d3d8c1..32c040fc844 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -1756,7 +1756,7 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { ; AVX1-LABEL: uitofp_8i8_to_8f32: ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index fb95138b6ce..b7faafdb503 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1437,7 +1437,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: retq @@ -1552,7 +1552,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; X32-SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X32-SSE41-NEXT: pslld $31, %xmm0 ; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X32-SSE41-NEXT: pslld $31, %xmm1 ; X32-SSE41-NEXT: psrad $31, %xmm1 ; X32-SSE41-NEXT: retl diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 1de75bcf562..4ba4b391ceb 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1701,21 +1701,21 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1 ; AVX1-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1725,21 +1725,21 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1 ; AVX1-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -2443,13 +2443,13 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2 ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX2: # BB#0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2497,13 +2497,13 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3 ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX2: # BB#0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2646,13 +2646,13 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1 ; ; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX2: # BB#0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -2673,13 +2673,13 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1 ; ; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX2: # BB#0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll index 9feac0559e3..27d25c3af2e 100644 --- a/test/CodeGen/X86/vector-zext.ll +++ b/test/CodeGen/X86/vector-zext.ll @@ -137,7 +137,7 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: zext_16i8_to_8i32: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -- 2.11.0