From 8dc93a81b4f88bb94b794a3a773acbe7e9c807ac Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Mon, 28 Jan 2019 15:51:34 +0000
Subject: [PATCH] [x86] allow more shuffle splitting to avoid vpermps (PR40434)

This is tricky to make optimal: sometimes we're better off using a single
wider op, but other times it makes more sense to combine narrow ops to
achieve the same result.

This solves the case from:
https://bugs.llvm.org/show_bug.cgi?id=40434

There's potentially a similar change for vectors with 64-bit elements, but
it needs adjustments similar to rL352333 to avoid creating infinite loops.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352380 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         |   4 +-
 test/CodeGen/X86/vector-shuffle-256-v8.ll  | 133 +++++++----------------------
 test/CodeGen/X86/vector-shuffle-512-v16.ll |   8 +-
 3 files changed, 40 insertions(+), 105 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5047500a672..13216220bd9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14487,8 +14487,10 @@
   if (NumUpperHalves == 1) {
     // AVX2 has efficient 32/64-bit element cross-lane shuffles.
     if (Subtarget.hasAVX2()) {
+      // extract128 + vunpckhps is better than vblend + vpermps.
       // TODO: Refine to account for unary shuffle, splat, and other masks?
-      if (EltWidth == 32 && NumLowerHalves == 1)
+      if (EltWidth == 32 && NumLowerHalves &&
+          HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask))
         return SDValue();
       if (EltWidth == 64)
         return SDValue();
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 1404a1d7e2b..63c65d7abda 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2878,143 +2878,76 @@ entry:
   ret <8 x float> %tmp6
 }
 
-; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
 ; PR40434: https://bugs.llvm.org/show_bug.cgi?id=40434
 define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; AVX1-LABEL: unpckh_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: unpckh_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: unpckh_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,14,3,15,u,u,u,u>
-; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: unpckh_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
   %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i32> %unpckh
 }
 
-; FIXME: Same as above but with floats. AVX1 lowering is better than AVX2 (and AVX512?)
+; Same as above but with floats.
 define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) {
-; AVX1-LABEL: unpckh_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: unpckh_v8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: unpckh_v8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = <2,14,3,15,u,u,u,u>
-; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: unpckh_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
   %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x float> %unpckh
 }
 
-; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
 ; Alternate form of the above - make sure we don't have conflicting transforms.
 define <8 x i32> @blend_perm_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; AVX1-LABEL: blend_perm_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: blend_perm_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: blend_perm_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,14,3,15,u,u,u,u>
-; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: blend_perm_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
   %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
   %r = shufflevector <8 x i32> %unpckh, <8 x i32> undef, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i32> %r
 }
 
-; FIXME: Same as above but with floats. AVX1 lowering is better than AVX2 (and AVX512?)
+; Same as above but with floats.
 define <8 x float> @blend_perm_v8f32(<8 x float> %x, <8 x float> %y) {
-; AVX1-LABEL: blend_perm_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: blend_perm_v8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: blend_perm_v8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = <2,14,3,15,u,u,u,u>
-; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: blend_perm_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
   %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
   %r = shufflevector <8 x float> %unpckh, <8 x float> undef, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x float> %r
 }
 
-; FIXME: AVX1 lowering is better than AVX2/AVX512.
 ; Another variation of the above - make sure we don't have conflicting transforms.
 define <8 x i32> @unpckh_v8i32_unary(<8 x i32> %x) {
-; AVX1-LABEL: unpckh_v8i32_unary:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_v8i32_unary:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_v8i32_unary:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
   %r = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i32> %r
 }
 
-; FIXME: Same as above but with floats. AVX1 lowering is better than AVX2/AVX512.
+; Same as above but with floats.
 define <8 x float> @unpckh_v8f32_unary(<8 x float> %x) {
-; AVX1-LABEL: unpckh_v8f32_unary:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_v8f32_unary:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_v8f32_unary:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
   %r = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x float> %r
 }
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index b7ea04d23df..1c35be04f7f 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -311,12 +311,12 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) {
 ; ALL-LABEL: test_v16i32_0_4_8_12:
 ; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,0,4,u,u,u,u>
-; ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,u,u,u,u,u>
 ; ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-- 
2.11.0
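
For illustration, here is a minimal standalone sketch of the half-mask test
the new condition relies on: splitting proceeds only when the 4 x 32-bit
half-mask is one that a single vunpcklps/vunpckhps can realize, so the split
never loses to the wide vpermps. The function isUnpackMask128 and the driver
are hypothetical stand-ins; the real check is is128BitUnpackShuffleMask() in
X86ISelLowering.cpp, whose signature and matching details may differ.

#include <array>
#include <cstdio>

// Does a 4 x 32-bit half-mask match a 128-bit vunpcklps/vunpckhps pattern?
// A mask element of -1 denotes an undef lane, which matches anything.
static bool isUnpackMask128(const std::array<int, 4> &Mask) {
  const std::array<int, 4> Patterns[4] = {
      {{0, 4, 1, 5}}, {{4, 0, 5, 1}}, // vunpcklps, both operand orders
      {{2, 6, 3, 7}}, {{6, 2, 7, 3}}, // vunpckhps, both operand orders
  };
  for (const auto &P : Patterns) {
    bool Match = true;
    for (int I = 0; I != 4; ++I)
      if (Mask[I] >= 0 && Mask[I] != P[I])
        Match = false;
    if (Match)
      return true;
  }
  return false;
}

int main() {
  // Splitting the PR40434 mask <2,14,3,15,u,u,u,u> across 128-bit halves
  // leaves the half-mask {2,6,3,7}: a vunpckhps, so splitting is profitable.
  std::printf("%d\n", isUnpackMask128({{2, 6, 3, 7}})); // prints 1
  // A strided mask is not an unpack; the wide vpermps lowering is kept.
  std::printf("%d\n", isUnpackMask128({{0, 2, 4, 6}})); // prints 0
  return 0;
}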