From d14347a186058f1105e87b13f6dd7704ade9687e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 21 Feb 2017 15:09:00 +0000
Subject: [PATCH] [X86][SSE] Prefer to combine shuffles to VZEXT over
 VZEXT_MOVL.

This matches what is already done during shuffle lowering and helps
prevent the need for a zero-vector in cases where shuffles match both
patterns.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295723 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp                 | 18 +++++++++---------
 test/CodeGen/X86/vector-shuffle-combining-sse41.ll | 25 +++++--------------------
 2 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8a9a8fa06bf..3aecdc269b5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26387,15 +26387,6 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
 
-  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
-  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
-      isUndefOrEqual(Mask[0], 0) &&
-      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
-    Shuffle = X86ISD::VZEXT_MOVL;
-    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
-    return true;
-  }
-
   // Match against a VZEXT instruction.
   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
@@ -26421,6 +26412,15 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+      isUndefOrEqual(Mask[0], 0) &&
+      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+    Shuffle = X86ISD::VZEXT_MOVL;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    return true;
+  }
+
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 9352e90952d..29e2124a168 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -8,31 +8,16 @@
 
 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
 
-; FIXME: We can avoid the zero vector generation if we use PMOVZX instead
 define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
 ; SSE-LABEL: combine_vpshufb_as_movzx:
 ; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: combine_vpshufb_as_movzx:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vpshufb_as_movzx:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: combine_vpshufb_as_movzx:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512F-NEXT: retq
+; AVX-LABEL: combine_vpshufb_as_movzx:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
 }
-- 
2.11.0
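
Illustrative note, not part of the applied patch: a minimal LLVM IR sketch, with a hypothetical function name and mask constant, of a shuffle that matches both patterns this change reorders. Widened to dwords, the pshufb mask below is <0, zero, undef, undef>; that fits the VZEXT_MOVL pattern (element 0 kept, the remaining lanes zero or undef) and also the VZEXT pattern (each low element zero-extended into a wider lane). Preferring VZEXT lets such a shuffle lower directly to pmovzxdq, with no pxor needed to materialise a zero vector for a blend.

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

; Hypothetical example: keep bytes 0-3, zero bytes 4-7 (mask bytes with the
; high bit set produce zero), and leave the upper eight bytes undef.
define <16 x i8> @movzx_style_pshufb(<16 x i8> %a0) {
  %res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0,
           <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1,
                      i8 undef, i8 undef, i8 undef, i8 undef,
                      i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %res
}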