From 810572205c9702e0c42f76e68287e0d97273945a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Feb 2018 23:32:27 +0000 Subject: [PATCH] [X86] Teach lower1BitVectorShuffle to recognize shuffles that are just filling upper elements with zero. Replace with insert_subvector. There's still some extra kshifts in one of the modified test cases here, but hopefully that's only a DAG combine away. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324782 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 31 ++++++++++++++++++++++++++++++- test/CodeGen/X86/avx512-mask-op.ll | 22 ++++++++-------------- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f5000385010..3c0ad7e125f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -14414,8 +14414,36 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, // vector, shuffle and then truncate it back. static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + unsigned NumElts = Mask.size(); + + // Try to recognize shuffles that are just padding a subvector with zeros. + unsigned SubvecElts = 0; + for (int i = 0; i != (int)NumElts; ++i) { + if (Mask[i] >= 0 && Mask[i] != i) + break; + + ++SubvecElts; + } + assert(SubvecElts != NumElts && "Identity shuffle?"); + + // Clip to a power of 2. + SubvecElts = PowerOf2Floor(SubvecElts); + + // Make sure the number of zeroable bits in the top at least covers the bits + // not covered by the subvector. 
+ if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, + V1, DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), + Extract, DAG.getIntPtrConstant(0, DL)); + } + + assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); MVT ExtVT; @@ -14624,7 +14652,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, DAG); if (Is1BitVector) - return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); + return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, + DAG); llvm_unreachable("Unimplemented!"); } diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 40d91356843..44e0a74db2e 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -2757,19 +2757,16 @@ define <8 x i64> @mask_widening(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; KNL-NEXT: kshiftlw $12, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vmovdqa %ymm0, %ymm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k1 ; KNL-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: mask_widening: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; SKX-NEXT: vpmovm2d %k0, %zmm0 -; SKX-NEXT: vmovdqa %ymm0, %ymm0 -; SKX-NEXT: vpmovd2m %zmm0, %k1 +; SKX-NEXT: kmovb %k0, %k1 ; SKX-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; SKX-NEXT: retq ; @@ -2779,10 +2776,9 @@ define <8 x i64> @mask_widening(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i ; AVX512BW-NEXT: ## kill: def $xmm0 killed 
$xmm0 def $zmm0 ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 ; AVX512BW-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -2793,9 +2789,7 @@ define <8 x i64> @mask_widening(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i ; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: kmovb %k0, %k1 ; AVX512DQ-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; AVX512DQ-NEXT: retq entry: -- 2.11.0