From: Simon Pilgrim
Date: Sun, 1 May 2016 16:41:22 +0000 (+0000)
Subject: [InstCombine][AVX2] Combine VPERMD/VPERMPS intrinsics with constant masks to shufflev...
X-Git-Tag: android-x86-7.1-r4~34218
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=95730ae9bd0d2f3b7c6e0cbcb7bea58245cff790;p=android-x86%2Fexternal-llvm.git

[InstCombine][AVX2] Combine VPERMD/VPERMPS intrinsics with constant masks to shufflevector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@268199 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 10be4b9877c..307bb9f9806 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -681,6 +681,37 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
   return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
 }
 
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  auto *V = dyn_cast<ConstantDataVector>(II.getArgOperand(1));
+  if (!V)
+    return nullptr;
+
+  VectorType *VecTy = cast<VectorType>(II.getType());
+  unsigned Size = VecTy->getNumElements();
+  assert(Size == 8 && "Unexpected shuffle mask size");
+
+  // Initialize the resulting shuffle mask to all zeroes.
+  uint32_t Indexes[8] = {0};
+
+  for (unsigned I = 0; I < Size; ++I) {
+    Constant *COp = V->getAggregateElement(I);
+    if (!COp || !isa<ConstantInt>(COp))
+      return nullptr;
+
+    APInt Index = cast<ConstantInt>(COp)->getValue();
+    Index = Index.getLoBits(3);
+    Indexes[I] = (uint32_t)Index.getZExtValue();
+  }
+
+  auto ShuffleMask =
+      ConstantDataVector::get(II.getContext(), makeArrayRef(Indexes, Size));
+  auto V1 = II.getArgOperand(0);
+  auto V2 = UndefValue::get(VecTy);
+  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
 /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
 /// source vectors, unless a zero bit is set. If a zero bit is set,
 /// then ignore that half of the mask and clear that half of the vector.
@@ -1751,6 +1782,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return replaceInstUsesWith(*II, V);
     break;
 
+  case Intrinsic::x86_avx2_permd:
+  case Intrinsic::x86_avx2_permps:
+    if (Value *V = simplifyX86vpermv(*II, *Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_avx_vperm2f128_pd_256:
   case Intrinsic::x86_avx_vperm2f128_ps_256:
   case Intrinsic::x86_avx_vperm2f128_si_256:
diff --git a/test/Transforms/InstCombine/x86-avx2.ll b/test/Transforms/InstCombine/x86-avx2.ll
index ef6d4e6d22d..8d1fd89f642 100644
--- a/test/Transforms/InstCombine/x86-avx2.ll
+++ b/test/Transforms/InstCombine/x86-avx2.ll
@@ -2,12 +2,11 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; FIXME: Verify that instcombine is able to fold identity shuffles.
+; Verify that instcombine is able to fold identity shuffles.
 
 define <8 x i32> @identity_test_vpermd(<8 x i32> %a0) {
 ; CHECK-LABEL: @identity_test_vpermd(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A:%.*]]0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
-; CHECK-NEXT:    ret <8 x i32> [[A]]
+; CHECK-NEXT:    ret <8 x i32> %a0
 ;
   %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
   ret <8 x i32> %a
@@ -15,20 +14,19 @@ define <8 x i32> @identity_test_vpermd(<8 x i32> %a0) {
 
 define <8 x float> @identity_test_vpermps(<8 x float> %a0) {
 ; CHECK-LABEL: @identity_test_vpermps(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A:%.*]]0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
-; CHECK-NEXT:    ret <8 x float> [[A]]
+; CHECK-NEXT:    ret <8 x float> %a0
 ;
   %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
   ret <8 x float> %a
 }
 
-; FIXME: Instcombine should be able to fold the following shuffle to a builtin shufflevector
+; Instcombine should be able to fold the following shuffle to a builtin shufflevector
 ; with a shuffle mask of all zeroes.
 
 define <8 x i32> @zero_test_vpermd(<8 x i32> %a0) {
 ; CHECK-LABEL: @zero_test_vpermd(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A:%.*]]0, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer)
   ret <8 x i32> %a
@@ -36,19 +34,19 @@ define <8 x i32> @zero_test_vpermd(<8 x i32> %a0) {
 
 define <8 x float> @zero_test_vpermps(<8 x float> %a0) {
 ; CHECK-LABEL: @zero_test_vpermps(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A:%.*]]0, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x float> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
   ret <8 x float> %a
 }
 
-; FIXME: Verify that instcombine is able to fold constant shuffles.
+; Verify that instcombine is able to fold constant shuffles.
 
 define <8 x i32> @shuffle_test_vpermd(<8 x i32> %a0) {
 ; CHECK-LABEL: @shuffle_test_vpermd(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A:%.*]]0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
-; CHECK-NEXT:    ret <8 x i32> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
   ret <8 x i32> %a
@@ -56,8 +54,8 @@ define <8 x i32> @shuffle_test_vpermd(<8 x i32> %a0) {
 
 define <8 x float> @shuffle_test_vpermps(<8 x float> %a0) {
 ; CHECK-LABEL: @shuffle_test_vpermps(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A:%.*]]0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
-; CHECK-NEXT:    ret <8 x float> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
   ret <8 x float> %a
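
Background on why the fold is safe: VPERMD/VPERMPS consult only the low 3 bits
of each 32-bit mask element, so with a constant mask every output lane reads a
fixed source lane and the intrinsic is exactly a shufflevector. The following
standalone C++ sketch models those lane semantics; it is illustration only, not
code from this patch, and emulateVpermd is a hypothetical name.

// Minimal sketch of VPERMD lane semantics, assuming (per the transform above)
// that only the low 3 bits of each 32-bit index element participate.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint32_t, 8> emulateVpermd(const std::array<uint32_t, 8> &Src,
                                             const std::array<uint32_t, 8> &Mask) {
  std::array<uint32_t, 8> Out{};
  for (int I = 0; I < 8; ++I)
    Out[I] = Src[Mask[I] & 7]; // bits [2:0] select the lane, as in Index.getLoBits(3)
  return Out;
}

int main() {
  const std::array<uint32_t, 8> Src = {10, 11, 12, 13, 14, 15, 16, 17};
  // A constant mask with junk in the high bits: 0xFFFFFFFB & 7 == 3, so lane 4
  // still reads Src[3]. Once the mask is constant, the whole operation is the
  // fixed shuffle <7, 6, 5, 4, 3, 2, 1, 0>.
  const std::array<uint32_t, 8> Mask = {7, 6, 5, 4, 0xFFFFFFFB, 2, 1, 0};
  const std::array<uint32_t, 8> Expected = {17, 16, 15, 14, 13, 12, 11, 10};
  assert(emulateVpermd(Src, Mask) == Expected);
  return 0;
}

This is the reason the patch can simply truncate each constant index with
Index.getLoBits(3) before building the shufflevector mask instead of bailing
out on out-of-range indices.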