From cb2522448c5c6303dde50efe186ae67b27aef5b0 Mon Sep 17 00:00:00 2001 From: Yi Jiang Date: Tue, 2 Sep 2014 21:00:39 +0000 Subject: [PATCH] Generate extract for in-tree uses if the use is scalar operand in vectorized instruction. radar://18144665 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216946 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 87 +++++++++++++++++----- .../SLPVectorizer/X86/extract_in_tree_user.ll | 70 +++++++++++++++++ 2 files changed, 139 insertions(+), 18 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index bbf3e03854b..65987626edd 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -342,6 +342,33 @@ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, } } +/// \returns True if in-tree use also needs extract. This refers to +/// possible scalar operand in vectorized instruction. +static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, + TargetLibraryInfo *TLI) { + + unsigned Opcode = UserInst->getOpcode(); + switch (Opcode) { + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(UserInst); + return (LI->getPointerOperand() == Scalar); + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(UserInst); + return (SI->getPointerOperand() == Scalar); + } + case Instruction::Call: { + CallInst *CI = cast<CallInst>(UserInst); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + if (hasVectorInstrinsicScalarOpd(ID, 1)) { + return (CI->getArgOperand(1) == Scalar); + } + } + default: + return false; + } +} + /// Bottom Up SLP Vectorizer. class BoUpSLP { public: @@ -864,18 +891,27 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); - // Skip in-tree scalars that become vectors. 
- if (ScalarToTreeEntry.count(U)) { - DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << - *U << ".\n"); - int Idx = ScalarToTreeEntry[U]; (void) Idx; - assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); - continue; - } Instruction *UserInst = dyn_cast<Instruction>(U); if (!UserInst) continue; + // Skip in-tree scalars that become vectors + if (ScalarToTreeEntry.count(U)) { + int Idx = ScalarToTreeEntry[U]; + TreeEntry *UseEntry = &VectorizableTree[Idx]; + Value *UseScalar = UseEntry->Scalars[0]; + // Some in-tree scalars will remain as scalar in vectorized + // instructions. If that is the case, the one in Lane 0 will + // be used. + if (UseScalar != U || + !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U + << ".\n"); + assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + continue; + } + } + // Ignore users in the user ignore list. if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != UserIgnoreList.end()) @@ -1190,16 +1226,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } } - // We combine only GEPs with a single use. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast<Instruction>(VL[j])->getNumUses() > 1) { - DEBUG(dbgs() << "SLP: not-vectorizable GEP (multiple uses).\n"); - BS.cancelScheduling(VL); - newTreeEntry(VL, false); - return; - } - } - // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = cast<GetElementPtrInst>(VL0)->getOperand(0)->getType(); @@ -2023,6 +2049,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo(AS)); + + // The pointer operand uses an in-tree scalar so we add the new BitCast to + // ExternalUses list to make sure that an extract will be generated in the + // future. 
+ if (ScalarToTreeEntry.count(LI->getPointerOperand())) + ExternalUses.push_back( + ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); + unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); if (!Alignment) @@ -2047,6 +2081,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + + // The pointer operand uses an in-tree scalar so we add the new BitCast to + // ExternalUses list to make sure that an extract will be generated in the + // future. + if (ScalarToTreeEntry.count(SI->getPointerOperand())) + ExternalUses.push_back( + ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); + if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); S->setAlignment(Alignment); @@ -2088,6 +2130,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E->Scalars); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; + Value *ScalarArg = nullptr; if (CI && (FI = CI->getCalledFunction())) { IID = (Intrinsic::ID) FI->getIntrinsicID(); } @@ -2098,6 +2141,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // a scalar. This argument should not be vectorized. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { CallInst *CEI = cast<CallInst>(E->Scalars[0]); + ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; } @@ -2116,6 +2160,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; Function *CF = Intrinsic::getDeclaration(M, ID, Tys); Value *V = Builder.CreateCall(CF, OpVecs); + + // The scalar argument uses an in-tree scalar so we add the new vectorized + // call to ExternalUses list to make sure that an extract will be + // generated in the future. 
+ if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) + ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); + E->VectorizedValue = V; ++NumVectorInstructions; return V; diff --git a/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll new file mode 100644 index 00000000000..3628042cdb4 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -0,0 +1,70 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +@a = common global i64* null, align 8 + +; Function Attrs: nounwind ssp uwtable +define i32 @fn1() { +entry: + %0 = load i64** @a, align 8 + %add.ptr = getelementptr inbounds i64* %0, i64 11 + %1 = ptrtoint i64* %add.ptr to i64 + store i64 %1, i64* %add.ptr, align 8 + %add.ptr1 = getelementptr inbounds i64* %0, i64 56 + %2 = ptrtoint i64* %add.ptr1 to i64 + %arrayidx2 = getelementptr inbounds i64* %0, i64 12 + store i64 %2, i64* %arrayidx2, align 8 + ret i32 undef +; CHECK-LABEL: @fn1( +; CHECK: extractelement <2 x i64*> +; CHECK: ret +} + + +declare float @llvm.powi.f32(float, i32) +define void @fn2(i32* %a, i32* %b, float* %c) { +entry: + %i0 = load i32* %a, align 4 + %i1 = load i32* %b, align 4 + %add1 = add i32 %i0, %i1 + %fp1 = sitofp i32 %add1 to float + %call1 = tail call float @llvm.powi.f32(float %fp1,i32 %add1) nounwind readnone + + %arrayidx2 = getelementptr inbounds i32* %a, i32 1 + %i2 = load i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32* %b, i32 1 + %i3 = load i32* %arrayidx3, align 4 + %add2 = add i32 %i2, %i3 + %fp2 = sitofp i32 %add2 to float + %call2 = tail call float @llvm.powi.f32(float %fp2,i32 %add1) nounwind readnone + + %arrayidx4 = getelementptr inbounds i32* %a, i32 2 + %i4 = load i32* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds i32* %b, i32 2 + %i5 = load i32* %arrayidx5, align 4 + 
%add3 = add i32 %i4, %i5 + %fp3 = sitofp i32 %add3 to float + %call3 = tail call float @llvm.powi.f32(float %fp3,i32 %add1) nounwind readnone + + %arrayidx6 = getelementptr inbounds i32* %a, i32 3 + %i6 = load i32* %arrayidx6, align 4 + %arrayidx7 = getelementptr inbounds i32* %b, i32 3 + %i7 = load i32* %arrayidx7, align 4 + %add4 = add i32 %i6, %i7 + %fp4 = sitofp i32 %add4 to float + %call4 = tail call float @llvm.powi.f32(float %fp4,i32 %add1) nounwind readnone + + store float %call1, float* %c, align 4 + %arrayidx8 = getelementptr inbounds float* %c, i32 1 + store float %call2, float* %arrayidx8, align 4 + %arrayidx9 = getelementptr inbounds float* %c, i32 2 + store float %call3, float* %arrayidx9, align 4 + %arrayidx10 = getelementptr inbounds float* %c, i32 3 + store float %call4, float* %arrayidx10, align 4 + ret void + +; CHECK-LABEL: @fn2( +; CHECK: extractelement <4 x i32> +; CHECK: ret +} -- 2.11.0