From 5dbe64e2bc2e4b96654703e85f909536df7ddb84 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 19 Oct 2012 23:05:40 +0000 Subject: [PATCH] Vectorizer: Add support for loop reductions. For example: for (i=0; i DefaultVectorizationFactor("default-loop-vectorize-width", cl::init(4), cl::Hidden, cl::desc("Set the default loop vectorization width")); - namespace { +// Forward declaration. +class LoopVectorizationLegality; + /// Vectorize a simple loop. This class performs the widening of simple single /// basic block loops into vectors. It does not perform any /// vectorization-legality checks, and just does it. It widens the vectors @@ -67,23 +71,28 @@ public: SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), - Builder(Se->getContext()), Induction(0), OldInduction(0) { } + Builder(0), Induction(0), OldInduction(0) { } + + ~SingleBlockLoopVectorizer() { + delete Builder; + } // Perform the actual loop widening (vectorization). - void vectorize() { + void vectorize(LoopVectorizationLegality *Legal) { ///Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); /// Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); + /// Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); // register the new loop. cleanup(); - } + } private: /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); /// Copy and widen the instructions from the old loop. - void vectorizeLoop(); + void vectorizeLoop(LoopVectorizationLegality *Legal); /// Insert the new loop to the loop hierarchy and pass manager. void cleanup(); @@ -113,6 +122,10 @@ private: /// broadcast them into a vector. Value *getVectorValue(Value *V); + /// Get a uniform vector of constant integers. We use this to get + /// vectors of ones and zeros for the reduction code. + Constant* getUniformVector(unsigned Val, Type* ScalarTy); + typedef DenseMap ValueMap; /// The original loop. @@ -127,10 +140,21 @@ private: unsigned VF; // The builder that we use - IRBuilder<> Builder; + IRBuilder<> *Builder; // --- Vectorization state --- + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + /// The new Induction variable which was added to the new block. PHINode *Induction; /// The induction variable of the old basic block. @@ -146,7 +170,23 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): - TheLoop(Lp), SE(Se), DL(Dl) { } + TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } + + /// This represents the kinds of reductions that we support. + enum ReductionKind { + IntegerAdd, /// Sum of numbers. + IntegerMult, /// Product of numbers. + NoReduction /// Not a reduction. + }; + + // Holds a pairing of reduction instruction and the reduction kind. + typedef std::pair ReductionPair; + + /// ReductionList contains the reduction variables + /// as well as a single EXIT (from the block) value and the kind of + /// reduction variable.. + /// Notice that the EXIT instruction can also be the PHI itself. + typedef DenseMap ReductionList; /// Returns the maximum vectorization factor that we *can* use to vectorize /// this loop. This does not mean that it is profitable to vectorize this @@ -154,6 +194,12 @@ public: /// can vectorize to any SIMD width below this number. unsigned getLoopMaxVF(); + /// Returns the Induction variable. + PHINode *getInduction() {return Induction;} + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -164,12 +210,32 @@ private: // Example: Alloca, Global, NoAlias. bool isIdentifiedSafeObject(Value* Val); + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Checks if a constant matches the reduction kind. + /// Sums starts with zero. Products start at one. + bool isReductionConstant(Value *V, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; + + // --- vectorization state --- // + + /// Holds the induction variable. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet AllowedExit; }; struct LoopVectorize : public LoopPass { @@ -184,6 +250,7 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { + // Only vectorize innermost loops. if (!L->empty()) return false; @@ -209,7 +276,7 @@ struct LoopVectorize : public LoopPass { // If we decided that is is *legal* to vectorizer the loop. Do it. SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); - LB.vectorize(); + LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; @@ -218,6 +285,7 @@ struct LoopVectorize : public LoopPass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { LoopPass::getAnalysisUsage(AU); AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); } @@ -237,10 +305,10 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); // Insert the value into a new vector. - Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); + Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, - "broadcast"); + Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros, + "broadcast"); // We are accessing the induction variable. Make sure to promote the // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. if (V == Induction) @@ -265,7 +333,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder.CreateAdd(Val, Cv, "induction"); + return Builder->CreateAdd(Val, Cv, "induction"); } @@ -297,10 +365,11 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { } Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. ValueMap::iterator it = WidenMap.find(V); if (it != WidenMap.end()) - return it->second; + return it->second; // Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); @@ -308,6 +377,17 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { return B; } +Constant* +SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { + SmallVector Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF; ++i) + Indices.push_back(ConstantInt::get(ScalarTy, Val)); + + // Add the consecutive indices to the vector value. + return ConstantVector::get(Indices); +} + void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. @@ -360,18 +440,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Value *Op = Params[op]; // Param is a vector. Need to extract the right lane. if (Op->getType()->isVectorTy()) - Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); + Op = Builder->CreateExtractElement(Op, Builder->getInt32(i)); Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + Builder->Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. if (!IsVoidRetTy) - VecResults = Builder.CreateInsertElement(VecResults, Cloned, - Builder.getInt32(i)); + VecResults = Builder->CreateInsertElement(VecResults, Cloned, + Builder->getInt32(i)); } if (!IsVoidRetTy) @@ -417,16 +497,15 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { assert(BypassBlock && "Invalid loop structure"); BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), - "vector.body"); + "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), - "middle.block"); + "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); - + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = Orig->getHeader(); OldInduction = dyn_cast(OldBasicBlock->begin()); @@ -435,10 +514,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder = new IRBuilder<>(VecBody); + Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. - Induction = Builder.CreatePHI(IdxTy, 2, "index"); + Induction = Builder->CreatePHI(IdxTy, 2, "index"); Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); @@ -489,12 +569,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { MiddleBlock->getTerminator()->eraseFromParent(); // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); + Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next"); Induction->addIncoming(Zero, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); + Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown); + Builder->CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); @@ -504,7 +584,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { OldInduction->setIncomingValue(BlockIdx, CountRoundDown); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); // Register the new loop. Loop* Lp = new Loop(); @@ -518,22 +598,52 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); } + + // Save the state. + LoopMiddleBlock = MiddleBlock; + LoopExitBlock = ExitBlock; + LoopVectorBody = VecBody; + LoopScalarBody = OldBasicBlock; + LoopBypassBlock = BypassBlock; } -void SingleBlockLoopVectorizer::vectorizeLoop() { +void +SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + typedef SmallVector PhiVector; BasicBlock &BB = *Orig->getHeader(); + // In order to support reduction variables we need to be able to vectorize + // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two + // steages. First, we create a new vector PHI node with no incoming edges. + // We use this value when we vectorize all of the instructions that use the + // PHI. Next, after all of the instructions in the block are complete we + // add the new incoming edges to the PHI. At this point all of the + // instructions in the basic block are vectorized, so we can use them to + // construct the PHI. + PhiVector PHIsToFix; + // For each instruction in the old loop. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *Inst = it; switch (Inst->getOpcode()) { - case Instruction::PHI: case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - + case Instruction::PHI:{ + PHINode* P = cast(Inst); + // Special handling for the induction var. + if (OldInduction == Inst) + continue; + // This is phase I of vectorizing PHIs. + // This has to be a reduction variable. + assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = Builder->CreatePHI(VecTy, 2, "vec.phi"); + PHIsToFix.push_back(P); + continue; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -557,15 +667,17 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B); break; } case Instruction::Select: { // Widen selects. + // TODO: If the selector is loop invariant we can issue a select + // instruction with a scalar condition. Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder.CreateSelect(A, B, C); + WidenMap[Inst] = Builder->CreateSelect(A, B, C); break; } @@ -577,9 +689,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); if (FCmp) - WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B); else - WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B); break; } @@ -600,10 +712,10 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Ptr = Builder->Insert(Gep2); + Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); - Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + Builder->CreateStore(Val, Ptr)->setAlignment(Alignment); break; } case Instruction::Load: { @@ -624,9 +736,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder.CreateLoad(Ptr); + Ptr = Builder->Insert(Gep2); + Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder->CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. WidenMap[Inst] = LI; @@ -648,7 +760,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { CastInst *CI = dyn_cast(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy); break; } @@ -658,6 +770,102 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { break; }// end of switch. }// end of for_each instr. + + // At this point every instruction in the original loop is widended to + // a vector form. We are almost done. Now, we need to fix the PHI nodes + // that we vectorized. The PHI nodes are currently empty because we did + // not want to introduce cycles. Notice that the remaining PHI nodes + // that we need to fix are reduction variables. + + // Create the 'reduced' values for each of the induction vars. + // The reduced values are the vector values that we scalarize and combine + // after the loop is finished. + for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); + it != e; ++it) { + PHINode *RdxPhi = *it; + PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); + assert(RdxPhi && "Unable to recover vectorized PHI"); + + // Find the reduction variable. + assert(Legal->getReductionVars()->count(RdxPhi) && + "Unable to find the reduction variable"); + LoopVectorizationLegality::ReductionPair ReductionVar = + (*Legal->getReductionVars())[RdxPhi]; + + // This is the vector-clone of the value that leaves the loop. + Value *VectorExit = getVectorValue(ReductionVar.first); + Type *VecTy = VectorExit->getType(); + + // This is the kind of reduction. + LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second; + // Find the reduction identity variable. + // Zero for addition. One for Multiplication. + unsigned IdentitySclr = + (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1); + Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType()); + + // Fix the vector-loop phi. + // We created the induction variable so we know that the + // preheader is the first entry. + BasicBlock *VecPreheader = Induction->getIncomingBlock(0); + VecRdxPhi->addIncoming(Identity, VecPreheader); + unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + VecRdxPhi->addIncoming(Val, LoopVectorBody); + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder->SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + + // This PHINode contains the vectorized reduction variable, or + // the identity vector, if we bypass the vector loop. + PHINode *NewPhi = Builder->CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + NewPhi->addIncoming(Identity, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody); + + // Extract the first scalar. + Value *Scalar0 = + Builder->CreateExtractElement(NewPhi, Builder->getInt32(0)); + // Extract and sum the remaining vector elements. + for (unsigned i=1; i < VF; ++i) { + Value *Scalar1 = + Builder->CreateExtractElement(NewPhi, Builder->getInt32(i)); + if (RdxKind == LoopVectorizationLegality::IntegerAdd) { + Scalar0 = Builder->CreateAdd(Scalar0, Scalar1); + } else { + Scalar0 = Builder->CreateMul(Scalar0, Scalar1); + } + } + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first); + }// end of for each redux variable. } void SingleBlockLoopVectorizer::cleanup() { @@ -710,31 +918,35 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { ValueVector Reads; ValueVector Writes; - SmallPtrSet AnalyzedPtrs; - unsigned NumPhis = 0; for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; PHINode *Phi = dyn_cast(I); if (Phi) { - NumPhis++; + // This should not happen because the loop should be normalized. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // We only look at integer phi nodes. if (!Phi->getType()->isIntegerTy()) { DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); return false; } - - // If we found an induction variable. - if (NumPhis > 1) { - DEBUG(dbgs() << "LV: Found more than one PHI.\n"); - return false; + if (AddReductionVar(Phi, IntegerAdd)) { + DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); + continue; } - - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + if (AddReductionVar(Phi, IntegerMult)) { + DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + continue; + } + if (Induction) { + DEBUG(dbgs() << "LV: Found too many PHIs.\n"); return false; } + // Found the induction variable. + Induction = Phi; // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); @@ -751,7 +963,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); return false; } - } + }// end of PHI handling // If this is a load, record its pointer. If it is not a load, abort. // Notice that we don't handle function calls that read or write. @@ -764,8 +976,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } Value* Ptr = Ld->getPointerOperand(); - if (AnalyzedPtrs.insert(Ptr)) - GetUnderlyingObjects(Ptr, Reads, DL); + GetUnderlyingObjects(Ptr, Reads, DL); } // Record store pointers. Abort on all other instructions that write to @@ -779,8 +990,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } Value* Ptr = St->getPointerOperand(); - if (AnalyzedPtrs.insert(Ptr)) - GetUnderlyingObjects(St->getPointerOperand(), Writes, DL); + GetUnderlyingObjects(Ptr, Writes, DL); } // We still don't handle functions. @@ -797,21 +1007,26 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); return false; } - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator it = I->use_begin(), e = I->use_end(); - it != e; ++it) { - Instruction *U = cast(*it); - BasicBlock *Parent = U->getParent(); - if (Parent != &BB) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (!AllowedExit.count(I)) + //Check that all of the users of the loop are inside the BB. + for (Value::use_iterator it = I->use_begin(), e = I->use_end(); + it != e; ++it) { + Instruction *U = cast(*it); + // This user may be a reduction exit value. + BasicBlock *Parent = U->getParent(); + if (Parent != &BB) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return false; + } } } // next instr. - if (NumPhis != 1) { - DEBUG(dbgs() << "LV: Did not find a Phi node.\n"); - return false; + if (!Induction) { + DEBUG(dbgs() << "LV: Did not find an induction var.\n"); + return false; } // Check that the underlying objects of the reads and writes are either @@ -866,6 +1081,110 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { return A->hasNoAliasAttr(); } +bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, + ReductionKind Kind) { + if (Phi->getNumIncomingValues() != 2) + return false; + + // Find the possible incoming reduction variable. + BasicBlock *BB = Phi->getParent(); + int SelfEdgeIdx = Phi->getBasicBlockIndex(BB); + int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. + Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); + + // We must have a constant that starts the reduction. + if (!isReductionConstant(RdxStart, Kind)) + return false; + + // ExitInstruction is the single value which is used outside the loop. + // We only allow for a single reduction value to be used outside the loop. + // This includes users of the reduction, variables (which form a cycle + // which ends in the phi node). + Instruction *ExitInstruction = 0; + + // Iter is our iterator. We start with the PHI node and scan for all of the + // users of this instruction. All users must be instructions which can be + // used as reduction variables (such as ADD). We may have a single + // out-of-block user. They cycle must end with the original PHI. + // Also, we can't have multiple block-local users. + Instruction *Iter = Phi; + while (true) { + // Any reduction instr must be of one of the allowed kinds. + if (!isReductionInstr(Iter, Kind)) + return false; + + // Did we found a user inside this block ? + bool FoundInBlockUser = false; + // Did we reach the initial PHI node ? + bool FoundStartPHI = false; + // For each of the *users* of iter. + for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); + it != e; ++it) { + Instruction *U = cast(*it); + // We already know that the PHI is a user. + if (U == Phi) { + FoundStartPHI = true; + continue; + } + // Check if we found the exit user. + BasicBlock *Parent = U->getParent(); + if (Parent != BB) { + // We must have a single exit instruction. + if (ExitInstruction != 0) + return false; + ExitInstruction = Iter; + } + // We can't have multiple inside users. + if (FoundInBlockUser) + return false; + FoundInBlockUser = true; + Iter = U; + } + + // We found a reduction var if we have reached the original + // phi node and we only have a single instruction with out-of-loop + // users. + if (FoundStartPHI && ExitInstruction) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); + // Mark this as a reduction var. + Reductions[Phi] = std::make_pair(ExitInstruction, Kind); + return true; + } + } +} + +bool +LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) { + ConstantInt *CI = dyn_cast(V); + if (!CI) + return false; + if (Kind == IntegerMult && CI->isOne()) + return true; + if (Kind == IntegerAdd && CI->isZero()) + return true; + return false; +} + +bool +LoopVectorizationLegality::isReductionInstr(Instruction *I, + ReductionKind Kind) { + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + // possibly. + return true; + case Instruction::Add: + case Instruction::Sub: + return Kind == IntegerAdd; + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return Kind == IntegerMult; + } +} + } // namespace char LoopVectorize::ID = 0; @@ -880,6 +1199,5 @@ namespace llvm { Pass *createLoopVectorizePass() { return new LoopVectorize(); } - } diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll index 4e9e6f940ec..6fb1792b2c8 100644 --- a/test/Transforms/LoopVectorize/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -202,9 +202,8 @@ define void @example8(i32 %x) nounwind uwtable ssp { ret void } -; We can't vectorize because it has a reduction variable. ;CHECK: @example9 -;CHECK-NOT: <4 x i32> +;CHECK: phi <4 x i32> ;CHECK: ret i32 define i32 @example9() nounwind uwtable readonly ssp { br label %1 diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll deleted file mode 100644 index e944a9af92d..00000000000 --- a/test/Transforms/LoopVectorize/increment.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.8.0" - -@a = common global [2048 x i32] zeroinitializer, align 16 - -; This is the loop. -; for (i=0; i -;CHECK: add <4 x i32> -;CHECK: store <4 x i32> -;CHECK: ret void -define void @inc(i32 %n) nounwind uwtable noinline ssp { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0, %.lr.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv - %3 = load i32* %2, align 4 - %4 = trunc i64 %indvars.iv to i32 - %5 = add nsw i32 %3, %4 - store i32 %5, i32* %2, align 4 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph, %0 - ret void -} diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll new file mode 100644 index 00000000000..257f661a634 --- /dev/null +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -0,0 +1,122 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @reduction_sum +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: ret i32 +define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_prod +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = mul i32 %prod.02, %6 + %8 = mul i32 %7, %3 + %9 = mul i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ] + ret i32 %prod.0.lcssa +} + +;CHECK: @reduction_mix +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = mul nsw i32 %5, %3 + %7 = trunc i64 %indvars.iv to i32 + %8 = add i32 %sum.02, %7 + %9 = add i32 %8, %6 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_bad +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %3, %6 + %8 = add i32 %7, %5 + %9 = mul i32 %8, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} -- 2.11.0