From: Kevin Qin Date: Mon, 29 Sep 2014 11:15:00 +0000 (+0000) Subject: Use a loop to simplify the runtime unrolling prologue. X-Git-Tag: android-x86-7.1-r4~57074 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=dbaeb6e7cbc50d41c67e8e2c72b52f58cc551979;p=android-x86%2Fexternal-llvm.git Use a loop to simplify the runtime unrolling prologue. Runtime unrolling creates a prologue to execute the extra iterations that remain when the trip count is not divisible by the unroll factor. It generates an if-then-else sequence to jump into a loop body that is unrolled (factor - 1) times, like extraiters = tripcount % loopfactor if (extraiters == 0) jump Loop: if (extraiters == loopfactor) jump L1 if (extraiters == loopfactor-1) jump L2 ... L1: LoopBody; L2: LoopBody; ... if tripcount < loopfactor jump End Loop: ... End: This means that if the unroll factor is 4, the loop body is copied 7 times: 3 copies are in the loop prologue and 4 are in the loop. This commit uses a loop to execute the extra iterations in the prologue, like extraiters = tripcount % loopfactor if (extraiters == 0) jump Loop: else jump Prol Prol: LoopBody; extraiters -= 1 // Omitted if unroll factor is 2. if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. if (tripcount < loopfactor) jump End Loop: ... End: Then, when the unroll factor is 4, the loop body is copied only 5 times: 1 copy in the prologue loop and 4 in the original loop. If the unroll factor is 2, no new loop is created, just as in the original solution. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218604 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a96c46ad63e..4241fcaa880 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -57,7 +58,7 @@ STATISTIC(NumRuntimeUnrolled, static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &LVMap, Pass *P) { + ValueToValueMapTy &VMap, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -86,7 +87,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, Value *V = PN->getIncomingValueForBlock(Latch); if (Instruction *I = dyn_cast(V)) { if (L->contains(I)) { - V = LVMap[I]; + V = VMap[I]; } } // Adding a value to the new PHI node from the last prolog block @@ -127,76 +128,122 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, } /// Create a clone of the blocks in a loop and connect them together. -/// This function doesn't create a clone of the loop structure. +/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new +/// loop will be created including all cloned blocks, and the iterator of it +/// switches to count NewIter down to 0. /// -/// There are two value maps that are defined and used. VMap is -/// for the values in the current loop instance. LVMap contains -/// the values from the last loop instance. We need the LVMap values -/// to update the initial values for the current loop instance. 
-/// -static void CloneLoopBlocks(Loop *L, - bool FirstCopy, - BasicBlock *InsertTop, - BasicBlock *InsertBot, +static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, + BasicBlock *InsertTop, BasicBlock *InsertBot, std::vector &NewBlocks, - LoopBlocksDFS &LoopBlocks, - ValueToValueMapTy &VMap, - ValueToValueMapTy &LVMap, + LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, LoopInfo *LI) { - BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); Function *F = Header->getParent(); LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); + Loop *NewLoop = 0; + Loop *ParentLoop = L->getParentLoop(); + if (!UnrollProlog) { + NewLoop = new Loop(); + if (ParentLoop) + ParentLoop->addChildLoop(NewLoop); + else + LI->addTopLevelLoop(NewLoop); + } + // For each block in the original loop, create a new copy, // and update the value map with the newly created values. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { - BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F); + BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F); NewBlocks.push_back(NewBB); - if (Loop *ParentLoop = L->getParentLoop()) + if (NewLoop) + NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + else if (ParentLoop) ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); VMap[*BB] = NewBB; if (Header == *BB) { // For the first block, add a CFG connection to this newly - // created block + // created block. InsertTop->getTerminator()->setSuccessor(0, NewBB); - // Change the incoming values to the ones defined in the - // previously cloned loop. 
- for (BasicBlock::iterator I = Header->begin(); isa(I); ++I) { - PHINode *NewPHI = cast(VMap[I]); - if (FirstCopy) { - // We replace the first phi node with the value from the preheader - VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); - NewBB->getInstList().erase(NewPHI); - } else { - // Update VMap with values from the previous block - unsigned idx = NewPHI->getBasicBlockIndex(Latch); - Value *InVal = NewPHI->getIncomingValue(idx); - if (Instruction *I = dyn_cast(InVal)) - if (L->contains(I)) - InVal = LVMap[InVal]; - NewPHI->setIncomingValue(idx, InVal); - NewPHI->setIncomingBlock(idx, InsertTop); - } - } } - if (Latch == *BB) { + // For the last block, if UnrollProlog is true, create a direct jump to + // InsertBot. If not, create a loop back to cloned head. VMap.erase((*BB)->getTerminator()); - NewBB->getTerminator()->eraseFromParent(); - BranchInst::Create(InsertBot, NewBB); + BasicBlock *FirstLoopBB = cast(VMap[Header]); + BranchInst *LatchBR = cast(NewBB->getTerminator()); + if (UnrollProlog) { + LatchBR->eraseFromParent(); + BranchInst::Create(InsertBot, NewBB); + } else { + PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter", + FirstLoopBB->getFirstNonPHI()); + IRBuilder<> Builder(LatchBR); + Value *IdxSub = + Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".sub"); + Value *IdxCmp = + Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp"); + BranchInst::Create(FirstLoopBB, InsertBot, IdxCmp, NewBB); + NewIdx->addIncoming(NewIter, InsertTop); + NewIdx->addIncoming(IdxSub, NewBB); + LatchBR->eraseFromParent(); + } } } - // LastValueMap is updated with the values for the current loop - // which are used the next time this function is called. - for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); - VI != VE; ++VI) { - LVMap[VI->first] = VI->second; + + // Change the incoming values to the ones defined in the preheader or + // cloned loop. 
+ for (BasicBlock::iterator I = Header->begin(); isa(I); ++I) { + PHINode *NewPHI = cast(VMap[I]); + if (UnrollProlog) { + VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); + cast(VMap[Header])->getInstList().erase(NewPHI); + } else { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + BasicBlock *NewLatch = cast(VMap[Latch]); + idx = NewPHI->getBasicBlockIndex(Latch); + Value *InVal = NewPHI->getIncomingValue(idx); + NewPHI->setIncomingBlock(idx, NewLatch); + if (VMap[InVal]) + NewPHI->setIncomingValue(idx, VMap[InVal]); + } + } + if (NewLoop) { + // Add unroll disable metadata to disable future unrolling for this loop. + SmallVector Vals; + // Reserve first location for self reference to the LoopID metadata node. + Vals.push_back(nullptr); + MDNode *LoopID = NewLoop->getLoopID(); + if (LoopID) { + // First remove any existing loop unrolling metadata. + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + } + } + + LLVMContext &Context = NewLoop->getHeader()->getContext(); + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + Vals.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, Vals); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + NewLoop->setLoopID(NewLoopID); } } @@ -212,18 +259,16 @@ static void CloneLoopBlocks(Loop *L, /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. 
/// -/// extraiters = tripcount % loopfactor -/// if (extraiters == 0) jump Loop: -/// if (extraiters == loopfactor) jump L1 -/// if (extraiters == loopfactor-1) jump L2 -/// ... -/// L1: LoopBody; -/// L2: LoopBody; -/// ... -/// if tripcount < loopfactor jump End -/// Loop: -/// ... -/// End: +/// extraiters = tripcount % loopfactor +/// if (extraiters == 0) jump Loop: +/// else jump Prol +/// Prol: LoopBody; +/// extraiters -= 1 // Omitted if unroll factor is 2. +/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. +/// if (tripcount < loopfactor) jump End +/// Loop: +/// ... +/// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, LPPassManager *LPM) { @@ -284,26 +329,21 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // Check if for no extra iterations, then jump to unrolled loop. We have to - // check that the trip count computation didn't overflow when adding one to - // the backedge taken count. + // Check if for no extra iterations, then jump to cloned/unrolled loop. + // We have to check that the trip count computation didn't overflow when + // adding one to the backedge taken count. 
Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod"); Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow"); Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or"); - // Branch to either the extra iterations or the unrolled loop + // Branch to either the extra iterations or the cloned/unrolled loop // We will fix up the true branch label when adding loop body copies BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); - - ValueToValueMapTy LVMap; Function *F = Header->getParent(); - // These variables are used to update the CFG links in each iteration - BasicBlock *CompareBB = nullptr; - BasicBlock *LastLoopBB = PH; // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); @@ -314,62 +354,34 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // - for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { - std::vector NewBlocks; - ValueToValueMapTy VMap; - - // Clone all the basic blocks in the loop, but we don't clone the loop - // This function adds the appropriate CFG connections. - CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, - LoopBlocks, VMap, LVMap, LI); - LastLoopBB = cast(VMap[Latch]); - - // Insert the cloned blocks into function just before the original loop - F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), - NewBlocks[0], F->end()); - - // Generate the code for the comparison which determines if the loop - // prolog code needs to be executed. 
- if (leftOverIters == Count-1) { - // There is no compare block for the fall-thru case when for the last - // left over iteration - CompareBB = NewBlocks[0]; - } else { - // Create a new block for the comparison - BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", - F, CompareBB); - if (Loop *ParentLoop = L->getParentLoop()) { - // Add the new block to the parent loop, if needed - ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); - } - - // The comparison w/ the extra iteration value and branch - Type *CountTy = TripCount->getType(); - Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, - ConstantInt::get(CountTy, leftOverIters), - "un.tmp"); - // Branch to either the extra iterations or the unrolled loop - BranchInst::Create(NewBlocks[0], CompareBB, - BranchVal, NewBB); - CompareBB = NewBB; - PH->getTerminator()->setSuccessor(0, NewBB); - VMap[NewPH] = CompareBB; - } - - // Rewrite the cloned instruction operands to use the values - // created when the clone is created. - for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { - for (BasicBlock::iterator I = NewBlocks[i]->begin(), - E = NewBlocks[i]->end(); I != E; ++I) { - RemapInstruction(I, VMap, - RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); - } + std::vector NewBlocks; + ValueToValueMapTy VMap; + + // Clone all the basic blocks in the loop. If Count is 2, we don't clone + // the loop, otherwise we create a cloned loop to execute the extra + // iterations. This function adds the appropriate CFG connections. + CloneLoopBlocks(L, ModVal, Count == 2, PH, PEnd, NewBlocks, LoopBlocks, VMap, + LI); + + // Insert the cloned blocks into function just before the original loop + F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], + F->end()); + + // Rewrite the cloned instruction operands to use the values + // created when the clone is created. 
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { + for (BasicBlock::iterator I = NewBlocks[i]->begin(), + E = NewBlocks[i]->end(); + I != E; ++I) { + RemapInstruction(I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } // Connect the prolog code to the original loop and update the // PHI functions. - ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, + BasicBlock *LastLoopBB = cast(VMap[Latch]); + ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, LPM->getAsPass()); NumRuntimeUnrolled++; return true; diff --git a/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll b/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll index 17c91e5c07b..aae79cbac78 100644 --- a/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll +++ b/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll @@ -41,8 +41,7 @@ for.end: ; preds = %for.body, %entry } ; CHECK-LABEL: @test -; CHECK: unr.cmp{{.*}}: -; CHECK: for.body.unr{{.*}}: +; CHECK: for.body.prol{{.*}}: ; CHECK: for.body: ; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll index a14087dcdce..05d03f2568d 100644 --- a/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/test/Transforms/LoopUnroll/runtime-loop.ll @@ -3,15 +3,16 @@ ; Tests for unrolling loops with run-time trip counts ; CHECK: %xtraiter = and i32 %n -; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0 -; CHECK: %lcmp.overflow = icmp eq i32 %n, 0 -; CHECK: %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod -; CHECK: br i1 %lcmp.or, label %unr.cmp +; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0 +; CHECK: %lcmp.overflow = icmp eq i32 %n, 0 +; CHECK: %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod +; CHECK: br i1 %lcmp.or, label %for.body.prol, label %for.body.preheader.split -; CHECK: unr.cmp{{.*}}: -; CHECK: for.body.unr{{.*}}: -; CHECK: for.body: -; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label 
%for.body +; CHECK: for.body.prol: +; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ] +; CHECK: %prol.iter.sub = sub i32 %prol.iter, 1 +; CHECK: %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0 +; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !0 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { entry: @@ -39,7 +40,7 @@ for.end: ; preds = %for.body, %entry ; even if the -unroll-runtime is specified ; CHECK: for.body: -; CHECK-NOT: for.body.unr: +; CHECK-NOT: for.body.prol: define i32 @test1(i32* nocapture %a) nounwind uwtable readonly { entry: @@ -85,8 +86,8 @@ cond_true138: ; Test run-time unrolling for a loop that counts down by -2. -; CHECK: for.body.unr: -; CHECK: br i1 %cmp.7, label %for.cond.for.end_crit_edge{{.*}}, label %for.body +; CHECK: for.body.prol: +; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly { entry: @@ -113,3 +114,7 @@ for.end: ; preds = %for.cond.for.end_cr %res.0.lcssa = phi i16 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] ret i16 %res.0.lcssa } + +; CHECK: !0 = metadata !{metadata !0, metadata !1} +; CHECK: !1 = metadata !{metadata !"llvm.loop.unroll.disable"} + diff --git a/test/Transforms/LoopUnroll/runtime-loop1.ll b/test/Transforms/LoopUnroll/runtime-loop1.ll index ad99b8cd9c6..5ff75e33f7f 100644 --- a/test/Transforms/LoopUnroll/runtime-loop1.ll +++ b/test/Transforms/LoopUnroll/runtime-loop1.ll @@ -1,11 +1,11 @@ -; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=4 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s ; This tests that setting the unroll count works -; CHECK: unr.cmp: -; CHECK: for.body.unr: +; CHECK: for.body.prol: +; CHECK: br label %for.body.preheader.split ; CHECK: for.body: -; CHECK: br i1 %exitcond.3, label 
%for.end.loopexit{{.*}}, label %for.body +; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { diff --git a/test/Transforms/LoopUnroll/runtime-loop2.ll b/test/Transforms/LoopUnroll/runtime-loop2.ll index cbc7af58ff5..7205c686065 100644 --- a/test/Transforms/LoopUnroll/runtime-loop2.ll +++ b/test/Transforms/LoopUnroll/runtime-loop2.ll @@ -3,8 +3,7 @@ ; Choose a smaller, power-of-two, unroll count if the loop is too large. ; This test makes sure we're not unrolling 'odd' counts -; CHECK: unr.cmp: -; CHECK: for.body.unr: +; CHECK: for.body.prol: ; CHECK: for.body: ; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body