From f47d520c1a91bb029d73a5a5f7c0e416ed763f23 Mon Sep 17 00:00:00 2001 From: Manasij Mukherjee Date: Tue, 12 Jul 2016 16:59:17 -0700 Subject: [PATCH] Loop Invariant Code Motion Implemented behind the new -licm flag. Hoists invariant arithmetic instructions from loop bodies to pre-headers. Does not trigger for loops where headers have two incoming edges from outside the loop. Also enables multi block address optimization, because most of the instructions hoisted are address calculations coming from gep. Does not touch memory operations. This algorithm does not seem to work well for load-hoisting. BUG=none R=jpp@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/2138443002 . --- src/IceCfg.cpp | 102 +++++++++++++++++++++++++++- src/IceCfg.h | 6 +- src/IceClFlags.def | 3 + src/IceLoopAnalyzer.cpp | 7 ++ src/IceLoopAnalyzer.h | 7 +- src/IceTargetLoweringX86BaseImpl.h | 31 +++++---- src/IceTimerTree.def | 1 + tests_lit/llvm2ice_tests/licm.ll | 36 ++++++++++ tests_lit/llvm2ice_tests/loop-nest-depth.ll | 18 ++--- 9 files changed, 184 insertions(+), 27 deletions(-) create mode 100644 tests_lit/llvm2ice_tests/licm.ll diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp index f43353309..3a5aa1e64 100644 --- a/src/IceCfg.cpp +++ b/src/IceCfg.cpp @@ -640,6 +640,103 @@ void Cfg::localCSE() { } } +void Cfg::loopInvariantCodeMotion() { + TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this); + // Does not introduce new nodes as of now. + for (auto &Pair : LoopInfo) { + CfgNode *Header = Nodes[Pair.first]; + assert(Header); + if (Header->getLoopNestDepth() < 1) + continue; + CfgNode *PreHeader = nullptr; + for (auto *Pred : Header->getInEdges()) { + if (Pred->getLoopNestDepth() == Header->getLoopNestDepth() - 1) { + if (PreHeader == nullptr) { + PreHeader = Pred; + } else { + PreHeader = nullptr; + break; + // Do not consider cases with two incoming edges. + // Will require insertion of nodes. + } + } + } + if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) { + continue; // to next loop + } + + auto &Insts = PreHeader->getInsts(); + auto &LastInst = Insts.back(); + Insts.pop_back(); + + for (auto *Inst : findLoopInvariantInstructions(Pair.first)) { + PreHeader->appendInst(Inst); + } + PreHeader->appendInst(&LastInst); + } +} + +Ice::CfgVector +Cfg::findLoopInvariantInstructions(Ice::SizeT LoopHeaderIndex) { + CfgUnorderedSet InvariantInsts; + CfgUnorderedSet InvariantVars; + for (auto *Var : getArgs()) { + InvariantVars.insert(Var); + } + bool Changed = false; + do { + Changed = false; + for (auto NodeIndex : LoopInfo[LoopHeaderIndex]) { + auto *Node = Nodes[NodeIndex]; + CfgVector> Insts(Node->getInsts().begin(), + Node->getInsts().end()); + + for (auto &InstRef : Insts) { + auto &Inst = InstRef.get(); + if (Inst.isDeleted() || + InvariantInsts.find(&Inst) != InvariantInsts.end()) + continue; + switch (Inst.getKind()) { + case Inst::InstKind::Alloca: + case Inst::InstKind::Br: + case Inst::InstKind::Ret: + case Inst::InstKind::Phi: + case Inst::InstKind::Call: + case Inst::InstKind::IntrinsicCall: + case Inst::InstKind::Load: + case Inst::InstKind::Store: + case Inst::InstKind::Switch: + continue; + default: + break; + } + + bool IsInvariant = true; + for (SizeT i = 0; i < Inst.getSrcSize(); ++i) { + if (auto *Var = llvm::dyn_cast(Inst.getSrc(i))) { + if (InvariantVars.find(Var) == InvariantVars.end()) { + IsInvariant = false; + } + } + } + if (IsInvariant) { + Changed = true; + InvariantInsts.insert(&Inst); + Node->getInsts().remove(Inst); + if (Inst.getDest() != nullptr) { + InvariantVars.insert(Inst.getDest()); + } + } + } + } + } while (Changed); + + CfgVector InstVector(InvariantInsts.begin(), InvariantInsts.end()); + std::sort(InstVector.begin(), InstVector.end(), + [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); }); + return InstVector; +} + void Cfg::shortCircuitJumps() { // Split Nodes whenever an early jump is possible. // __N : @@ -1338,10 +1435,9 @@ void Cfg::genFrame() { getTarget()->addEpilog(Node); } -void Cfg::computeLoopNestDepth() { +void Cfg::generateLoopInfo() { TimerMarker T(TimerStack::TT_computeLoopNestDepth, this); - LoopAnalyzer LA(this); - LA.computeLoopNestDepth(); + LoopInfo = LoopAnalyzer(this).getLoopInfo(); } // This is a lightweight version of live-range-end calculation. Marks the last diff --git a/src/IceCfg.h b/src/IceCfg.h index 42e6ef121..490d1d065 100644 --- a/src/IceCfg.h +++ b/src/IceCfg.h @@ -202,6 +202,7 @@ public: void shuffleNodes(); void localCSE(); void shortCircuitJumps(); + void loopInvariantCodeMotion(); /// Scan allocas to determine whether we need to use a frame pointer. /// If SortAndCombine == true, merge all the fixed-size allocas in the @@ -215,7 +216,7 @@ public: void doNopInsertion(); void genCode(); void genFrame(); - void computeLoopNestDepth(); + void generateLoopInfo(); void livenessLightweight(); void liveness(LivenessMode Mode); bool validateLiveness() const; @@ -300,6 +301,7 @@ private: uint32_t CombinedAlignment, InstList &Insts, AllocaBaseVariableType BaseVariableType); void findRematerializable(); + CfgVector findLoopInvariantInstructions(SizeT LoopHeaderIndex); GlobalContext *Ctx; uint32_t SequenceNumber; /// output order for emission @@ -330,7 +332,7 @@ private: /// Globals required by this CFG. Mostly used for the profiler's globals. std::unique_ptr GlobalInits; CfgVector JumpTables; - + CfgUnorderedMap> LoopInfo; /// CurrentNode is maintained during dumping/emitting just for validating /// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but /// before global operations like register allocation, resetCurrentNode() diff --git a/src/IceClFlags.def b/src/IceClFlags.def index bd0c20f80..0fabb9cf5 100644 --- a/src/IceClFlags.def +++ b/src/IceClFlags.def @@ -186,6 +186,9 @@ struct dev_list_flag {}; X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters", \ cl::desc("Number of times local-cse is run on a block"), cl::init(2)) \ \ + X(LoopInvariantCodeMotion, bool, dev_opt_flag, "licm", \ + cl::desc("Hoist loop invariant arithmetic operations"), cl::init(false)) \ + \ X(LogFilename, std::string, dev_opt_flag, "log", \ cl::desc("Set log filename"), cl::init("-"), cl::value_desc("filename")) \ \ diff --git a/src/IceLoopAnalyzer.cpp b/src/IceLoopAnalyzer.cpp index a76dfe297..344ffcea6 100644 --- a/src/IceLoopAnalyzer.cpp +++ b/src/IceLoopAnalyzer.cpp @@ -54,6 +54,7 @@ LoopAnalyzer::LoopAnalyzer(Cfg *Fn) : Func(Fn) { // Create the LoopNodes from the function's CFG for (CfgNode *Node : Nodes) AllNodes.emplace_back(Node); + computeLoopNestDepth(); } void LoopAnalyzer::computeLoopNestDepth() { @@ -141,6 +142,12 @@ LoopAnalyzer::processNode(LoopAnalyzer::LoopNode &Node) { if (*It == &Node) { (*It)->setDeleted(); ++NumDeletedNodes; + CfgVector LoopNodes; + for (auto LoopIter = It.base() - 1; LoopIter != LoopStack.end(); + ++LoopIter) { + LoopNodes.push_back((*LoopIter)->getNode()->getIndex()); + } + Loops[(*It)->getNode()->getIndex()] = LoopNodes; LoopStack.erase(It.base() - 1, LoopStack.end()); break; } diff --git a/src/IceLoopAnalyzer.h b/src/IceLoopAnalyzer.h index eaaad9862..73458fa46 100644 --- a/src/IceLoopAnalyzer.h +++ b/src/IceLoopAnalyzer.h @@ -39,9 +39,10 @@ public: // is bounded linear. ncbray suggests another algorithm which is linear in // practice but not bounded linear. I think it also finds dominators. // http://lenx.100871.net/papers/loop-SAS.pdf - void computeLoopNestDepth(); + CfgUnorderedMap> getLoopInfo() { return Loops; } private: + void computeLoopNestDepth(); using IndexT = uint32_t; static constexpr IndexT UndefinedIndex = 0; static constexpr IndexT FirstDefinedIndex = 1; @@ -80,6 +81,8 @@ private: void incrementLoopNestDepth(); bool hasSelfEdge() const; + CfgNode *getNode() { return BB; } + private: CfgNode *BB; NodeList::const_iterator Succ; @@ -110,6 +113,8 @@ private: /// The number of nodes which have been marked deleted. This is used to track /// when the iteration should end. LoopNodePtrList::size_type NumDeletedNodes = 0; + /// Detailed loop information + CfgUnorderedMap> Loops; }; } // end of namespace Ice diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h index daa72147b..435332bbe 100644 --- a/src/IceTargetLoweringX86BaseImpl.h +++ b/src/IceTargetLoweringX86BaseImpl.h @@ -443,6 +443,18 @@ template void TargetX86Base::translateO2() { Func->processAllocas(SortAndCombineAllocas); Func->dump("After Alloca processing"); + // Run this early so it can be used to focus optimizations on potentially hot + // code. + // TODO(stichnot,ascull): currently only used for regalloc not + // expensive high level optimizations which could be focused on potentially + // hot code. + Func->generateLoopInfo(); + Func->dump("After loop analysis"); + if (getFlags().getLoopInvariantCodeMotion()) { + Func->loopInvariantCodeMotion(); + Func->dump("After LICM"); + } + if (getFlags().getEnableExperimental()) { Func->localCSE(); Func->dump("After Local CSE"); @@ -466,14 +478,6 @@ template void TargetX86Base::translateO2() { Func->dump("After Phi lowering"); } - // Run this early so it can be used to focus optimizations on potentially hot - // code. - // TODO(stichnot,ascull): currently only used for regalloc not - // expensive high level optimizations which could be focused on potentially - // hot code. - Func->computeLoopNestDepth(); - Func->dump("After loop nest depth analysis"); - // Address mode optimization. Func->getVMetadata()->init(VMK_SingleDefs); Func->doAddressOpt(); @@ -5415,10 +5419,13 @@ TargetX86Base::computeAddressOpt(const Inst *Instr, Type MemType, // don't go further. Alternatively (?), never consider a transformation that // would change a variable that is currently *not* live across basic block // boundaries into one that *is*. - if (Func->getVMetadata()->isMultiBlock( - NewAddr.Base) /* || Base->getUseCount() > 1*/) - return nullptr; - + if (!getFlags().getLoopInvariantCodeMotion()) { + // Need multi block address opt when licm is enabled. + // Might make sense to restrict to current node and loop header. + if (Func->getVMetadata()->isMultiBlock( + NewAddr.Base) /* || Base->getUseCount() > 1*/) + return nullptr; + } AddressOptimizer AddrOpt(Func); const bool MockBounds = getFlags().getMockBoundsCheck(); const Inst *Reason = nullptr; diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def index d0421759a..0ebcfd3e9 100644 --- a/src/IceTimerTree.def +++ b/src/IceTimerTree.def @@ -42,6 +42,7 @@ X(llvmConvert) \ X(loadOpt) \ X(localCse) \ + X(loopInvariantCodeMotion) \ X(lowerPhiAssignments) \ X(materializeVectorShuffles) \ X(parse) \ diff --git a/tests_lit/llvm2ice_tests/licm.ll b/tests_lit/llvm2ice_tests/licm.ll new file mode 100644 index 000000000..53761bb36 --- /dev/null +++ b/tests_lit/llvm2ice_tests/licm.ll @@ -0,0 +1,36 @@ +; Tests if the licm flag successfully hoists the add from loop0 to entry + +; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \ +; RUN: -O2 -licm | FileCheck --check-prefix ENABLE %s + +; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \ +; RUN: -O2 | FileCheck --check-prefix NOENABLE %s + +define internal void @dummy() { +entry: + ret void +} +define internal i32 @test_licm(i32 %a32, i32 %b, i32 %c) { +entry: + %a = trunc i32 %a32 to i1 + br label %loop0 +loop0: ; <-+ + call void @dummy() ; | + %add1 = add i32 %b, %c ; | + br label %loop1 ; | +loop1: ; | + br i1 %a, label %loop0, label %out ; --+ +out: + ret i32 %add1 +} + +; CHECK-LABEL: test_licm + +; ENABLE: add +; ENABLE: call + +; NOENABLE: call +; NOENABLE-NEXT: mov +; NOENABLE-NEXT: add + +; CHECK: ret \ No newline at end of file diff --git a/tests_lit/llvm2ice_tests/loop-nest-depth.ll b/tests_lit/llvm2ice_tests/loop-nest-depth.ll index 63c9670a9..e43abb945 100644 --- a/tests_lit/llvm2ice_tests/loop-nest-depth.ll +++ b/tests_lit/llvm2ice_tests/loop-nest-depth.ll @@ -20,7 +20,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0: @@ -48,7 +48,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0: @@ -78,7 +78,7 @@ out: ; <---+ ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0: @@ -110,7 +110,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0_0: @@ -148,7 +148,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0_0: @@ -190,7 +190,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0_0: @@ -235,7 +235,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: loop0_0: @@ -269,7 +269,7 @@ out: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: left: @@ -296,7 +296,7 @@ exit: ret void } -; CHECK-LABEL: After loop nest depth analysis +; CHECK-LABEL: After loop analysis ; CHECK-NEXT: entry: ; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: body: -- 2.11.0