}
}
+/// For every loop recorded in LoopInfo, moves the instructions reported by
+/// findLoopInvariantInstructions() into the loop's pre-header, placing them
+/// just ahead of the pre-header's final instruction.
+void Cfg::loopInvariantCodeMotion() {
+  TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
+  // Hoists into an existing predecessor only; no CfgNode is created here.
+  for (auto &LoopEntry : LoopInfo) {
+    // LoopEntry.first is the index of the loop header within Nodes.
+    CfgNode *Header = Nodes[LoopEntry.first];
+    assert(Header);
+    if (Header->getLoopNestDepth() < 1)
+      continue;
+
+    // The pre-header is the single predecessor whose loop nest depth is
+    // exactly one less than the header's. If a second such predecessor shows
+    // up, the loop is skipped: handling it would require inserting a node.
+    CfgNode *PreHeader = nullptr;
+    for (auto *Pred : Header->getInEdges()) {
+      if (Pred->getLoopNestDepth() != Header->getLoopNestDepth() - 1)
+        continue;
+      if (PreHeader != nullptr) {
+        PreHeader = nullptr;
+        break;
+      }
+      PreHeader = Pred;
+    }
+    if (PreHeader == nullptr)
+      continue; // to next loop
+    auto &PreHeaderInsts = PreHeader->getInsts();
+    if (PreHeaderInsts.size() == 0)
+      continue; // to next loop
+
+    // Detach the final instruction so the hoisted ones are appended before
+    // it, then re-append it so it stays last.
+    auto &Tail = PreHeaderInsts.back();
+    PreHeaderInsts.pop_back();
+    for (auto *Hoisted : findLoopInvariantInstructions(LoopEntry.first))
+      PreHeader->appendInst(Hoisted);
+    PreHeader->appendInst(&Tail);
+  }
+}
+
+/// Collects the instructions of the loop headed by node LoopHeaderIndex whose
+/// Variable operands are all invariant, unlinking each one from its node as
+/// it is found. Function arguments are invariant to begin with; the dest of
+/// every collected instruction becomes invariant too. The result is ordered
+/// by ascending instruction number.
+Ice::CfgVector<Inst *>
+Cfg::findLoopInvariantInstructions(Ice::SizeT LoopHeaderIndex) {
+  // Kinds that are never hoisted, whatever their operands are.
+  const auto IsPinned = [](Inst &I) -> bool {
+    switch (I.getKind()) {
+    case Inst::InstKind::Alloca:
+    case Inst::InstKind::Br:
+    case Inst::InstKind::Ret:
+    case Inst::InstKind::Phi:
+    case Inst::InstKind::Call:
+    case Inst::InstKind::IntrinsicCall:
+    case Inst::InstKind::Load:
+    case Inst::InstKind::Store:
+    case Inst::InstKind::Switch:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  CfgUnorderedSet<Inst *> HoistSet;
+  CfgUnorderedSet<Variable *> KnownInvariant;
+  for (auto *Arg : getArgs())
+    KnownInvariant.insert(Arg);
+
+  // Keep sweeping the loop's nodes until a full sweep finds nothing new.
+  for (bool Changed = true; Changed;) {
+    Changed = false;
+    for (auto NodeIndex : LoopInfo[LoopHeaderIndex]) {
+      auto *Node = Nodes[NodeIndex];
+      // Walk a snapshot of the instruction list: instructions found to be
+      // invariant are removed from the node's own list further down.
+      CfgVector<std::reference_wrapper<Inst>> Snapshot(
+          Node->getInsts().begin(), Node->getInsts().end());
+
+      for (auto &Ref : Snapshot) {
+        auto &Candidate = Ref.get();
+        if (Candidate.isDeleted() || HoistSet.count(&Candidate) != 0)
+          continue;
+        if (IsPinned(Candidate))
+          continue;
+
+        // Only Variable operands are looked up; any other operand kind
+        // never blocks hoisting.
+        bool AllSrcsInvariant = true;
+        for (SizeT I = 0; I < Candidate.getSrcSize(); ++I) {
+          auto *SrcVar = llvm::dyn_cast<Variable>(Candidate.getSrc(I));
+          if (SrcVar != nullptr && KnownInvariant.count(SrcVar) == 0)
+            AllSrcsInvariant = false;
+        }
+        if (!AllSrcsInvariant)
+          continue;
+
+        Changed = true;
+        HoistSet.insert(&Candidate);
+        Node->getInsts().remove(Candidate);
+        if (Candidate.getDest() != nullptr)
+          KnownInvariant.insert(Candidate.getDest());
+      }
+    }
+  }
+
+  // The set has no defined order, so sort by instruction number.
+  CfgVector<Inst *> Ordered(HoistSet.begin(), HoistSet.end());
+  std::sort(Ordered.begin(), Ordered.end(), [](Inst *Lhs, Inst *Rhs) {
+    return Lhs->getNumber() < Rhs->getNumber();
+  });
+  return Ordered;
+}
+
void Cfg::shortCircuitJumps() {
// Split Nodes whenever an early jump is possible.
// __N :
getTarget()->addEpilog(Node);
}
-void Cfg::computeLoopNestDepth() {
+// Builds a LoopAnalyzer for this Cfg and stores the result of its
+// getLoopInfo() in the LoopInfo member.
+// NOTE: the timer tag below still uses the previous computeLoopNestDepth name.
+void Cfg::generateLoopInfo() {
TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
- LoopAnalyzer LA(this);
- LA.computeLoopNestDepth();
+ LoopInfo = LoopAnalyzer(this).getLoopInfo();
}
// This is a lightweight version of live-range-end calculation. Marks the last
void shuffleNodes();
void localCSE();
void shortCircuitJumps();
+ void loopInvariantCodeMotion();
/// Scan allocas to determine whether we need to use a frame pointer.
/// If SortAndCombine == true, merge all the fixed-size allocas in the
void doNopInsertion();
void genCode();
void genFrame();
- void computeLoopNestDepth();
+ void generateLoopInfo();
void livenessLightweight();
void liveness(LivenessMode Mode);
bool validateLiveness() const;
uint32_t CombinedAlignment, InstList &Insts,
AllocaBaseVariableType BaseVariableType);
void findRematerializable();
+ CfgVector<Inst *> findLoopInvariantInstructions(SizeT LoopHeaderIndex);
GlobalContext *Ctx;
uint32_t SequenceNumber; /// output order for emission
/// Globals required by this CFG. Mostly used for the profiler's globals.
std::unique_ptr<VariableDeclarationList> GlobalInits;
CfgVector<InstJumpTable *> JumpTables;
-
+ /// Loop table assigned by generateLoopInfo(): maps a loop header's node
+ /// index to the indices (into Nodes) of the nodes recorded for that loop.
+ CfgUnorderedMap<SizeT, CfgVector<SizeT>> LoopInfo;
/// CurrentNode is maintained during dumping/emitting just for validating
/// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but
/// before global operations like register allocation, resetCurrentNode()
X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters", \
cl::desc("Number of times local-cse is run on a block"), cl::init(2)) \
\
+ X(LoopInvariantCodeMotion, bool, dev_opt_flag, "licm", \
+ cl::desc("Hoist loop invariant arithmetic operations"), cl::init(false)) \
+ \
X(LogFilename, std::string, dev_opt_flag, "log", \
cl::desc("Set log filename"), cl::init("-"), cl::value_desc("filename")) \
\
// Create the LoopNodes from the function's CFG
for (CfgNode *Node : Nodes)
AllNodes.emplace_back(Node);
+ computeLoopNestDepth();
}
void LoopAnalyzer::computeLoopNestDepth() {
if (*It == &Node) {
(*It)->setDeleted();
++NumDeletedNodes;
+ CfgVector<SizeT> LoopNodes;
+ for (auto LoopIter = It.base() - 1; LoopIter != LoopStack.end();
+ ++LoopIter) {
+ LoopNodes.push_back((*LoopIter)->getNode()->getIndex());
+ }
+ Loops[(*It)->getNode()->getIndex()] = LoopNodes;
LoopStack.erase(It.base() - 1, LoopStack.end());
break;
}
// is bounded linear. ncbray suggests another algorithm which is linear in
// practice but not bounded linear. I think it also finds dominators.
// http://lenx.100871.net/papers/loop-SAS.pdf
- void computeLoopNestDepth();
+ /// Returns a copy of Loops, the map from a loop header's node index to the
+ /// node indices collected for that loop by computeLoopNestDepth().
+ CfgUnorderedMap<SizeT, CfgVector<SizeT>> getLoopInfo() { return Loops; }
private:
+ void computeLoopNestDepth();
using IndexT = uint32_t;
static constexpr IndexT UndefinedIndex = 0;
static constexpr IndexT FirstDefinedIndex = 1;
void incrementLoopNestDepth();
bool hasSelfEdge() const;
+ /// Returns the stored CfgNode pointer, BB.
+ CfgNode *getNode() { return BB; }
+
private:
CfgNode *BB;
NodeList::const_iterator Succ;
/// The number of nodes which have been marked deleted. This is used to track
/// when the iteration should end.
LoopNodePtrList::size_type NumDeletedNodes = 0;
+ /// Detailed loop information
+ CfgUnorderedMap<SizeT, CfgVector<SizeT>> Loops;
};
} // end of namespace Ice
Func->processAllocas(SortAndCombineAllocas);
Func->dump("After Alloca processing");
+ // Run this early so it can be used to focus optimizations on potentially hot
+ // code.
+ // TODO(stichnot,ascull): currently only used for regalloc not
+ // expensive high level optimizations which could be focused on potentially
+ // hot code.
+ Func->generateLoopInfo();
+ Func->dump("After loop analysis");
+ if (getFlags().getLoopInvariantCodeMotion()) {
+ Func->loopInvariantCodeMotion();
+ Func->dump("After LICM");
+ }
+
if (getFlags().getEnableExperimental()) {
Func->localCSE();
Func->dump("After Local CSE");
Func->dump("After Phi lowering");
}
- // Run this early so it can be used to focus optimizations on potentially hot
- // code.
- // TODO(stichnot,ascull): currently only used for regalloc not
- // expensive high level optimizations which could be focused on potentially
- // hot code.
- Func->computeLoopNestDepth();
- Func->dump("After loop nest depth analysis");
-
// Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt();
// don't go further. Alternatively (?), never consider a transformation that
// would change a variable that is currently *not* live across basic block
// boundaries into one that *is*.
- if (Func->getVMetadata()->isMultiBlock(
- NewAddr.Base) /* || Base->getUseCount() > 1*/)
- return nullptr;
-
+ if (!getFlags().getLoopInvariantCodeMotion()) {
+ // Need multi block address opt when licm is enabled.
+ // Might make sense to restrict to current node and loop header.
+ if (Func->getVMetadata()->isMultiBlock(
+ NewAddr.Base) /* || Base->getUseCount() > 1*/)
+ return nullptr;
+ }
AddressOptimizer AddrOpt(Func);
const bool MockBounds = getFlags().getMockBoundsCheck();
const Inst *Reason = nullptr;
X(llvmConvert) \
X(loadOpt) \
X(localCse) \
+ X(loopInvariantCodeMotion) \
X(lowerPhiAssignments) \
X(materializeVectorShuffles) \
X(parse) \
--- /dev/null
+; Tests if the licm flag successfully hoists the add from loop0 to entry
+
+; Both runs also pass the default prefix, so the shared label/ret directives
+; at the bottom of this file are actually matched rather than silently skipped.
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
+; RUN: -O2 -licm | FileCheck --check-prefix ENABLE --check-prefix CHECK %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
+; RUN: -O2 | FileCheck --check-prefix NOENABLE --check-prefix CHECK %s
+
+define internal void @dummy() {
+entry:
+ ret void
+}
+define internal i32 @test_licm(i32 %a32, i32 %b, i32 %c) {
+entry:
+ %a = trunc i32 %a32 to i1
+ br label %loop0
+loop0: ; <-+
+ call void @dummy() ; |
+ %add1 = add i32 %b, %c ; |
+ br label %loop1 ; |
+loop1: ; |
+ br i1 %a, label %loop0, label %out ; --+
+out:
+ ret i32 %add1
+}
+
+; CHECK-LABEL: test_licm
+
+; ENABLE: add
+; ENABLE: call
+
+; NOENABLE: call
+; NOENABLE-NEXT: mov
+; NOENABLE-NEXT: add
+
+; CHECK: ret
\ No newline at end of file
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: left:
ret void
}
-; CHECK-LABEL: After loop nest depth analysis
+; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: body: