/// non-null and can be used to continue processing the "top" of the
/// post-order walk.
LazyCallGraph::SCC *UpdatedC;
+
+ /// A hacky area where the inliner can retain history about inlining
+ /// decisions that mutated the call graph's SCC structure in order to avoid
+ /// infinite inlining. See the comments in the inliner's CG update logic.
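+ /// Each entry is a (caller node, SCC) pair recording that inlining an
+ /// internal call edge from that node split the SCC apart.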
+ ///
+ /// FIXME: Keeping this here seems like a big layering issue; we should look
+ /// for a better technique.
+ SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::SCC *>, 4>
+ &InlinedInternalEdges;
};
/// \brief The core module pass which does a post-order walk of the SCCs and
SmallPtrSet<LazyCallGraph::RefSCC *, 4> InvalidRefSCCSet;
SmallPtrSet<LazyCallGraph::SCC *, 4> InvalidSCCSet;
- CGSCCUpdateResult UR = {RCWorklist, CWorklist, InvalidRefSCCSet,
- InvalidSCCSet, nullptr, nullptr};
+ SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::SCC *>, 4>
+ InlinedInternalEdges;
+
+ CGSCCUpdateResult UR = {RCWorklist, CWorklist, InvalidRefSCCSet,
+ InvalidSCCSet, nullptr, nullptr,
+ InlinedInternalEdges};
PreservedAnalyses PA = PreservedAnalyses::all();
CG.buildRefSCCs();
// invalid SCC and RefSCCs respectively. But we will short circuit
// the processing when we check them in the loop above.
} while (UR.UpdatedC);
-
} while (!CWorklist.empty());
+
+ // We only need to keep internal inlined edge information within a RefSCC;
+ // clear it to save space and let the next visit to any of these functions
+ // have a fresh start.
+ InlinedInternalEdges.clear();
} while (!RCWorklist.empty());
}
InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory))
continue;
+ // Check whether inlining this call site would re-split an SCC that
+ // inlining has already split apart once before. In that case, inlining
+ // here may trigger infinite inlining, much like what the InlineHistory
+ // above prevents within the inliner itself, but spread across CGSCC
+ // iterations and thus hidden from the full inline history. Only internal
+ // SCC edges, where the callee is still in the caller's SCC, are checked.
+ if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
+ UR.InlinedInternalEdges.count({&N, C})) {
+ DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+ "previously split out of this SCC by inlining: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
+ continue;
+ }
+
// Check whether we want to inline this callsite.
if (!shouldInline(CS, GetInlineCost, ORE))
continue;
for (LazyCallGraph::Edge &E : *CalleeN)
RC->insertTrivialRefEdge(N, E.getNode());
}
- InlinedCallees.clear();
// At this point, since we have made changes we have at least removed
// a call instruction. However, in the process we do some incremental
// simplification of the surrounding code. This simplification can
// essentially do all of the same things as a function pass and we can
// re-use the exact same logic for updating the call graph to reflect the
- // change..
+ // change.
+ LazyCallGraph::SCC *OldC = C;
C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
+
+ // If this causes an SCC to split apart into multiple smaller SCCs, there
+ // is a subtle risk we need to prepare for. Other transformations may
+ // expose an "infinite inlining" opportunity later, and because of the SCC
+ // mutation, we will revisit this function and potentially re-inline. If
+ // we do, and that re-inlining also has the potential to mutate the SCC
+ // structure, the infinite inlining problem can manifest through infinite
+ // SCC splits and merges. To avoid this, we capture the originating caller
+ // node and the SCC containing the call edge. This is a slight
+ // over-approximation of the possible inlining decisions that must be
+ // avoided, but is relatively efficient to store.
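+ // The recorded pairs are consulted by the skip check above before inlining
+ // further internal SCC edges, and the set is cleared by the CGSCC pass
+ // manager once it finishes processing the current RefSCC.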
+ // FIXME: This seems like a very heavyweight way of retaining the inline
+ // history; we should look for a more efficient way of tracking it.
+ if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
+ return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
+ })) {
+ DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+ "retaining this to avoid infinite inlining.\n");
+ UR.InlinedInternalEdges.insert({&N, OldC});
+ }
+ InlinedCallees.clear();
}
// Now that we've finished inlining all of the calls across this SCC, delete
--- /dev/null
+; This test contains extremely tricky call graph structures for the inliner to
+; handle correctly. They form cycles where the inliner introduces code that
+; either immediately, or after further transformation, turns back into the
+; original code. Each such step changes the call graph and so will trigger
+; iteration. This requires some out-of-band way to prevent infinitely
+; re-inlining and re-transforming the code.
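+; The inliner's CGSCC update logic handles this by recording which caller
+; nodes have had an internal call edge inlined in a way that split their SCC,
+; and refusing to repeat such inlining; the incremented count arguments below
+; observe where that cutoff happens.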
+;
+; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -S | FileCheck %s
+
+
+; The `test1_*` collection of functions forms a directly cycling pattern.
+
+define void @test1_a(i8** %ptr) {
+; CHECK-LABEL: define void @test1_a(
+entry:
+ call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; over and over again. However, each inlining increments the count, and so we
+; expect this test case to stop after one round of inlining with a final
+; argument of '1'.
+; CHECK-NOT: call
+; CHECK: call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 1)
+; CHECK-NOT: call
+
+ ret void
+}
+
+define void @test1_b(i8* %arg, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test1_b(
+entry:
+ %a = alloca i8*
+ store i8* %arg, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg, i8** %[[A]]
+
+ br i1 %flag, label %bb1, label %bb2
+
+bb1:
+ call void @test1_a(i8** %a) noinline
+ br label %bb2
+
+bb2:
+ %cast = bitcast i8** %a to void (i8*, i1, i32)**
+ %p = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %cast
+ %inline_count_inc = add i32 %inline_count, 1
+ call void %p(i8* %arg, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+ ret void
+}
+
+define void @test2_a(i8** %ptr) {
+; CHECK-LABEL: define void @test2_a(
+entry:
+ call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; but only after doing two rounds of inlining, first from @test2_b and then from
+; @test2_c. We check the exact number of inlining rounds before we cut off to
+; break the cycle by inspecting the last parameter, which gets incremented with
+; each inlined function body.
+; CHECK-NOT: call
+; CHECK: call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 2)
+; CHECK-NOT: call
+ ret void
+}
+
+define void @test2_b(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_b(
+entry:
+ %a = alloca i8*
+ store i8* %arg2, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg2, i8** %[[A]]
+
+ br i1 %flag, label %bb1, label %bb2
+
+bb1:
+ call void @test2_a(i8** %a) noinline
+ br label %bb2
+
+bb2:
+ %p = load i8*, i8** %a
+ %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+ %inline_count_inc = add i32 %inline_count, 1
+ call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+ ret void
+}
+
+define void @test2_c(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_c(
+entry:
+ %a = alloca i8*
+ store i8* %arg1, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg1, i8** %[[A]]
+
+ br i1 %flag, label %bb1, label %bb2
+
+bb1:
+ call void @test2_a(i8** %a) noinline
+ br label %bb2
+
+bb2:
+ %p = load i8*, i8** %a
+ %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+ %inline_count_inc = add i32 %inline_count, 1
+ call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+ ret void
+}