/// non-null and can be used to continue processing the "top" of the
/// post-order walk.
LazyCallGraph::SCC *UpdatedC;
+
+ /// A hacky area where the inliner can retain history about inlining
+ /// decisions that mutated the call graph's SCC structure in order to avoid
+ /// infinite inlining. See the comments in the inliner's CG update logic.
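+ /// Each entry is a (caller node, SCC) pair recording that inlining an
+ /// internal call edge from that node split the SCC apart.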
+ ///
+ /// FIXME: Keeping this here seems like a big layering issue; we should look
+ /// for a better technique.
+ SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::SCC *>, 4>
+ &InlinedInternalEdges;
};
/// \brief The core module pass which does a post-order walk of the SCCs and
SmallPtrSet<LazyCallGraph::RefSCC *, 4> InvalidRefSCCSet;
SmallPtrSet<LazyCallGraph::SCC *, 4> InvalidSCCSet;
- CGSCCUpdateResult UR = {RCWorklist, CWorklist, InvalidRefSCCSet,
- InvalidSCCSet, nullptr, nullptr};
+ SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::SCC *>, 4>
+ InlinedInternalEdges;
+
+ CGSCCUpdateResult UR = {RCWorklist, CWorklist, InvalidRefSCCSet,
+ InvalidSCCSet, nullptr, nullptr,
+ InlinedInternalEdges};
PreservedAnalyses PA = PreservedAnalyses::all();
CG.buildRefSCCs();
// invalid SCC and RefSCCs respectively. But we will short circuit
// the processing when we check them in the loop above.
} while (UR.UpdatedC);
-
} while (!CWorklist.empty());
+
+ // We only need to keep internal inlined edge information within a RefSCC;
+ // clear it to save space and let the next visit to any of these functions
+ // have a fresh start.
+ InlinedInternalEdges.clear();
} while (!RCWorklist.empty());
}
InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory))
continue;
+ // Check whether inlining this call site would re-split an SCC that
+ // inlining has already split apart once before. In that case, inlining
+ // here may trigger infinite inlining, much like what the InlineHistory
+ // above prevents within the inliner itself, but spread across CGSCC
+ // iterations and thus hidden from the full inline history. Only internal
+ // SCC edges, where the callee is still in the caller's SCC, are checked.
+ if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
+ UR.InlinedInternalEdges.count({&N, C})) {
+ DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+ "previously split out of this SCC by inlining: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
+ continue;
+ }
+
// Check whether we want to inline this callsite.
if (!shouldInline(CS, GetInlineCost, ORE))
continue;
for (LazyCallGraph::Edge &E : *CalleeN)
RC->insertTrivialRefEdge(N, E.getNode());
}
- InlinedCallees.clear();
// At this point, since we have made changes we have at least removed
// a call instruction. However, in the process we do some incremental
// simplification of the surrounding code. This simplification can
// essentially do all of the same things as a function pass and we can
// re-use the exact same logic for updating the call graph to reflect the
- // change..
+ // change.
+ LazyCallGraph::SCC *OldC = C;
C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
+
+ // If this causes an SCC to split apart into multiple smaller SCCs, there
+ // is a subtle risk we need to prepare for. Other transformations may
+ // expose an "infinite inlining" opportunity later, and because of the SCC
+ // mutation, we will revisit this function and potentially re-inline. If
+ // we do, and that re-inlining also has the potential to mutate the SCC
+ // structure, the infinite inlining problem can manifest through infinite
+ // SCC splits and merges. To avoid this, we capture the originating caller
+ // node and the SCC containing the call edge. This is a slight
+ // over-approximation of the possible inlining decisions that must be
+ // avoided, but is relatively efficient to store.
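+ // The recorded pairs are consulted by the skip check above before inlining
+ // further internal SCC edges, and the set is cleared by the CGSCC pass
+ // manager once it finishes processing the current RefSCC.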
+ // FIXME: This seems like a very heavyweight way of retaining the inline
+ // history; we should look for a more efficient way of tracking it.
+ if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
+ return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
+ })) {
+ DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+ "retaining this to avoid infinite inlining.\n");
+ UR.InlinedInternalEdges.insert({&N, OldC});
+ }
+ InlinedCallees.clear();
}
// Now that we've finished inlining all of the calls across this SCC, delete
--- /dev/null
+; This test contains extremely tricky call graph structures for the inliner to
+; handle correctly. They form cycles where the inliner introduces code that
+; either immediately, or after further transformation, turns back into the
+; original code. Each such step changes the call graph and so will trigger
+; iteration. This requires some out-of-band way to prevent infinitely
+; re-inlining and re-transforming the code.
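+; The inliner's CGSCC update logic handles this by recording which caller
+; nodes have had an internal call edge inlined in a way that split their SCC,
+; and refusing to repeat such inlining; the incremented count arguments below
+; observe where that cutoff happens.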
+;
+; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -S | FileCheck %s
+
+
+; The `test1_*` collection of functions forms a directly cycling pattern.
+
+define void @test1_a(i8** %ptr) {
+; CHECK-LABEL: define void @test1_a(
+entry:
+ call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; over and over again. However, each inlining increments the count, and so we
+; expect this test case to stop after one round of inlining with a final
+; argument of '1'.
+; CHECK-NOT: call
+; CHECK: call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 1)
+; CHECK-NOT: call
+
+ ret void
+}
+
+define void @test1_b(i8* %arg, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test1_b(
+entry:
+ %a = alloca i8*
+ store i8* %arg, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg, i8** %[[A]]
+
+ br i1 %flag, label %bb1, label %bb2
+
+bb1:
+ call void @test1_a(i8** %a) noinline
+ br label %bb2
+
+bb2:
+ %cast = bitcast i8** %a to void (i8*, i1, i32)**
+ %p = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %cast
+ %inline_count_inc = add i32 %inline_count, 1
+ call void %p(i8* %arg, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+ ret void
+}
+
+define void @test2_a(i8** %ptr) {
+; CHECK-LABEL: define void @test2_a(
+entry:
+ call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; but only after doing two rounds of inlining, first from @test2_b and then from
+; @test2_c. We check the exact number of inlining rounds before we cut off to
+; break the cycle by inspecting the last parameter, which gets incremented with
+; each inlined function body.
+; CHECK-NOT: call
+; CHECK: call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 2)
+; CHECK-NOT: call
+ ret void
+}
+
+define void @test2_b(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_b(
+entry:
+ %a = alloca i8*
+ store i8* %arg2, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg2, i8** %[[A]]
+
+ br i1 %flag, label %bb1, label %bb2
+
+bb1:
+ call void @test2_a(i8** %a) noinline
+ br label %bb2
+
+bb2:
+ %p = load i8*, i8** %a
+ %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+ %inline_count_inc = add i32 %inline_count, 1
+ call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+ ret void
+}
+
+define void @test2_c(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_c(
+entry:
+ %a = alloca i8*
+ store i8* %arg1, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg1, i8** %[[A]]
+
+ br i1 %flag, label %bb1, label %bb2
+
+bb1:
+ call void @test2_a(i8** %a) noinline
+ br label %bb2
+
+bb2:
+ %p = load i8*, i8** %a
+ %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+ %inline_count_inc = add i32 %inline_count, 1
+ call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+ ret void
+}