[LSR] Limit the recursion for setup cost

author David Green <david.green@arm.com>

Tue, 23 Apr 2019 08:52:21 +0000 (08:52 +0000)

committer David Green <david.green@arm.com>

Tue, 23 Apr 2019 08:52:21 +0000 (08:52 +0000)
author David Green <david.green@arm.com>
Tue, 23 Apr 2019 08:52:21 +0000 (08:52 +0000)
committer David Green <david.green@arm.com>
Tue, 23 Apr 2019 08:52:21 +0000 (08:52 +0000)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp

index 0b11c15..7688601 100644 (file)
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -164,9 +164,9 @@ static cl::opt<unsigned> ComplexityLimit(
    cl::init(std::numeric_limits<uint16_t>::max()),
    cl::desc("LSR search space complexity limit"));
  
-static cl::opt<bool> EnableRecursiveSetupCost(
-  "lsr-recursive-setupcost", cl::Hidden, cl::init(true),
-  cl::desc("Enable more thorough lsr setup cost calculation"));
+static cl::opt<unsigned> SetupCostDepthLimit(
+    "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+    cl::desc("The limit on recursion depth for LSRs setup cost"));
  
  #ifndef NDEBUG
  // Stress test IV chain generation.
@@ -1212,22 +1212,23 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                   bool HasBaseReg, int64_t Scale,
                                   Instruction *Fixup = nullptr);
  
-static unsigned getSetupCost(const SCEV *Reg) {
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
    if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
      return 1;
-  if (!EnableRecursiveSetupCost)
+  if (Depth == 0)
      return 0;
    if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
-    return getSetupCost(S->getStart());
+    return getSetupCost(S->getStart(), Depth - 1);
    if (auto S = dyn_cast<SCEVCastExpr>(Reg))
-    return getSetupCost(S->getOperand());
+    return getSetupCost(S->getOperand(), Depth - 1);
    if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
      return std::accumulate(S->op_begin(), S->op_end(), 0,
-                           [](unsigned i, const SCEV *Reg) {
-                             return i + getSetupCost(Reg);
+                           [&](unsigned i, const SCEV *Reg) {
+                             return i + getSetupCost(Reg, Depth - 1);
                             });
    if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
-    return getSetupCost(S->getLHS()) + getSetupCost(S->getRHS());
+    return getSetupCost(S->getLHS(), Depth - 1) +
+           getSetupCost(S->getRHS(), Depth - 1);
    return 0;
  }
  
@@ -1293,7 +1294,9 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
  
    // Rough heuristic; favor registers which don't require extra setup
    // instructions in the preheader.
-  C.SetupCost += getSetupCost(Reg);
+  C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+  // Ensure we don't, even with the recusion limit, produce invalid costs.
+  C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
  
    C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
                 SE->hasComputableLoopEvolution(Reg, L);
diff --git a/test/CodeGen/Hexagon/swp-carried-1.ll b/test/CodeGen/Hexagon/swp-carried-1.ll

index 7408027..b33cf52 100644 (file)
--- a/test/CodeGen/Hexagon/swp-carried-1.ll
+++ b/test/CodeGen/Hexagon/swp-carried-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-recursive-setupcost=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s | FileCheck %s
  
  ; Test that we generate the correct code when a loop carried value
  ; is scheduled one stage earlier than it's use. The code in
diff --git a/test/Transforms/LoopStrengthReduce/gnarly-setupcost.ll b/test/Transforms/LoopStrengthReduce/gnarly-setupcost.ll

new file mode 100644 (file)

index 0000000..adcc5e3
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/gnarly-setupcost.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; Check this completes sensibly. The sequence here is quick for scev to
+; calculate, but creates a very large tree if recursed into, the same node
+; being processed many times. I will also naturally have a setupcost of
+; 0xffffffff, which LSR will treat as invalid.
+; CHECK-LABEL: func
+; CHECK: load i32, i32* %gep
+
+define i32 @func(i32* %in) {
+entry:
+  %load = load i32, i32* %in, align 4
+  %a1 = add i32 %load, 1
+  %m1 = mul i32 %a1, %load
+  %a2 = add i32 %m1, 1
+  %m2 = mul i32 %a2, %m1
+  %a3 = add i32 %m2, 1
+  %m3 = mul i32 %a3, %m2
+  %a4 = add i32 %m3, 1
+  %m4 = mul i32 %a4, %m3
+  %a5 = add i32 %m4, 1
+  %m5 = mul i32 %a5, %m4
+  %a6 = add i32 %m5, 1
+  %m6 = mul i32 %a6, %m5
+  %a7 = add i32 %m6, 1
+  %m7 = mul i32 %a7, %m6
+  %a8 = add i32 %m7, 1
+  %m8 = mul i32 %a8, %m7
+  %a9 = add i32 %m8, 1
+  %m9 = mul i32 %a9, %m8
+  %a10 = add i32 %m9, 1
+  %m10 = mul i32 %a10, %m9
+  %a11 = add i32 %m10, 1
+  %m11 = mul i32 %a11, %m10
+  %a12 = add i32 %m11, 1
+  %m12 = mul i32 %a12, %m11
+  %a13 = add i32 %m12, 1
+  %m13 = mul i32 %a13, %m12
+  %a14 = add i32 %m13, 1
+  %m14 = mul i32 %a14, %m13
+  %a15 = add i32 %m14, 1
+  %m15 = mul i32 %a15, %m14
+  %a16 = add i32 %m15, 1
+  %m16 = mul i32 %a16, %m15
+  %a17 = add i32 %m16, 1
+  %m17 = mul i32 %a17, %m16
+  %a18 = add i32 %m17, 1
+  %m18 = mul i32 %a18, %m17
+  %a19 = add i32 %m18, 1
+  %m19 = mul i32 %a19, %m18
+  %a20 = add i32 %m19, 1
+  %m20 = mul i32 %a20, %m19
+  %a21 = add i32 %m20, 1
+  %m21 = mul i32 %a21, %m20
+  %a22 = add i32 %m21, 1
+  %m22 = mul i32 %a22, %m21
+  %a23 = add i32 %m22, 1
+  %m23 = mul i32 %a23, %m22
+  %a24 = add i32 %m23, 1
+  %m24 = mul i32 %a24, %m23
+  %a25 = add i32 %m24, 1
+  %m25 = mul i32 %a25, %m24
+  %a26 = add i32 %m25, 1
+  %m26 = mul i32 %a26, %m25
+  %a27 = add i32 %m26, 1
+  %m27 = mul i32 %a27, %m26
+  %a28 = add i32 %m27, 1
+  %m28 = mul i32 %a28, %m27
+  %a29 = add i32 %m28, 1
+  %m29 = mul i32 %a29, %m28
+  %a30 = add i32 %m29, 1
+  %m30 = mul i32 %a30, %m29
+  %a31 = add i32 %m30, 1
+  %m31 = mul i32 %a31, %m30
+  br label %loop
+
+loop:
+  %lp = phi i32 [ %m31, %entry ], [ %linc, %loop ]
+  %0 = sext i32 %lp to i64
+  %gep = getelementptr inbounds i32, i32* %in, i64 %0
+  %loopload = load i32, i32* %gep, align 4
+  store i32 0, i32* %gep, align 4
+  %linc = add i32 %lp, 1
+  %lcmp = icmp eq i32 %linc, 100
+  br i1 %lcmp, label %exit, label %loop
+
+exit:
+  %ll = phi i32 [ %loopload, %loop ]
+  ret i32 %ll
+}
+
diff --git a/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll

index ab6dd48..f0b872a 100644 (file)
--- a/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -lsr-recursive-setupcost=0 -S | FileCheck %s
+; RUN: opt < %s -loop-reduce -lsr-setupcost-depth-limit=1 -S | FileCheck %s
  
  ; This test is adapted from the n-body test of the LLVM test-suite: A bug in
  ; r345114 caused LSR to generate incorrect code. The test verifies that the
author	David Green <david.green@arm.com>
	Tue, 23 Apr 2019 08:52:21 +0000 (08:52 +0000)
committer	David Green <david.green@arm.com>
	Tue, 23 Apr 2019 08:52:21 +0000 (08:52 +0000)
lib/Transforms/Scalar/LoopStrengthReduce.cpp		patch \| blob \| history
test/CodeGen/Hexagon/swp-carried-1.ll		patch \| blob \| history
test/Transforms/LoopStrengthReduce/gnarly-setupcost.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/LoopStrengthReduce/two-combinations-bug.ll		patch \| blob \| history