From: Kyle Butt Date: Fri, 3 Mar 2017 01:00:22 +0000 (+0000) Subject: CodeGen: BlockPlacement: Precompute layout for chains of triangles. X-Git-Tag: android-x86-7.1-r4~19475 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=c160e2a10f8571824e2c96a67e0086a3c1e66c21;p=android-x86%2Fexternal-llvm.git CodeGen: BlockPlacement: Precompute layout for chains of triangles. For chains of triangles with small join blocks that can be tail duplicated, a simple calculation of probabilities is insufficient. Tail duplication can be profitable in 3 different ways for these cases: 1) The post-dominators marked 50% are actually taken 56% (This shrinks with longer chains) 2) The chains are statically correlated. Branch probabilities have a very U-shaped distribution. [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] If the branches in a chain are likely to be from the same side of the distribution as their predecessor, but are independent at runtime, this transformation is profitable. (Because the cost of being wrong is a small fixed cost, unlike the standard triangle layout where the cost of being wrong scales with the # of triangles.) 3) The chains are dynamically correlated. If the probability that a previous branch was taken positively influences whether the next branch will be taken We believe that 2 and 3 are common enough to justify the small margin in 1. The code pre-scans a function's CFG to identify this pattern and marks the edges so that the standard layout algorithm can use the computed results. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296845 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 74a64631cb9..cba1767281a 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -50,6 +50,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include +#include #include #include using namespace llvm; @@ -143,6 +144,14 @@ static cl::opt TailDupPlacementPenalty( cl::init(2), cl::Hidden); +// Heuristic for triangle chains. +static cl::opt TriangleChainCount( + "triangle-chain-count", + cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " + "triangle tail duplication heuristic to kick in. 0 to disable."), + cl::init(3), + cl::Hidden); + extern cl::opt StaticLikelyProb; extern cl::opt ProfileLikelyProb; @@ -456,6 +465,9 @@ class MachineBlockPlacement : public MachineFunctionPass { bool canTailDuplicateUnplacedPreds( const MachineBasicBlock *BB, MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Find chains of triangles to tail-duplicate where a global analysis works, + /// but a local analysis would not find them. + void precomputeTriangleChains(); public: static char ID; // Pass identification, replacement for typeid @@ -1041,6 +1053,127 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( return true; } +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +/// longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +/// U-shaped distribution. +/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +/// If the branches in a chain are likely to be from the same side of the +/// distribution as their predecessor, but are independent at runtime, this +/// transformation is profitable. (Because the cost of being wrong is a small +/// fixed cost, unlike the standard triangle layout where the cost of being +/// wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. If the probability that a previous +/// branch was taken positively influences whether the next branch will be +/// taken +/// We believe that 2 and 3 are common enough to justify the small margin in 1. +void MachineBlockPlacement::precomputeTriangleChains() { + struct TriangleChain { + unsigned Count; + std::forward_list Edges; + TriangleChain(MachineBasicBlock* src, MachineBasicBlock *dst) { + Edges.push_front(src); + Edges.push_front(dst); + Count = 1; + } + + void append(MachineBasicBlock *dst) { + assert(!Edges.empty() && Edges.front()->isSuccessor(dst) && + "Attempting to append a block that is not a successor."); + Edges.push_front(dst); + ++Count; + } + + MachineBasicBlock *getKey() { + return Edges.front(); + } + }; + + if (TriangleChainCount == 0) + return; + + DEBUG(dbgs() << "Pre-computing triangle chains.\n"); + // Map from last block to the chain that contains it. This allows us to extend + // chains as we find new triangles. + DenseMap TriangleChainMap; + for (MachineBasicBlock &BB : *F) { + // If BB doesn't have 2 successors, it doesn't start a triangle. + if (BB.succ_size() != 2) + continue; + MachineBasicBlock *PDom = nullptr; + for (MachineBasicBlock *Succ : BB.successors()) { + if (!MPDT->dominates(Succ, &BB)) + continue; + PDom = Succ; + break; + } + // If BB doesn't have a post-dominating successor, it doesn't form a + // triangle. + if (PDom == nullptr) + continue; + // If PDom has a hint that it is low probability, skip this triangle. + if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) + continue; + // If PDom isn't eligible for duplication, this isn't the kind of triangle + // we're looking for. + if (!shouldTailDuplicate(PDom)) + continue; + bool CanTailDuplicate = true; + // If PDom can't tail-duplicate into it's non-BB predecessors, then this + // isn't the kind of triangle we're looking for. + for (MachineBasicBlock* Pred : PDom->predecessors()) { + if (Pred == &BB) + continue; + if (!TailDup.canTailDuplicate(PDom, Pred)) { + CanTailDuplicate = false; + break; + } + } + // If we can't tail-duplicate PDom to its predecessors, then skip this + // triangle. + if (!CanTailDuplicate) + continue; + + // Now we have an interesting triangle. Insert it if it's not part of an + // existing chain + // Note: This cannot be replaced with a call insert() or emplace() because + // the find key is BB, but the insert/emplace key is PDom. + auto Found = TriangleChainMap.find(&BB); + // If it is, remove the chain from the map, grow it, and put it back in the + // map with the end as the new key. + if (Found != TriangleChainMap.end()) { + TriangleChain Chain = std::move(Found->second); + TriangleChainMap.erase(Found); + Chain.append(PDom); + TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); + } else { + auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); + assert (InsertResult.second && "Block seen twice."); + (void) InsertResult; + } + } + + for (auto &ChainPair : TriangleChainMap) { + TriangleChain &Chain = ChainPair.second; + // Benchmarking has shown that due to branch correlation duplicating 2 or + // more triangles is profitable, despite the calculations assuming + // independence. + if (Chain.Count < TriangleChainCount) + continue; + MachineBasicBlock *dst = Chain.Edges.front(); + Chain.Edges.pop_front(); + for (MachineBasicBlock *src : Chain.Edges) { + DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << + getBlockName(dst) << " as pre-computed based on triangles.\n"); + ComputedEdges[src] = { dst, true }; + dst = src; + } + } +} + // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold( @@ -2504,6 +2637,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + precomputeTriangleChains(); } assert(BlockToChain.empty()); diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll index 95c23ade513..e0979d89473 100644 --- a/test/CodeGen/PowerPC/tail-dup-layout.ll +++ b/test/CodeGen/PowerPC/tail-dup-layout.ll @@ -95,6 +95,87 @@ exit: } ; Intended layout: +; The chain-of-triangles based duplicating produces the layout +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; exit +; even for 50/50 branches. +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: straight_test_50: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: .[[OPT1LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-NEXT: .[[OPT2LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, .[[TEST4LABEL]] +;CHECK-NEXT: .[[OPT3LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-NEXT: .[[OPT4LABEL]]: +;CHECK: b .[[EXITLABEL]] + +define void @straight_test_50(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3, !prof !2 +optional3: + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1 +optional4: + call void @d() + br label %exit +exit: + ret void +} + +; Intended layout: ; The chain-based outlining produces the layout ; entry ; --- Begin loop --- diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll index 38ba308ecff..5b984d27249 100644 --- a/test/CodeGen/X86/cmovcmov.ll +++ b/test/CodeGen/X86/cmovcmov.ll @@ -249,16 +249,23 @@ attributes #0 = { nounwind } ; CMOV-DAG: cmpl %edx, %esi ; CMOV-DAG: movb $20, %al ; CMOV-DAG: movb $20, %dl -; CMOV: jl [[BB0:.LBB[0-9_]+]] +; CMOV: jge [[BB2:.LBB[0-9_]+]] +; CMOV: jle [[BB3:.LBB[0-9_]+]] +; CMOV: [[BB0:.LBB[0-9_]+]] +; CMOV: testl %edi, %edi +; CMOV: jne [[BB4:.LBB[0-9_]+]] +; CMOV: [[BB1:.LBB[0-9_]+]] +; CMOV: movb %al, g8(%rip) +; CMOV: retq +; CMOV: [[BB2]]: ; CMOV: movl %ecx, %edx -; CMOV: [[BB0]]: -; CMOV: jg [[BB1:.LBB[0-9_]+]] +; CMOV: jg [[BB0]] +; CMOV: [[BB3]]: ; CMOV: movl %edx, %eax -; CMOV: [[BB1]]: ; CMOV: testl %edi, %edi -; CMOV: je [[BB2:.LBB[0-9_]+]] +; CMOV: je [[BB1]] +; CMOV: [[BB4]]: ; CMOV: movl %edx, %eax -; CMOV: [[BB2]]: ; CMOV: movb %al, g8(%rip) ; CMOV: retq define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {