From b6b25740b763ba8cf7d6eecb59b47e25d6edc1f5 Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi <yamauchi@google.com> Date: Mon, 11 Sep 2017 17:52:08 +0000 Subject: [PATCH] Unmerge GEPs to reduce register pressure on IndirectBr edges. Summary: GEP merging can sometimes increase the number of live values and register pressure across control edges and cause performance problems particularly if the increased register pressure results in spills. This change implements GEP unmerging around an IndirectBr in certain cases to mitigate the issue. This is in the CodeGenPrepare pass (after all the GEP merging has happened.) With this patch, the Python interpreter loop runs faster by ~5%. Reviewers: sanjoy, hfinkel Reviewed By: hfinkel Subscribers: eastig, junbuml, llvm-commits Differential Revision: https://reviews.llvm.org/D36772 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312930 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 167 ++++++++++++++++++++++++ test/Transforms/CodeGenPrepare/gep-unmerging.ll | 60 +++++++++ 2 files changed, 227 insertions(+) create mode 100644 test/Transforms/CodeGenPrepare/gep-unmerging.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 60de0b0c005..b4a52d499d3 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -6189,6 +6189,170 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, return true; } +// Return true if the GEP has two operands, the first operand is of a sequential +// type, and the second operand is a constant. +static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) { + gep_type_iterator I = gep_type_begin(*GEP); + return GEP->getNumOperands() == 2 && + I.isSequential() && + isa<ConstantInt>(GEP->getOperand(1)); +} + +// Try unmerging GEPs to reduce liveness interference (register pressure) across +// IndirectBr edges. 
Since IndirectBr edges tend to touch on many blocks, +// reducing liveness interference across those edges benefits global register +// allocation. Currently handles only certain cases. +// +// For example, unmerge %GEPI and %UGEPI as below. +// +// ---------- BEFORE ---------- +// SrcBlock: +// ... +// %GEPIOp = ... +// ... +// %GEPI = gep %GEPIOp, Idx +// ... +// indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ] +// (* %GEPI is alive on the indirectbr edges due to other uses ahead) +// (* %GEPIOp is alive on the indirectbr edges only because it's used by +// %UGEPI) +// +// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged) +// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged) +// ... +// +// DstBi: +// ... +// %UGEPI = gep %GEPIOp, UIdx +// ... +// --------------------------- +// +// ---------- AFTER ---------- +// SrcBlock: +// ... (same as above) +// (* %GEPI is still alive on the indirectbr edges) +// (* %GEPIOp is no longer alive on the indirectbr edges as a result of the +// unmerging) +// ... +// +// DstBi: +// ... +// %UGEPI = gep %GEPI, (UIdx-Idx) +// ... +// --------------------------- +// +// The register pressure on the IndirectBr edges is reduced because %GEPIOp is +// no longer alive on them. +// +// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging +// of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as +// not to disable further simplifications and optimizations as a result of GEP +// merging. +// +// Note this unmerging may increase the length of the data flow critical path +// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff +// between the register pressure and the length of data-flow critical +// path. Restricting this to the uncommon IndirectBr case would minimize the +// impact of potentially longer critical path, if any, and the impact on compile +// time. 
+static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, + const TargetTransformInfo *TTI) { + BasicBlock *SrcBlock = GEPI->getParent(); + // Check that SrcBlock ends with an IndirectBr. If not, give up. The common + // (non-IndirectBr) cases exit early here. + if (!isa<IndirectBrInst>(SrcBlock->getTerminator())) + return false; + // Check that GEPI is a simple gep with a single constant index. + if (!GEPSequentialConstIndexed(GEPI)) + return false; + ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1)); + // Check that GEPI is a cheap one. + if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType()) + > TargetTransformInfo::TCC_Basic) + return false; + Value *GEPIOp = GEPI->getOperand(0); + // Check that GEPIOp is an instruction that's also defined in SrcBlock. + if (!isa<Instruction>(GEPIOp)) + return false; + auto *GEPIOpI = cast<Instruction>(GEPIOp); + if (GEPIOpI->getParent() != SrcBlock) + return false; + // Check that GEP is used outside the block, meaning it's alive on the + // IndirectBr edge(s). + if (find_if(GEPI->users(), [&](User *Usr) { + if (auto *I = dyn_cast<Instruction>(Usr)) { + if (I->getParent() != SrcBlock) { + return true; + } + } + return false; + }) == GEPI->users().end()) + return false; + // The second elements of the GEP chains to be unmerged. + std::vector<GetElementPtrInst *> UGEPIs; + // Check each user of GEPIOp to check if unmerging would make GEPIOp not alive + // on IndirectBr edges. + for (User *Usr : GEPIOp->users()) { + if (Usr == GEPI) continue; + // Check if Usr is an Instruction. If not, give up. + if (!isa<Instruction>(Usr)) + return false; + auto *UI = cast<Instruction>(Usr); + // Check if Usr in the same block as GEPIOp, which is fine, skip. + if (UI->getParent() == SrcBlock) + continue; + // Check if Usr is a GEP. If not, give up. + if (!isa<GetElementPtrInst>(Usr)) + return false; + auto *UGEPI = cast<GetElementPtrInst>(Usr); + // Check if UGEPI is a simple gep with a single constant index and GEPIOp is + // the pointer operand to it. If so, record it in the vector. If not, give + // up. 
+ if (!GEPSequentialConstIndexed(UGEPI)) + return false; + if (UGEPI->getOperand(0) != GEPIOp) + return false; + if (GEPIIdx->getType() != + cast<ConstantInt>(UGEPI->getOperand(1))->getType()) + return false; + ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); + if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType()) + > TargetTransformInfo::TCC_Basic) + return false; + UGEPIs.push_back(UGEPI); + } + if (UGEPIs.size() == 0) + return false; + // Check the materializing cost of (Uidx-Idx). + for (GetElementPtrInst *UGEPI : UGEPIs) { + ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); + APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue(); + unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType()); + if (ImmCost > TargetTransformInfo::TCC_Basic) + return false; + } + // Now unmerge between GEPI and UGEPIs. + for (GetElementPtrInst *UGEPI : UGEPIs) { + UGEPI->setOperand(0, GEPI); + ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); + Constant *NewUGEPIIdx = + ConstantInt::get(GEPIIdx->getType(), + UGEPIIdx->getValue() - GEPIIdx->getValue()); + UGEPI->setOperand(1, NewUGEPIIdx); + // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not + // inbounds to avoid UB. + if (!GEPI->isInBounds()) { + UGEPI->setIsInBounds(false); + } + } + // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not + // alive on IndirectBr edges). + assert(find_if(GEPIOp->users(), [&](User *Usr) { + return cast<Instruction>(Usr)->getParent() != SrcBlock; + }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock"); + return true; +} + bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { // Bail out if we inserted the instruction to prevent optimizations from // stepping on each other's toes. 
@@ -6302,6 +6466,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { optimizeInst(NC, ModifiedDT); return true; } + if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) { + return true; + } return false; } diff --git a/test/Transforms/CodeGenPrepare/gep-unmerging.ll b/test/Transforms/CodeGenPrepare/gep-unmerging.ll new file mode 100644 index 00000000000..09756756551 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/gep-unmerging.ll @@ -0,0 +1,60 @@ +; RUN: opt -codegenprepare -S < %s | FileCheck %s + +@exit_addr = constant i8* blockaddress(@gep_unmerging, %exit) +@op1_addr = constant i8* blockaddress(@gep_unmerging, %op1) +@op2_addr = constant i8* blockaddress(@gep_unmerging, %op2) +@op3_addr = constant i8* blockaddress(@gep_unmerging, %op3) +@dummy = global i8 0 + +define void @gep_unmerging(i1 %pred, i8* %p0) { +entry: + %table = alloca [256 x i8*] + %table_0 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 0 + %table_1 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 1 + %table_2 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 2 + %table_3 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 3 + %exit_a = load i8*, i8** @exit_addr + %op1_a = load i8*, i8** @op1_addr + %op2_a = load i8*, i8** @op2_addr + %op3_a = load i8*, i8** @op3_addr + store i8* %exit_a, i8** %table_0 + store i8* %op1_a, i8** %table_1 + store i8* %op2_a, i8** %table_2 + store i8* %op3_a, i8** %table_3 + br label %indirectbr + +op1: +; CHECK-LABEL: op1: +; CHECK-NEXT: %p1_inc2 = getelementptr i8, i8* %p_postinc, i64 2 +; CHECK-NEXT: %p1_inc1 = getelementptr i8, i8* %p_postinc, i64 1 + %p1_inc2 = getelementptr i8, i8* %p_preinc, i64 3 + %p1_inc1 = getelementptr i8, i8* %p_preinc, i64 2 + %a10 = load i8, i8* %p_postinc + %a11 = load i8, i8* %p1_inc1 + %a12 = add i8 %a10, %a11 + store i8 %a12, i8* @dummy + br i1 %pred, label %indirectbr, label %exit + +op2: +; CHECK-LABEL: op2: +; CHECK-NEXT: %p2_inc = getelementptr i8, i8* 
%p_postinc, i64 1 + %p2_inc = getelementptr i8, i8* %p_preinc, i64 2 + %a2 = load i8, i8* %p_postinc + store i8 %a2, i8* @dummy + br i1 %pred, label %indirectbr, label %exit + +op3: + br i1 %pred, label %indirectbr, label %exit + +indirectbr: + %p_preinc = phi i8* [%p0, %entry], [%p1_inc2, %op1], [%p2_inc, %op2], [%p_postinc, %op3] + %p_postinc = getelementptr i8, i8* %p_preinc, i64 1 + %next_op = load i8, i8* %p_preinc + %p_zext = zext i8 %next_op to i64 + %slot = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 %p_zext + %target = load i8*, i8** %slot + indirectbr i8* %target, [label %exit, label %op1, label %op2] + +exit: + ret void +} -- 2.11.0