From 9d60e0ff0a335ec3e59b846629c9c4cd5be64cd7 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Thu, 8 Jan 2015 01:16:39 +0000 Subject: [PATCH] [RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225422 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/RegAllocBase.cpp | 2 + lib/CodeGen/RegAllocBase.h | 3 + lib/CodeGen/RegAllocGreedy.cpp | 209 ++++++++++++++++++++- .../CodeGen/X86/regalloc-reconcile-broken-hints.ll | 145 ++++++++++++++ 4 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/regalloc-reconcile-broken-hints.ll diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index 122afd122d2..6b346f4536c 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -90,6 +90,7 @@ void RegAllocBase::allocatePhysRegs() { // Unused registers can appear when the spiller coalesces snippets. if (MRI->reg_nodbg_empty(VirtReg->reg)) { DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n'); + aboutToRemoveInterval(*VirtReg); LIS->removeInterval(VirtReg->reg); continue; } @@ -139,6 +140,7 @@ void RegAllocBase::allocatePhysRegs() { assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned"); if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) { DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n'); + aboutToRemoveInterval(*SplitVirtReg); LIS->removeInterval(SplitVirtReg->reg); continue; } diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h index bbd79cdce00..659b8f505a2 100644 --- a/lib/CodeGen/RegAllocBase.h +++ b/lib/CodeGen/RegAllocBase.h @@ -96,6 +96,9 @@ protected: // Use this group name for NamedRegionTimer. static const char TimerGroupName[]; + /// Method called when the allocator is about to remove a LiveInterval. + virtual void aboutToRemoveInterval(LiveInterval &LI) {} + public: /// VerifyEnabled - True when -verify-regalloc is given. static bool VerifyEnabled; diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 8ef5dcdec98..edc329499c4 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -296,6 +296,9 @@ class RAGreedy : public MachineFunctionPass, /// obtained from the TargetSubtargetInfo. bool EnableLocalReassign; + /// Set of broken hints that may be reconciled later because of eviction. + SmallSetVector SetOfBrokenHints; + public: RAGreedy(); @@ -311,6 +314,7 @@ public: void enqueue(LiveInterval *LI) override; LiveInterval *dequeue() override; unsigned selectOrSplit(LiveInterval&, SmallVectorImpl&) override; + void aboutToRemoveInterval(LiveInterval &) override; /// Perform register allocation. bool runOnMachineFunction(MachineFunction &mf) override; @@ -378,6 +382,24 @@ private: SmallVirtRegSet &, unsigned); bool tryRecoloringCandidates(PQueue &, SmallVectorImpl &, SmallVirtRegSet &, unsigned); + void tryHintRecoloring(LiveInterval &); + void tryHintsRecoloring(); + + /// Model the information carried by one end of a copy. + struct HintInfo { + /// The frequency of the copy. + BlockFrequency Freq; + /// The virtual register or physical register. + unsigned Reg; + /// Its currently assigned register. + /// In case of a physical register Reg == PhysReg. + unsigned PhysReg; + HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg) + : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {} + }; + typedef SmallVector HintsInfo; + BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned); + void collectHintInfo(unsigned, HintsInfo &); }; } // end anonymous namespace @@ -453,7 +475,9 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { if (VRM->hasPhys(VirtReg)) { - Matrix->unassign(LIS->getInterval(VirtReg)); + LiveInterval &LI = LIS->getInterval(VirtReg); + Matrix->unassign(LI); + aboutToRemoveInterval(LI); return true; } // Unassigned virtreg is probably in the priority queue. @@ -2213,6 +2237,11 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, return PhysReg; } +void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) { + // Do not keep invalid information around. + SetOfBrokenHints.remove(&LI); +} + void RAGreedy::initializeCSRCost() { // We use the larger one out of the command-line option and the value report // by TRI. @@ -2238,6 +2267,170 @@ void RAGreedy::initializeCSRCost() { CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry); } +/// \brief Collect the hint info for \p Reg. +/// The results are stored into \p Out. +/// \p Out is not cleared before being populated. +void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) { + for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) { + if (!Instr.isFullCopy()) + continue; + // Look for the other end of the copy. + unsigned OtherReg = Instr.getOperand(0).getReg(); + if (OtherReg == Reg) { + OtherReg = Instr.getOperand(1).getReg(); + if (OtherReg == Reg) + continue; + } + // Get the current assignment. + unsigned OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg) + ? OtherReg + : VRM->getPhys(OtherReg); + // Push the collected information. + Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg, + OtherPhysReg)); + } +} + +/// \brief Using the given \p List, compute the cost of the broken hints if +/// \p PhysReg was used. +/// \return The cost of \p List for \p PhysReg. +BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List, + unsigned PhysReg) { + BlockFrequency Cost = 0; + for (const HintInfo &Info : List) { + if (Info.PhysReg != PhysReg) + Cost += Info.Freq; + } + return Cost; +} + +/// \brief Using the register assigned to \p VirtReg, try to recolor +/// all the live ranges that are copy-related with \p VirtReg. +/// The recoloring is then propagated to all the live-ranges that have +/// been recolored and so on, until no more copies can be coalesced or +/// it is not profitable. +/// For a given live range, profitability is determined by the sum of the +/// frequencies of the non-identity copies it would introduce with the old +/// and new register. +void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { + // We have a broken hint, check if it is possible to fix it by + // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted + // some register and PhysReg may be available for the other live-ranges. + SmallSet Visited; + SmallVector RecoloringCandidates; + HintsInfo Info; + unsigned Reg = VirtReg.reg; + unsigned PhysReg = VRM->getPhys(Reg); + // Start the recoloring algorithm from the input live-interval, then + // it will propagate to the ones that are copy-related with it. + Visited.insert(Reg); + RecoloringCandidates.push_back(Reg); + + DEBUG(dbgs() << "Trying to reconcile hints for: " << PrintReg(Reg, TRI) << '(' + << PrintReg(PhysReg, TRI) << ")\n"); + + do { + Reg = RecoloringCandidates.pop_back_val(); + + // We cannot recolor physcal register. + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + + assert(VRM->hasPhys(Reg) && "We have unallocated variable!!"); + + // Get the live interval mapped with this virtual register to be able + // to check for the interference with the new color. + LiveInterval &LI = LIS->getInterval(Reg); + unsigned CurrPhys = VRM->getPhys(Reg); + // Check that the new color matches the register class constraints and + // that it is free for this live range. + if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) || + Matrix->checkInterference(LI, PhysReg))) + continue; + + DEBUG(dbgs() << PrintReg(Reg, TRI) << '(' << PrintReg(CurrPhys, TRI) + << ") is recolorable.\n"); + + // Gather the hint info. + Info.clear(); + collectHintInfo(Reg, Info); + // Check if recoloring the live-range will increase the cost of the + // non-identity copies. + if (CurrPhys != PhysReg) { + DEBUG(dbgs() << "Checking profitability:\n"); + BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys); + BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg); + DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency() + << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n'); + if (OldCopiesCost < NewCopiesCost) { + DEBUG(dbgs() << "=> Not profitable.\n"); + continue; + } + // At this point, the cost is either cheaper or equal. If it is + // equal, we consider this is profitable because it may expose + // more recoloring opportunities. + DEBUG(dbgs() << "=> Profitable.\n"); + // Recolor the live-range. + Matrix->unassign(LI); + Matrix->assign(LI, PhysReg); + } + // Push all copy-related live-ranges to keep reconciling the broken + // hints. + for (const HintInfo &HI : Info) { + if (Visited.insert(HI.Reg).second) + RecoloringCandidates.push_back(HI.Reg); + } + } while (!RecoloringCandidates.empty()); +} + +/// \brief Try to recolor broken hints. +/// Broken hints may be repaired by recoloring when an evicted variable +/// freed up a register for a larger live-range. +/// Consider the following example: +/// BB1: +/// a = +/// b = +/// BB2: +/// ... +/// = b +/// = a +/// Let us assume b gets split: +/// BB1: +/// a = +/// b = +/// BB2: +/// c = b +/// ... +/// d = c +/// = d +/// = a +/// Because of how the allocation work, b, c, and d may be assigned different +/// colors. Now, if a gets evicted later: +/// BB1: +/// a = +/// st a, SpillSlot +/// b = +/// BB2: +/// c = b +/// ... +/// d = c +/// = d +/// e = ld SpillSlot +/// = e +/// This is likely that we can assign the same register for b, c, and d, +/// getting rid of 2 copies. +void RAGreedy::tryHintsRecoloring() { + for (LiveInterval *LI : SetOfBrokenHints) { + assert(TargetRegisterInfo::isVirtualRegister(LI->reg) && + "Recoloring is possible only for virtual registers"); + // Some dead defs may be around (e.g., because of debug uses). + // Ignore those. + if (!VRM->hasPhys(LI->reg)) + continue; + tryHintRecoloring(*LI); + } +} + unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, SmallVectorImpl &NewVRegs, SmallVirtRegSet &FixedRegisters, @@ -2274,8 +2467,18 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // queue. The RS_Split ranges already failed to do this, and they should not // get a second chance until they have been split. if (Stage != RS_Split) - if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) + if (unsigned PhysReg = + tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) { + unsigned Hint = MRI->getSimpleHint(VirtReg.reg); + // If VirtReg has a hint and that hint is broken record this + // virtual register as a recoloring candidate for broken hint. + // Indeed, since we evicted a variable in its neighborhood it is + // likely we can at least partially recolor some of the + // copy-related live-ranges. + if (Hint && Hint != PhysReg) + SetOfBrokenHints.insert(&VirtReg); return PhysReg; + } assert(NewVRegs.empty() && "Cannot append to existing NewVRegs"); @@ -2355,8 +2558,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { NextCascade = 1; IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. + SetOfBrokenHints.clear(); allocatePhysRegs(); + tryHintsRecoloring(); releaseMemory(); return true; } diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll new file mode 100644 index 00000000000..00679428ca6 --- /dev/null +++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll @@ -0,0 +1,145 @@ +; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s +; Test case for the recoloring of broken hints. +; This is tricky to have something reasonably small to kick this optimization since +; it requires that spliting and spilling occur. +; The bottom line is that this test case is fragile. +; This was reduced from the make_list function from the llvm-testsuite: +; SingleSource/Benchmarks/McGill/chomp.c + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +%struct._list = type { i32*, %struct._list* } + +@ncol = external global i32, align 4 +@nrow = external global i32, align 4 + +declare noalias i32* @copy_data() + +declare noalias i8* @malloc(i64) + +declare i32 @get_value() + +declare i32 @in_wanted(i32* nocapture readonly) + +declare noalias i32* @make_data() + +; CHECK-LABEL: make_list: +; Function prologue. +; CHECK: pushq +; CHECK: subq ${{[0-9]+}}, %rsp +; Move the first argument (%data) into a temporary register. +; It will not survive the call to malloc otherwise. +; CHECK: movq %rdi, [[ARG1:%r[0-9a-z]+]] +; CHECK: callq _malloc +; Compute %data - 1 as used for load in land.rhs.i (via the variable %indvars.iv.next.i). +; CHECK: addq $-4, [[ARG1]] +; We use to produce a useless copy here and move %data in another temporary register. +; CHECK-NOT: movq [[ARG1]] +; End of the first basic block. +; CHECK: .align +; Now check that %data is used in an address computation. +; CHECK: leaq ([[ARG1]] +define %struct._list* @make_list(i32* nocapture readonly %data, i32* nocapture %value, i32* nocapture %all) { +entry: + %call = tail call i8* @malloc(i64 16) + %next = getelementptr inbounds i8* %call, i64 8 + %tmp = bitcast i8* %next to %struct._list** + %tmp2 = bitcast i8* %call to %struct._list* + %.pre78 = load i32* @ncol, align 4 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc32, %entry + %tmp4 = phi i32 [ %.pre78, %entry ], [ 0, %for.inc32 ] + %current.077 = phi %struct._list* [ %tmp2, %entry ], [ %current.1.lcssa, %for.inc32 ] + %cmp270 = icmp eq i32 %tmp4, 0 + br i1 %cmp270, label %for.inc32, label %for.body3 + +for.body3: ; preds = %if.end31, %for.cond1.preheader + %current.173 = phi %struct._list* [ %current.2, %if.end31 ], [ %current.077, %for.cond1.preheader ] + %row.172 = phi i32 [ %row.3, %if.end31 ], [ 0, %for.cond1.preheader ] + %col.071 = phi i32 [ %inc, %if.end31 ], [ 0, %for.cond1.preheader ] + %call4 = tail call i32* @make_data() + %tmp5 = load i32* @ncol, align 4 + %tobool14.i = icmp eq i32 %tmp5, 0 + br i1 %tobool14.i, label %while.cond.i, label %while.body.lr.ph.i + +while.body.lr.ph.i: ; preds = %for.body3 + %tmp6 = sext i32 %tmp5 to i64 + br label %while.body.i + +while.body.i: ; preds = %while.body.i, %while.body.lr.ph.i + %indvars.iv.i = phi i64 [ %tmp6, %while.body.lr.ph.i ], [ %indvars.iv.next.i, %while.body.i ] + %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1 + %tmp9 = trunc i64 %indvars.iv.next.i to i32 + %tobool.i = icmp eq i32 %tmp9, 0 + br i1 %tobool.i, label %while.cond.i, label %while.body.i + +while.cond.i: ; preds = %land.rhs.i, %while.body.i, %for.body3 + %indvars.iv.i64 = phi i64 [ %indvars.iv.next.i65, %land.rhs.i ], [ 0, %for.body3 ], [ %tmp6, %while.body.i ] + %indvars.iv.next.i65 = add nsw i64 %indvars.iv.i64, -1 + %tmp10 = trunc i64 %indvars.iv.i64 to i32 + %tobool.i66 = icmp eq i32 %tmp10, 0 + br i1 %tobool.i66, label %if.else, label %land.rhs.i + +land.rhs.i: ; preds = %while.cond.i + %arrayidx.i67 = getelementptr inbounds i32* %call4, i64 %indvars.iv.next.i65 + %tmp11 = load i32* %arrayidx.i67, align 4 + %arrayidx2.i68 = getelementptr inbounds i32* %data, i64 %indvars.iv.next.i65 + %tmp12 = load i32* %arrayidx2.i68, align 4 + %cmp.i69 = icmp eq i32 %tmp11, %tmp12 + br i1 %cmp.i69, label %while.cond.i, label %equal_data.exit + +equal_data.exit: ; preds = %land.rhs.i + %cmp3.i = icmp slt i32 %tmp10, 1 + br i1 %cmp3.i, label %if.else, label %if.then + +if.then: ; preds = %equal_data.exit + %next7 = getelementptr inbounds %struct._list* %current.173, i64 0, i32 1 + %tmp14 = load %struct._list** %next7, align 8 + %next12 = getelementptr inbounds %struct._list* %tmp14, i64 0, i32 1 + store %struct._list* null, %struct._list** %next12, align 8 + %tmp15 = load %struct._list** %next7, align 8 + %tmp16 = load i32* %value, align 4 + %cmp14 = icmp eq i32 %tmp16, 1 + %.tmp16 = select i1 %cmp14, i32 0, i32 %tmp16 + %tmp18 = load i32* %all, align 4 + %tmp19 = or i32 %tmp18, %.tmp16 + %tmp20 = icmp eq i32 %tmp19, 0 + br i1 %tmp20, label %if.then19, label %if.end31 + +if.then19: ; preds = %if.then + %call21 = tail call i32 @in_wanted(i32* %call4) + br label %if.end31 + +if.else: ; preds = %equal_data.exit, %while.cond.i + %cmp26 = icmp eq i32 %col.071, 0 + %.row.172 = select i1 %cmp26, i32 0, i32 %row.172 + %sub30 = add nsw i32 %tmp5, -1 + br label %if.end31 + +if.end31: ; preds = %if.else, %if.then19, %if.then + %col.1 = phi i32 [ %sub30, %if.else ], [ 0, %if.then ], [ 0, %if.then19 ] + %row.3 = phi i32 [ %.row.172, %if.else ], [ %row.172, %if.then ], [ 0, %if.then19 ] + %current.2 = phi %struct._list* [ %current.173, %if.else ], [ %tmp15, %if.then ], [ %tmp15, %if.then19 ] + %inc = add nsw i32 %col.1, 1 + %tmp25 = load i32* @ncol, align 4 + %cmp2 = icmp eq i32 %inc, %tmp25 + br i1 %cmp2, label %for.cond1.for.inc32_crit_edge, label %for.body3 + +for.cond1.for.inc32_crit_edge: ; preds = %if.end31 + %.pre79 = load i32* @nrow, align 4 + br label %for.inc32 + +for.inc32: ; preds = %for.cond1.for.inc32_crit_edge, %for.cond1.preheader + %tmp26 = phi i32 [ %.pre79, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ] + %current.1.lcssa = phi %struct._list* [ %current.2, %for.cond1.for.inc32_crit_edge ], [ %current.077, %for.cond1.preheader ] + %row.1.lcssa = phi i32 [ %row.3, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ] + %inc33 = add nsw i32 %row.1.lcssa, 1 + %cmp = icmp eq i32 %inc33, %tmp26 + br i1 %cmp, label %for.end34, label %for.cond1.preheader + +for.end34: ; preds = %for.inc32 + %.pre = load %struct._list** %tmp, align 8 + ret %struct._list* %.pre +} -- 2.11.0