From ae5ecee3eca91869c3476ad209393273bd08ce1e Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 16 Nov 2016 18:42:17 +0000 Subject: [PATCH] AMDGPU/SI: Avoid creating unnecessary copies in the SIFixSGPRCopies pass Summary: 1. Don't try to copy values to and from the same register class. 2. Replace copies with of registers with immediate values with v_mov/s_mov instructions. The main purpose of this change is to make MachineSink do a better job of determining when it is beneficial to split a critical edge, since the pass assumes that copies will become move instructions. This prevents a regression in uniform-cfg.ll if we enable critical edge splitting for AMDGPU. Reviewers: arsenm Subscribers: arsenm, kzhuravl, llvm-commits Differential Revision: https://reviews.llvm.org/D23408 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287131 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInstrInfo.cpp | 71 +++++++++++++++++----- lib/Target/AMDGPU/SIInstrInfo.h | 6 ++ lib/Target/AMDGPU/SIRegisterInfo.cpp | 25 ++++---- lib/Target/AMDGPU/SIRegisterInfo.h | 2 + test/CodeGen/AMDGPU/control-flow-fastregalloc.ll | 8 +-- test/CodeGen/AMDGPU/insert_vector_elt.ll | 7 +-- test/CodeGen/AMDGPU/scratch-buffer.ll | 4 +- .../si-instr-info-correct-implicit-operands.ll | 2 +- 8 files changed, 89 insertions(+), 36 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index af9402be585..dcbb51bebb6 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2546,6 +2546,39 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, } } +void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, + MachineRegisterInfo &MRI, + const DebugLoc &DL) const { + + unsigned OpReg = Op.getReg(); + unsigned OpSubReg = Op.getSubReg(); + + const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( + RI.getRegClassForReg(MRI, OpReg), OpSubReg); + + // Check if operand is already the correct register class. + if (DstRC == OpRC) + return; + + unsigned DstReg = MRI.createVirtualRegister(DstRC); + MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setSubReg(0); + + MachineInstr *Def = MRI.getVRegDef(OpReg); + if (!Def) + return; + + // Try to eliminate the copy if it is copying an immediate value. + if (Def->isMoveImmediate()) + FoldImmediate(*Copy, *Def, OpReg, &MRI); +} + void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -2603,15 +2636,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; - unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - Op.setReg(DstReg); + // Avoid creating no-op copies with the same src and dst reg class. These + // confuse some of the machine passes. + legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); } } @@ -2635,12 +2667,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { if (VRC == OpRC) continue; - unsigned DstReg = MRI.createVirtualRegister(VRC); - - BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - - Op.setReg(DstReg); + legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); Op.setIsKill(); } } @@ -2656,11 +2683,9 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI.getParent(); - unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI.getOperand(1).setReg(NewSrc0); + MachineBasicBlock *MBB = MI.getParent(); + MachineOperand &Op = MI.getOperand(1); + legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); } return; } @@ -3000,6 +3025,22 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; unsigned DstReg = Inst.getOperand(0).getReg(); + if (Inst.isCopy() && + TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + continue; + } + NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 515beb4bbfd..71f1968d250 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -567,6 +567,12 @@ public: void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; + void legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, MachineRegisterInfo &MRI, + const DebugLoc &DL) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr &MI) const; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 63cd49dc044..b30271fc5cf 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1059,17 +1059,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, return AMDGPU::NoRegister; } -bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, - unsigned Reg) const { - const TargetRegisterClass *RC; - if (TargetRegisterInfo::isVirtualRegister(Reg)) - RC = MRI.getRegClass(Reg); - else - RC = getPhysRegClass(Reg); - - return hasVGPRs(RC); -} - unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const { if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 800; @@ -1363,3 +1352,17 @@ ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC llvm_unreachable("unhandled register size"); } } + +const TargetRegisterClass* +SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, + unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + + return getPhysRegClass(Reg); +} + +bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + return hasVGPRs(getRegClassForReg(MRI, Reg)); +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 4a1f8640ad7..467b36616f0 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -172,6 +172,8 @@ public: unsigned getSGPRPressureSet() const { return SGPRSetID; }; unsigned getVGPRPressureSet() const { return VGPRSetID; }; + const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, + unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; bool isSGPRPressureSet(unsigned SetID) const { diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 27b3cc0e435..226def93f17 100644 --- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -108,9 +108,9 @@ endif: ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:12 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -133,11 +133,11 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:12 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 37da9c5d5ad..8ff09ff96f8 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -207,7 +207,6 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_ushort v{{[0-9]+}}, off ; GCN: buffer_load_ushort v{{[0-9]+}}, off -; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}} ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 @@ -418,7 +417,7 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d } ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64: -; GCN: SCRATCH_RSRC_DWORD +; GCN-DAG: SCRATCH_RSRC_DWORD ; FIXME: Should be able to eliminate this? @@ -426,8 +425,8 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} -; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 0{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index ddf971ccdde..94101f0b92b 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -48,7 +48,9 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8000 +; This constant isn't folded, because it has multiple uses. +; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8000 +; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll index 98d1bb7cf9a..0d1de6662f2 100644 --- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -3,7 +3,7 @@ ; register operands in the correct order when modifying the opcode of an ; instruction to V_ADD_I32_e32. -; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 killed %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: -- 2.11.0