From 9787e8c76b6b0ea24bc0132a07f237bce8bbcf2a Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 21 Jul 2014 16:55:33 +0000 Subject: [PATCH] R600/SI: Add instruction shrinking pass This pass converts 64-bit instructions to 32-bit when possible. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213561 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUTargetMachine.cpp | 2 + lib/Target/R600/CMakeLists.txt | 1 + lib/Target/R600/SIInstrFormats.td | 1 + lib/Target/R600/SIInstrInfo.cpp | 9 ++ lib/Target/R600/SIInstrInfo.h | 6 + lib/Target/R600/SIInstrInfo.td | 9 ++ lib/Target/R600/SIShrinkInstructions.cpp | 189 +++++++++++++++++++++++++++++++ test/CodeGen/R600/bfi_int.ll | 2 +- test/CodeGen/R600/ctpop.ll | 2 +- test/CodeGen/R600/fcmp64.ll | 2 +- test/CodeGen/R600/seto.ll | 2 +- test/CodeGen/R600/setuo.ll | 2 +- 13 files changed, 223 insertions(+), 5 deletions(-) create mode 100644 lib/Target/R600/SIShrinkInstructions.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 416e050f675..d7e94f75e12 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -39,6 +39,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSIFixSGPRLiveRangesPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 23beb2576ac..56ba719e686 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -176,6 +176,7 @@ bool AMDGPUPassConfig::addPreRegAlloc() { // SIFixSGPRCopies can generate a lot of duplicate instructions, // so we need to run MachineCSE afterwards. addPass(&MachineCSEID); + addPass(createSIShrinkInstructionsPass()); initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID); } @@ -185,6 +186,7 @@ bool AMDGPUPassConfig::addPreRegAlloc() { bool AMDGPUPassConfig::addPostRegAlloc() { const AMDGPUSubtarget &ST = TM->getSubtarget(); + addPass(createSIShrinkInstructionsPass()); if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createSIInsertWaits(*TM)); } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 4d160826004..49a7f8aa18c 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -48,6 +48,7 @@ add_llvm_target(R600CodeGen SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp + SIShrinkInstructions.cpp SITypeRewriter.cpp ) diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index b9b302957f6..d4cee0d751b 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -288,6 +288,7 @@ class VOPC op, dag ins, string asm, list pattern> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let UseNamedOperandTable = 1; let VOPC = 1; } diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 59f10b6ded1..8c3af77e023 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -1639,3 +1639,12 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); } + +const MachineOperand *SIInstrInfo::getNamedOperand(const MachineInstr& MI, + unsigned OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); + if (Idx == -1) + return nullptr; + + return &MI.getOperand(Idx); +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index e8b6b6d69f8..13ab4843fda 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -174,11 +174,17 @@ public: unsigned SavReg, unsigned IndexReg) const; void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + + /// \brief Returns the operand named \p Op. If \p MI does not have an + /// operand named \c Op, this function returns nullptr. + const MachineOperand *getNamedOperand(const MachineInstr& MI, + unsigned OperandName) const; }; namespace AMDGPU { int getVOPe64(uint16_t Opcode); + int getVOPe32(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); int getMCOpcode(uint16_t Opcode, unsigned Gen); diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 841d037ca2b..0a624a3e13d 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -829,6 +829,15 @@ def getVOPe64 : InstrMapping { let ValueCols = [["8"]]; } +// Maps an opcode in e64 form to its e32 equivalent +def getVOPe32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["4"]]; +} + // Maps an original opcode to its commuted version def getCommuteRev : InstrMapping { let FilterClass = "VOP2_REV"; diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp new file mode 100644 index 00000000000..362a5c1e4e0 --- /dev/null +++ b/lib/Target/R600/SIShrinkInstructions.cpp @@ -0,0 +1,189 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-shrink-instructions" + +STATISTIC(NumInstructionsShrunk, + "Number of 64-bit instruction reduced to 32-bit."); + +namespace llvm { + void initializeSIShrinkInstructionsPass(PassRegistry&); +} + +using namespace llvm; + +namespace { + +class SIShrinkInstructions : public MachineFunctionPass { +public: + static char ID; + +public: + SIShrinkInstructions() : MachineFunctionPass(ID) { + } + + virtual bool runOnMachineFunction(MachineFunction &MF) override; + + virtual const char *getPassName() const override { + return "SI Shrink Instructions"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) +INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) + +char SIShrinkInstructions::ID = 0; + +FunctionPass *llvm::createSIShrinkInstructionsPass() { + return new SIShrinkInstructions(); +} + +static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + if (!MO->isReg()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) + return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); + + return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); +} + +static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + + const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + // Can't shrink instruction with three operands. + if (Src2) + return false; + + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + if (Src1 && (!isVGPR(Src1, TRI, MRI) || Src1Mod->getImm() != 0)) + return false; + + // We don't need to check src0, all input types are legal, so just make + // sure src0 isn't using any modifiers. + const MachineOperand *Src0Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + if (Src0Mod && Src0Mod->getImm() != 0) + return false; + + // Check output modifiers + const MachineOperand *Omod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (Omod && Omod->getImm() != 0) + return false; + + const MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); + return !Clamp || Clamp->getImm() == 0; +} + +bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = static_cast( + MF.getTarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + std::vector I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); + + if (Op32 == -1) + continue; + + if (!canShrink(MI, TII, TRI, MRI)) { + // Try commtuing the instruction and see if that enables us to shrink + // it. + if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + !canShrink(MI, TII, TRI, MRI)) + continue; + } + + if (TII->isVOPC(Op32)) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because the register allocator + // has trouble with sequences like this, which cause the allocator + // to run out of registes if vreg0 and vreg1 belong to the VCCReg + // register class: + // vreg0 = VOPC; + // vreg1 = VOPC; + // S_AND_B64 vreg0, vreg1 + // + // So, instead of forcing the instruction to write to VCC, we provide a + // hint to the register allocator to use VCC and then we + // we will run this pass again after RA and shrink it if it outpus to + // VCC. + MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + continue; + } + if (DstReg != AMDGPU::VCC) + continue; + } + + // We can shrink this instruction + DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << "\n";); + + MachineInstrBuilder MIB = + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); + + // dst + MIB.addOperand(MI.getOperand(0)); + + MIB.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = + TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + MIB.addOperand(*Src1); + + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + + DEBUG(dbgs() << "e32 MI = "; MI.dump(); dbgs() << "\n";); + ++NumInstructionsShrunk; + MI.eraseFromParent(); + } + } + return false; +} diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll index bbfe856fc93..d18702a1de9 100644 --- a/test/CodeGen/R600/bfi_int.ll +++ b/test/CodeGen/R600/bfi_int.ll @@ -38,7 +38,7 @@ entry: ; R600-CHECK: @bfi_sha256_ma ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W -; SI-CHECK: V_XOR_B32_e64 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}} +; SI-CHECK: V_XOR_B32_e32 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}} ; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}} define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll index 15be8e162f3..22a3022145f 100644 --- a/test/CodeGen/R600/ctpop.ll +++ b/test/CodeGen/R600/ctpop.ll @@ -43,7 +43,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali ; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0 ; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]] ; SI-NOT: ADD -; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; SI: BUFFER_STORE_DWORD [[RESULT]], ; SI: S_ENDPGM diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll index bcc7a8c8567..8cbe9f68664 100644 --- a/test/CodeGen/R600/fcmp64.ll +++ b/test/CodeGen/R600/fcmp64.ll @@ -53,7 +53,7 @@ define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1, } ; CHECK: @fne_f64 -; CHECK: V_CMP_NEQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: V_CMP_NEQ_F64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll index e90e7886a6a..cc942c10a91 100644 --- a/test/CodeGen/R600/seto.ll +++ b/test/CodeGen/R600/seto.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: @main -;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0 +;CHECK: V_CMP_O_F32_e32 vcc, {{[sv][0-9]+, v[0-9]+}} define void @main(float %p) { main_body: diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll index 3b1db8b062b..33007fc754b 100644 --- a/test/CodeGen/R600/setuo.ll +++ b/test/CodeGen/R600/setuo.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: @main -;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0 +;CHECK: V_CMP_U_F32_e32 vcc, {{[sv][0-9]+, v[0-9]+}} define void @main(float %p) { main_body: -- 2.11.0