From d189a0407d891815692e9c6096770ec471245666 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 15 Sep 2014 15:41:53 +0000 Subject: [PATCH] R600/SI: Add preliminary support for flat address space git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217777 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.td | 7 +- lib/Target/R600/AMDGPUAsmPrinter.cpp | 14 ++ lib/Target/R600/AMDGPUAsmPrinter.h | 5 + lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 76 ++++++++- lib/Target/R600/AMDGPUInstrInfo.h | 2 +- lib/Target/R600/AMDGPUInstructions.td | 46 ++++++ lib/Target/R600/AMDGPUMachineFunction.cpp | 4 +- lib/Target/R600/AMDGPUMachineFunction.h | 3 + lib/Target/R600/AMDGPUSubtarget.cpp | 4 +- lib/Target/R600/AMDGPUSubtarget.h | 5 + lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 21 +++ lib/Target/R600/SIDefines.h | 3 +- lib/Target/R600/SIInstrFormats.td | 38 ++++- lib/Target/R600/SIInstrInfo.cpp | 8 + lib/Target/R600/SIInstrInfo.h | 1 + lib/Target/R600/SIInstrInfo.td | 25 +++ lib/Target/R600/SIInstructions.td | 106 +++++++++++++ lib/Target/R600/SILowerControlFlow.cpp | 46 +++++- lib/Target/R600/SIRegisterInfo.cpp | 23 +++ lib/Target/R600/SIRegisterInfo.td | 14 +- test/CodeGen/R600/flat-address-space.ll | 182 ++++++++++++++++++++++ 21 files changed, 622 insertions(+), 11 deletions(-) create mode 100644 test/CodeGen/R600/flat-address-space.ll diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 5645f1a2322..0bff35e4dee 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -81,6 +81,11 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug", "true", "GPU has CF_ALU bug">; +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + class SubtargetFeatureFetchLimit : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", @@ -135,7 +140,7 @@ def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64]>; + FeatureWavefrontSize64, FeatureFlatAddressSpace]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 845a46b1e63..2755af28688 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -240,6 +240,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; + bool FlatUsed = false; const SIRegisterInfo *RI = static_cast( TM.getSubtargetImpl()->getRegisterInfo()); @@ -262,6 +263,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, reg == AMDGPU::VCC_HI) { VCCUsed = true; continue; + } else if (reg == AMDGPU::FLAT_SCR || + reg == AMDGPU::FLAT_SCR_LO || + reg == AMDGPU::FLAT_SCR_HI) { + FlatUsed = true; + continue; } switch (reg) { @@ -322,6 +328,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 2; + if (FlatUsed) + MaxSGPR += 2; + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
ProgInfo.NumVGPR = MaxVGPR + 1; @@ -340,6 +349,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; } @@ -402,6 +413,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); + + // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. } else { OutStreamer.EmitIntValue(RsrcReg, 4); OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 226fca949ea..b9a07679af0 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -33,6 +33,8 @@ private: DebugMode(0), IEEEMode(0), ScratchSize(0), + FlatUsed(false), + VCCUsed(false), CodeLen(0) {} // Fields set in PGM_RSRC1 pm4 packet. @@ -46,7 +48,10 @@ private: uint32_t IEEEMode; uint32_t ScratchSize; + bool FlatUsed; + // Bonus information for debugging. + bool VCCUsed; uint64_t CodeLen; }; diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index ed86a18e5cb..4fa576f8055 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -65,6 +65,7 @@ private: static bool checkPrivateAddress(const MachineMemOperand *Op); static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); static bool isLocalStore(const StoreSDNode *N); static bool isRegionStore(const StoreSDNode *N); @@ -72,6 +73,7 @@ private: bool isCPLoad(const LoadSDNode *N) const; bool isConstantLoad(const LoadSDNode *N, int cbID) const; bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; bool isParamLoad(const LoadSDNode *N) const; bool isPrivateLoad(const LoadSDNode *N) const; bool isLocalLoad(const LoadSDNode *N) const; @@ -104,6 +106,7 @@ private: bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; + SDNode *SelectAddrSpaceCast(SDNode *N); bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; @@ -484,6 +487,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::DIV_SCALE: { return SelectDIV_SCALE(N); } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); } return SelectCode(N); } @@ -522,6 +527,10 @@ bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } @@ -553,6 +562,10 @@ bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return 
checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
 bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
 }
@@ -582,10 +595,11 @@ bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
   const Value *MemVal = N->getMemOperand()->getValue();
   if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
-      !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
+      !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
     return true;
   }
   return false;
@@ -1005,6 +1019,66 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   return false;
 }
 
+// FIXME: This is incorrect and only enough to be able to compile.
+SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+  AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
+  SDLoc DL(N);
+
+  assert(Subtarget.hasFlatAddressSpace() &&
+         "addrspacecast only supported with flat address space!");
+
+  assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+          ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
+         "Cannot cast address space to / from constant address!");
+
+  assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+          ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
+         "Can only cast to / from flat address space!");
+
+  // The flat instructions read the address as the index of the VGPR holding the
+  // address, so casting should just be reinterpreting the base VGPR, so just
+  // insert trunc / bitcast / zext.
+
+  SDValue Src = ASC->getOperand(0);
+  EVT DestVT = ASC->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  unsigned SrcSize = SrcVT.getSizeInBits();
+  unsigned DestSize = DestVT.getSizeInBits();
+
+  if (SrcSize > DestSize) {
+    assert(SrcSize == 64 && DestSize == 32);
+    return CurDAG->getMachineNode(
+      TargetOpcode::EXTRACT_SUBREG,
+      DL,
+      DestVT,
+      Src,
+      CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32));
+  }
+
+
+  if (DestSize > SrcSize) {
+    assert(SrcSize == 32 && DestSize == 64);
+
+    SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+
+    const SDValue Ops[] = {
+      RC,
+      Src,
+      CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+                                     CurDAG->getConstant(0, MVT::i32)), 0),
+      CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+    };
+
+    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+                                  SDLoc(N), N->getValueType(0), Ops);
+  }
+
+  assert(SrcSize == 64 && DestSize == 64);
+  return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 9054ec66942..fa56977be84 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -95,6 +95,7 @@ protected:
                                       MachineInstr *MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr *LoadMI) const override;
+public:
   /// \returns the smallest register index that will be accessed by an indirect
   /// read or write or -1 if indirect addressing is not used by this program.
   int getIndirectIndexBegin(const MachineFunction &MF) const;
 
@@ -103,7 +104,6 @@
   /// read or write or -1 if indirect addressing is not used by this program.
   int getIndirectIndexEnd(const MachineFunction &MF) const;
 
-public:
   bool canFoldMemoryOperand(const MachineInstr *MI,
                             const SmallVectorImpl<unsigned> &Ops) const override;
   bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index cf3bffac968..d152c884522 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -195,6 +195,14 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
   return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
   return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
@@ -223,6 +231,14 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
   return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
   return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
@@ -248,6 +264,11 @@ def az_extloadi32_global : PatFrag<(ops node:$ptr),
   return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def az_extloadi32_flat : PatFrag<(ops node:$ptr),
+                                 (az_extloadi32 node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi32_constant : PatFrag<(ops node:$ptr),
                                      (az_extloadi32 node:$ptr), [{
   return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
@@ -263,6 +284,16 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
   return isGlobalStore(dyn_cast<StoreSDNode>(N));
 }]>;
 
+def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
+                                (truncstorei8 node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
+                                 (truncstorei16 node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
 def local_store : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return isLocalStore(dyn_cast<StoreSDNode>(N));
 }]>;
@@ -318,6 +349,7 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr),
   return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
+
 def atomic_cmp_swap_32_local :
   PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
           (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
@@ -334,6 +366,20 @@ def atomic_cmp_swap_64_local :
          AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;
 
+def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def flat_store : PatFrag<(ops node:$val, node:$ptr),
+                         (store node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
+                         (AMDGPUstore_mskor node:$val, node:$ptr), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
+}]>;
+
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 90af80113ec..0f3f9e26528 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++
b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -12,7 +12,9 @@ void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), - LDSSize(0) { + LDSSize(0), + ScratchSize(0), + IsKernel(true) { AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h index 75ce13df2a1..886fb1b1fc3 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -33,6 +33,9 @@ public: unsigned getShaderType() const { return ShaderType; } + + unsigned ScratchSize; + bool IsKernel; }; } diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 1bce85987ff..bcafee51ae8 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -77,14 +77,14 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), CaymanISA(false), - EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), InstrItins(getInstrItineraryForCPU(GPU)) { - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(TM)); diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 0a5e598c5f6..679797219dc 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -56,6 +56,7 @@ private: bool FP64Denormals; bool FP32Denormals; bool CaymanISA; + bool FlatAddressSpace; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; @@ -124,6 +125,10 @@ public: return FP64Denormals; } + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 7451ff53eba..d766b1002ea 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -98,6 +98,27 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { case AMDGPU::M0: O << "m0"; return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; + return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; default: break; } diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 7679bee9424..0a247379e9b 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -22,7 +22,8 @@ enum { VOPC = 1 << 8, SALU = 1 << 9, MUBUF = 1 << 10, - MTBUF = 1 << 11 + MTBUF = 1 << 11, + FLAT = 1 << 12 }; } diff --git 
a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index e12e7998986..32b8069140c 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -26,6 +26,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   field bits<1> SALU = 0;
   field bits<1> MUBUF = 0;
   field bits<1> MTBUF = 0;
+  field bits<1> FLAT = 0;
 
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = VM_CNT;
@@ -40,6 +41,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   let TSFlags{9} = SALU;
   let TSFlags{10} = MUBUF;
   let TSFlags{11} = MTBUF;
+  let TSFlags{12} = FLAT;
 }
 
 class Enc32 {
@@ -425,8 +427,27 @@ class MIMGe <bits<7> op> : Enc64 {
   let Inst{57-53} = SSAMP{6-2};
 }
 
-class EXPe : Enc64 {
+class FLATe <bits<7> op> : Enc64 {
+  bits<8> addr;
+  bits<8> data;
+  bits<8> vdst;
+  bits<1> slc;
+  bits<1> glc;
+  bits<1> tfe;
+
+  // 15-0 is reserved.
+  let Inst{16} = glc;
+  let Inst{17} = slc;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x37; // Encoding.
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data;
+  // 54-48 is reserved.
+  let Inst{55} = tfe;
+  let Inst{63-56} = vdst;
+}
+
+class EXPe : Enc64 {
   bits<4> EN;
   bits<6> TGT;
   bits<1> COMPR;
@@ -533,6 +554,21 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let UseNamedOperandTable = 1;
 }
 
+class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, FLATe <op> {
+  let FLAT = 1;
+  // Internally, FLAT instructions are executed as both an LDS and a
+  // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
+  // and are not considered done until both have been decremented.
+  let VM_CNT = 1;
+  let LGKM_CNT = 1;
+
+  let Uses = [EXEC, FLAT_SCR]; // M0
+
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+}
+
 class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
     InstSI <outs, ins, asm, pattern>, MIMGe <op> {
 
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index b103ceff103..03e25e60f94 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -638,6 +638,10 @@ bool SIInstrInfo::isMTBUF(uint16_t Opcode) const {
   return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
 }
 
+bool SIInstrInfo::isFLAT(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+}
+
 bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
   return get(Opcode).TSFlags & SIInstrFlags::VOP1;
 }
@@ -843,6 +847,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
       if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
         ++ConstantBusCount;
 
+      // FLAT_SCR is just an SGPR pair.
+      if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
+        ++ConstantBusCount;
+
       // SGPRs use the constant bus
       if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
           (!MO.isImplicit() &&
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index ed043ac9cba..e1b00388d14 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -108,6 +108,7 @@ public:
   bool isSMRD(uint16_t Opcode) const;
   bool isMUBUF(uint16_t Opcode) const;
   bool isMTBUF(uint16_t Opcode) const;
+  bool isFLAT(uint16_t Opcode) const;
   bool isVOP1(uint16_t Opcode) const;
   bool isVOP2(uint16_t Opcode) const;
   bool isVOP3(uint16_t Opcode) const;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 2d172c381fe..2d91d496c28 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -209,6 +209,7 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
 def SIOperand {
   int ZERO = 0x80;
   int VCC = 0x6A;
+  int FLAT_SCR = 0x68;
 }
 
 def SRCMODS {
@@ -1063,6 +1064,30 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
   }
 }
 
+class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
+      FLAT <op, (outs regClass:$data),
+                (ins VReg_64:$addr),
+            asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> {
+  let glc = 0;
+  let slc = 0;
+  let tfe = 0;
+  let mayLoad = 1;
+}
+
+class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+      FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr),
+            name#" $data, $addr, [M0, FLAT_SCRATCH]", []> {
+
+  let mayLoad = 0;
+  let mayStore = 1;
+
+  // Encoding
+  let glc = 0;
+  let slc = 0;
+  let tfe = 0;
+}
+
 class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
   op,
   (outs regClass:$dst),
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 8886061dde8..8082d3254ef 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -31,6 +31,7 @@ def isSI : Predicate<"Subtarget.getGeneration() "
 def isCI : Predicate<"Subtarget.getGeneration() "
                      ">= AMDGPUSubtarget::SEA_ISLANDS">;
 
+def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
 
 def isCFDepth0 : Predicate<"isCFDepth0()">;
 
@@ -1044,6 +1045,80 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "IMAGE_SAMPLE_C_CD_CL_O"
 //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
 
 //===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasFlatAddressSpace] in {
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "FLAT_LOAD_UBYTE", VReg_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "FLAT_LOAD_SBYTE", VReg_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "FLAT_LOAD_USHORT", VReg_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "FLAT_LOAD_SSHORT", VReg_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "FLAT_LOAD_DWORD", VReg_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "FLAT_LOAD_DWORDX2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "FLAT_LOAD_DWORDX4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "FLAT_LOAD_DWORDX3", VReg_96>;
+
+def FLAT_STORE_BYTE : FLAT_Store_Helper <
+  0x00000018, "FLAT_STORE_BYTE", VReg_32
+>;
+
+def FLAT_STORE_SHORT : FLAT_Store_Helper <
+  0x0000001a, "FLAT_STORE_SHORT", VReg_32
+>;
+
+def FLAT_STORE_DWORD : FLAT_Store_Helper <
+  0x0000001c, "FLAT_STORE_DWORD", VReg_32
+>;
+
+def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+  0x0000001d, "FLAT_STORE_DWORDX2", VReg_64
+>;
+
+def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+  0x0000001e, "FLAT_STORE_DWORDX4", VReg_128
+>;
+
+def
FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x0000001e, "FLAT_STORE_DWORDX3", VReg_96 +>; + +//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "FLAT_ATOMIC_SWAP", []>; +//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "FLAT_ATOMIC_CMPSWAP", []>; +//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "FLAT_ATOMIC_ADD", []>; +//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "FLAT_ATOMIC_SUB", []>; +//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "FLAT_ATOMIC_RSUB", []>; +//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "FLAT_ATOMIC_SMIN", []>; +//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "FLAT_ATOMIC_UMIN", []>; +//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "FLAT_ATOMIC_SMAX", []>; +//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "FLAT_ATOMIC_UMAX", []>; +//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "FLAT_ATOMIC_AND", []>; +//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "FLAT_ATOMIC_OR", []>; +//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "FLAT_ATOMIC_XOR", []>; +//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "FLAT_ATOMIC_INC", []>; +//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "FLAT_ATOMIC_DEC", []>; +//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "FLAT_ATOMIC_FCMPSWAP", []>; +//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "FLAT_ATOMIC_FMIN", []>; +//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "FLAT_ATOMIC_FMAX", []>; +//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "FLAT_ATOMIC_SWAP_X2", []>; +//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "FLAT_ATOMIC_CMPSWAP_X2", []>; +//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "FLAT_ATOMIC_ADD_X2", []>; +//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "FLAT_ATOMIC_SUB_X2", []>; +//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "FLAT_ATOMIC_RSUB_X2", []>; +//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "FLAT_ATOMIC_SMIN_X2", []>; +//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "FLAT_ATOMIC_UMIN_X2", []>; +//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "FLAT_ATOMIC_SMAX_X2", []>; +//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "FLAT_ATOMIC_UMAX_X2", []>; +//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "FLAT_ATOMIC_AND_X2", []>; +//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "FLAT_ATOMIC_OR_X2", []>; +//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "FLAT_ATOMIC_XOR_X2", []>; +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "FLAT_ATOMIC_INC_X2", []>; +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "FLAT_ATOMIC_DEC_X2", []>; +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "FLAT_ATOMIC_FCMPSWAP_X2", []>; +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "FLAT_ATOMIC_FMIN_X2", []>; +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "FLAT_ATOMIC_FMAX_X2", []>; + +} // End HasFlatAddressSpace predicate +//===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -2822,6 +2897,37 @@ defm V_MAD_I64_I32 : VOP3Inst <0x00000177, "V_MAD_I64_I32", } // End iSCI +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +class FLATLoad_Pattern : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr) +>; + +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; + +class FLATStore_Pattern : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr) + >; + +def : FLATStore_Pattern ; +def 
: FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; /********** ====================== **********/ /********** Indirect adressing **********/ diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 97c706b0655..59270ee062e 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -52,6 +52,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -451,6 +452,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { bool HaveKill = false; bool NeedM0 = false; bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -467,6 +469,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { NeedWQM = true; } + // Flat uses m0 in case it needs to access LDS. + if (TII->isFLAT(MI.getOpcode())) { + NeedM0 = true; + NeedFlat = true; + } + switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: @@ -532,7 +540,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::V_INTERP_MOV_F32: NeedWQM = true; break; - } } } @@ -550,5 +557,42 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { AMDGPU::EXEC).addReg(AMDGPU::EXEC); } + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. 
+ MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + return true; } diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 8663df88922..823c9e90c5d 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -33,6 +33,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + Reserved.set(AMDGPU::FLAT_SCR); return Reserved; } @@ -246,6 +247,28 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, default: llvm_unreachable("Invalid SubIdx for VCC"); } break; + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; } unsigned Index = getHWRegIndex(Reg); diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 83806775253..64f9ca41187 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -39,6 +39,16 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { def SCC : SIReg<"SCC", 253>; def M0 : SIReg <"M0", 124>; +def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. +def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. + +// Pair to indicate location of scratch space for flat accesses. +def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + // SGPR registers foreach Index = 0-101 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; @@ -167,13 +177,13 @@ def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>; // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, - (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI) + (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>; def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, - (add SGPR_64, VCCReg, EXECReg) + (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) >; def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; diff --git a/test/CodeGen/R600/flat-address-space.ll b/test/CodeGen/R600/flat-address-space.ll new file mode 100644 index 00000000000..3f32e4dbc8b --- /dev/null +++ b/test/CodeGen/R600/flat-address-space.ll @@ -0,0 +1,182 @@ +; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. 
+ + +; CHECK-LABEL: @branch_use_flat_i32: +; CHECK: FLAT_STORE_DWORD {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, [M0, FLAT_SCRATCH] +; CHECK: S_ENDPGM +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; These testcases might become useless when there are optimizations to +; remove generic pointers. + +; CHECK-LABEL: @store_flat_i32: +; CHECK: V_MOV_B32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} +; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK: FLAT_STORE_DWORD v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr, align 4 + ret void +} + +; CHECK-LABEL: @store_flat_i64: +; CHECK: FLAT_STORE_DWORDX2 +define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* + store i64 %x, i64 addrspace(4)* %fptr, align 8 + ret void +} + +; CHECK-LABEL: @store_flat_v4i32: +; CHECK: FLAT_STORE_DWORDX4 +define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* + store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 + ret void +} + +; CHECK-LABEL: @store_flat_trunc_i16: +; CHECK: FLAT_STORE_SHORT +define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %y = trunc i32 %x to i16 + store i16 %y, i16 addrspace(4)* %fptr, align 2 + ret void +} + +; CHECK-LABEL: @store_flat_trunc_i8: +; CHECK: FLAT_STORE_BYTE +define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %y = trunc i32 %x to i8 + store i8 %y, i8 addrspace(4)* %fptr, align 2 + ret void +} + + + +; CHECK-LABEL @load_flat_i32: +; CHECK: FLAT_LOAD_DWORD +define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + %fload = load i32 addrspace(4)* %fptr, align 4 + store i32 %fload, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @load_flat_i64: +; CHECK: FLAT_LOAD_DWORDX2 +define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* + %fload = load i64 addrspace(4)* %fptr, align 4 + store i64 %fload, i64 addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL @load_flat_v4i32: +; CHECK: FLAT_LOAD_DWORDX4 +define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* + %fload = load <4 x i32> addrspace(4)* %fptr, align 4 
+ store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL @sextload_flat_i8: +; CHECK: FLAT_LOAD_SBYTE +define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fload = load i8 addrspace(4)* %fptr, align 4 + %ext = sext i8 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @zextload_flat_i8: +; CHECK: FLAT_LOAD_UBYTE +define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fload = load i8 addrspace(4)* %fptr, align 4 + %ext = zext i8 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @sextload_flat_i16: +; CHECK: FLAT_LOAD_SSHORT +define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fload = load i16 addrspace(4)* %fptr, align 4 + %ext = sext i16 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @zextload_flat_i16: +; CHECK: FLAT_LOAD_USHORT +define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fload = load i16 addrspace(4)* %fptr, align 4 + %ext = zext i16 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. +; CHECK-LABEL: @store_flat_scratch: +; CHECK: S_MOVK_I32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: S_MOVK_I32 flat_scratch_hi, 40 +; CHECK-PROMOTE: S_MOVK_I32 flat_scratch_hi, 0 +; CHECK: FLAT_STORE_DWORD +; CHECK: S_BARRIER +; CHECK: FLAT_LOAD_DWORD +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind noduplicate } +attributes #3 = { nounwind readnone } -- 2.11.0
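For reference, a minimal standalone IR sketch (not part of the patch) of the pattern the new FLAT selection is meant to handle; it mirrors the test above and assumes a CI target as in the RUN lines (e.g. llc -march=r600 -mcpu=bonaire). The store and load through the addrspace(4) pointer are expected to select to FLAT_STORE_DWORD and FLAT_LOAD_DWORD rather than the MUBUF forms.

; Round-trip a value through a flat pointer derived from a global pointer.
define void @flat_roundtrip(i32 addrspace(1)* %gptr, i32 %x) {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
  store i32 %x, i32 addrspace(4)* %fptr, align 4
  %v = load i32 addrspace(4)* %fptr, align 4
  store i32 %v, i32 addrspace(1)* %gptr, align 4
  ret void
}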