"true",
"GPU has CF_ALU bug">;
+def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
+ "FlatAddressSpace",
+ "true",
+ "Support flat address space">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64]>;
+ FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
+ bool FlatUsed = false;
const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
TM.getSubtargetImpl()->getRegisterInfo());
reg == AMDGPU::VCC_HI) {
VCCUsed = true;
continue;
+ } else if (reg == AMDGPU::FLAT_SCR ||
+ reg == AMDGPU::FLAT_SCR_LO ||
+ reg == AMDGPU::FLAT_SCR_HI) {
+ FlatUsed = true;
+ continue;
}
switch (reg) {
if (VCCUsed)
MaxSGPR += 2;
+ if (FlatUsed)
+ MaxSGPR += 2;
+
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+ ProgInfo.FlatUsed = FlatUsed;
+ ProgInfo.VCCUsed = VCCUsed;
ProgInfo.CodeLen = CodeSize;
}
OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
+
+ // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
+ // 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer.EmitIntValue(RsrcReg, 4);
OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
DebugMode(0),
IEEEMode(0),
ScratchSize(0),
+ FlatUsed(false),
+ VCCUsed(false),
CodeLen(0) {}
// Fields set in PGM_RSRC1 pm4 packet.
uint32_t IEEEMode;
uint32_t ScratchSize;
+ bool FlatUsed;
+
// Bonus information for debugging.
+ bool VCCUsed;
uint64_t CodeLen;
};
static bool checkPrivateAddress(const MachineMemOperand *Op);
static bool isGlobalStore(const StoreSDNode *N);
+ static bool isFlatStore(const StoreSDNode *N);
static bool isPrivateStore(const StoreSDNode *N);
static bool isLocalStore(const StoreSDNode *N);
static bool isRegionStore(const StoreSDNode *N);
bool isCPLoad(const LoadSDNode *N) const;
bool isConstantLoad(const LoadSDNode *N, int cbID) const;
bool isGlobalLoad(const LoadSDNode *N) const;
+ bool isFlatLoad(const LoadSDNode *N) const;
bool isParamLoad(const LoadSDNode *N) const;
bool isPrivateLoad(const LoadSDNode *N) const;
bool isLocalLoad(const LoadSDNode *N) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
+ SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
case AMDGPUISD::DIV_SCALE: {
return SelectDIV_SCALE(N);
}
+ case ISD::ADDRSPACECAST:
+ return SelectAddrSpaceCast(N);
}
return SelectCode(N);
}
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
}
+bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
}
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
}
+bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
}
const Value *MemVal = N->getMemOperand()->getValue();
if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
+ !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
return true;
}
return false;
return false;
}
+// FIXME: This is incorrect and only enough to be able to compile.
+SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+ AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
+ SDLoc DL(N);
+
+ assert(Subtarget.hasFlatAddressSpace() &&
+ "addrspacecast only supported with flat address space!");
+
+ assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
+ "Cannot cast address space to / from constant address!");
+
+ assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
+ "Can only cast to / from flat address space!");
+
+  // The flat instructions take their address in VGPRs, so an address space
+  // cast only needs to reinterpret the pointer value; just insert a
+  // trunc / zext / bitcast as appropriate.
+
+ SDValue Src = ASC->getOperand(0);
+ EVT DestVT = ASC->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned DestSize = DestVT.getSizeInBits();
+
+ if (SrcSize > DestSize) {
+ assert(SrcSize == 64 && DestSize == 32);
+ return CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG,
+ DL,
+ DestVT,
+ Src,
+ CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32));
+ }
+
+
+ if (DestSize > SrcSize) {
+ assert(SrcSize == 32 && DestSize == 64);
+
+ SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+
+ const SDValue Ops[] = {
+ RC,
+ Src,
+ CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+ CurDAG->getConstant(0, MVT::i32)), 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+ };
+
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ SDLoc(N), N->getValueType(0), Ops);
+ }
+
+ assert(SrcSize == 64 && DestSize == 64);
+ return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const override;
+public:
/// \returns the smallest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexBegin(const MachineFunction &MF) const;
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
-public:
bool canFoldMemoryOperand(const MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops) const override;
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def az_extloadi32_flat : PatFrag<(ops node:$ptr),
+ (az_extloadi32 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi32_constant : PatFrag<(ops node:$ptr),
(az_extloadi32 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
return isGlobalStore(dyn_cast<StoreSDNode>(N));
}]>;
+def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei8 node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei16 node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
def local_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isLocalStore(dyn_cast<StoreSDNode>(N));
return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
+
def atomic_cmp_swap_32_local :
PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def flat_store : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
+ (AMDGPUstore_mskor node:$val, node:$ptr), [{
+ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
+}]>;
+
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
//===----------------------------------------------------------------------===//
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
ShaderType(ShaderType::COMPUTE),
- LDSSize(0) {
+ LDSSize(0),
+ ScratchSize(0),
+ IsKernel(true) {
AttributeSet Set = MF.getFunction()->getAttributes();
Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
ShaderTypeAttribute);
unsigned getShaderType() const {
return ShaderType;
}
+
+ unsigned ScratchSize;
+ bool IsKernel;
};
}
DumpCode(false), R600ALUInst(false), HasVertexCache(false),
TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
FP64Denormals(false), FP32Denormals(false), CaymanISA(false),
- EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
+ FlatAddressSpace(false), EnableIRStructurizer(true),
+ EnablePromoteAlloca(false), EnableIfCvt(true),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16, // Maximum stack alignment (long16)
0),
InstrItins(getInstrItineraryForCPU(GPU)) {
-
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
InstrInfo.reset(new R600InstrInfo(*this));
TLInfo.reset(new R600TargetLowering(TM));
bool FP64Denormals;
bool FP32Denormals;
bool CaymanISA;
+ bool FlatAddressSpace;
bool EnableIRStructurizer;
bool EnablePromoteAlloca;
bool EnableIfCvt;
return FP64Denormals;
}
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
case AMDGPU::M0:
O << "m0";
return;
+ case AMDGPU::FLAT_SCR:
+ O << "flat_scratch";
+ return;
+ case AMDGPU::VCC_LO:
+ O << "vcc_lo";
+ return;
+ case AMDGPU::VCC_HI:
+ O << "vcc_hi";
+ return;
+ case AMDGPU::EXEC_LO:
+ O << "exec_lo";
+ return;
+ case AMDGPU::EXEC_HI:
+ O << "exec_hi";
+ return;
+ case AMDGPU::FLAT_SCR_LO:
+ O << "flat_scratch_lo";
+ return;
+ case AMDGPU::FLAT_SCR_HI:
+ O << "flat_scratch_hi";
+ return;
default:
break;
}
VOPC = 1 << 8,
SALU = 1 << 9,
MUBUF = 1 << 10,
- MTBUF = 1 << 11
+ MTBUF = 1 << 11,
+ FLAT = 1 << 12
};
}
field bits<1> SALU = 0;
field bits<1> MUBUF = 0;
field bits<1> MTBUF = 0;
+ field bits<1> FLAT = 0;
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{9} = SALU;
let TSFlags{10} = MUBUF;
let TSFlags{11} = MTBUF;
+ let TSFlags{12} = FLAT;
}
class Enc32 {
let Inst{57-53} = SSAMP{6-2};
}
-class EXPe : Enc64 {
+class FLATe<bits<7> op> : Enc64 {
+ bits<8> addr;
+ bits<8> data;
+ bits<8> vdst;
+ bits<1> slc;
+ bits<1> glc;
+ bits<1> tfe;
+ // 15-0 is reserved.
+ let Inst{16} = glc;
+ let Inst{17} = slc;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x37; // Encoding.
+ let Inst{39-32} = addr;
+ let Inst{47-40} = data;
+ // 54-48 is reserved.
+ let Inst{55} = tfe;
+ let Inst{63-56} = vdst;
+}
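+// A compact recap of the FLAT encoding above (unlisted bits are reserved):
+//   [63:56] vdst  [55] tfe  [47:40] data  [39:32] addr
+//   [31:26] encoding (0x37)  [24:18] op  [17] slc  [16] glc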
+
+class EXPe : Enc64 {
bits<4> EN;
bits<6> TGT;
bits<1> COMPR;
let UseNamedOperandTable = 1;
}
+class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern>, FLATe <op> {
+ let FLAT = 1;
+  // Internally, FLAT instructions are executed as both an LDS and a Buffer
+  // instruction, so they increment both VM_CNT and LGKM_CNT and are not
+  // considered done until both counters have been decremented.
+ let VM_CNT = 1;
+ let LGKM_CNT = 1;
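+  // Consequently, a consumer of a FLAT result has to wait for both counters
+  // (e.g. s_waitcnt vmcnt(0) lgkmcnt(0)) before the loaded value can be used.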
+
+  let Uses = [EXEC, FLAT_SCR]; // M0 may also be needed when LDS is accessed.
+
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0;
+}
+
class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern>, MIMGe <op> {
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
}
+bool SIInstrInfo::isFLAT(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+}
+
bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP1;
}
if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
++ConstantBusCount;
+ // FLAT_SCR is just an SGPR pair.
+ if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
+ ++ConstantBusCount;
+
// SGPRs use the constant bus
if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
(!MO.isImplicit() &&
bool isSMRD(uint16_t Opcode) const;
bool isMUBUF(uint16_t Opcode) const;
bool isMTBUF(uint16_t Opcode) const;
+ bool isFLAT(uint16_t Opcode) const;
bool isVOP1(uint16_t Opcode) const;
bool isVOP2(uint16_t Opcode) const;
bool isVOP3(uint16_t Opcode) const;
def SIOperand {
int ZERO = 0x80;
int VCC = 0x6A;
+ int FLAT_SCR = 0x68;
}
def SRCMODS {
}
}
+class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
+ FLAT <op, (outs regClass:$data),
+ (ins VReg_64:$addr),
+ asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> {
+ let glc = 0;
+ let slc = 0;
+ let tfe = 0;
+ let mayLoad = 1;
+}
+
+class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+ FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr),
+ name#" $data, $addr, [M0, FLAT_SCRATCH]",
+ []> {
+
+ let mayLoad = 0;
+ let mayStore = 1;
+
+ // Encoding
+ let glc = 0;
+ let slc = 0;
+ let tfe = 0;
+}
+
class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs regClass:$dst),
def isCI : Predicate<"Subtarget.getGeneration() "
">= AMDGPUSubtarget::SEA_ISLANDS">;
+def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
def isCFDepth0 : Predicate<"isCFDepth0()">;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
//===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasFlatAddressSpace] in {
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "FLAT_LOAD_UBYTE", VReg_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "FLAT_LOAD_SBYTE", VReg_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "FLAT_LOAD_USHORT", VReg_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "FLAT_LOAD_SSHORT", VReg_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "FLAT_LOAD_DWORD", VReg_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "FLAT_LOAD_DWORDX2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "FLAT_LOAD_DWORDX4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "FLAT_LOAD_DWORDX3", VReg_96>;
+
+def FLAT_STORE_BYTE : FLAT_Store_Helper <
+ 0x00000018, "FLAT_STORE_BYTE", VReg_32
+>;
+
+def FLAT_STORE_SHORT : FLAT_Store_Helper <
+ 0x0000001a, "FLAT_STORE_SHORT", VReg_32
+>;
+
+def FLAT_STORE_DWORD : FLAT_Store_Helper <
+ 0x0000001c, "FLAT_STORE_DWORD", VReg_32
+>;
+
+def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+ 0x0000001d, "FLAT_STORE_DWORDX2", VReg_64
+>;
+
+def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+ 0x0000001e, "FLAT_STORE_DWORDX4", VReg_128
+>;
+
+def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
+  0x0000001f, "FLAT_STORE_DWORDX3", VReg_96
+>;
+
+//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "FLAT_ATOMIC_SWAP", []>;
+//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "FLAT_ATOMIC_CMPSWAP", []>;
+//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "FLAT_ATOMIC_ADD", []>;
+//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "FLAT_ATOMIC_SUB", []>;
+//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "FLAT_ATOMIC_RSUB", []>;
+//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "FLAT_ATOMIC_SMIN", []>;
+//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "FLAT_ATOMIC_UMIN", []>;
+//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "FLAT_ATOMIC_SMAX", []>;
+//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "FLAT_ATOMIC_UMAX", []>;
+//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "FLAT_ATOMIC_AND", []>;
+//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "FLAT_ATOMIC_OR", []>;
+//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "FLAT_ATOMIC_XOR", []>;
+//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "FLAT_ATOMIC_INC", []>;
+//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "FLAT_ATOMIC_DEC", []>;
+//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "FLAT_ATOMIC_FCMPSWAP", []>;
+//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "FLAT_ATOMIC_FMIN", []>;
+//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "FLAT_ATOMIC_FMAX", []>;
+//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "FLAT_ATOMIC_SWAP_X2", []>;
+//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "FLAT_ATOMIC_CMPSWAP_X2", []>;
+//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "FLAT_ATOMIC_ADD_X2", []>;
+//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "FLAT_ATOMIC_SUB_X2", []>;
+//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "FLAT_ATOMIC_RSUB_X2", []>;
+//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "FLAT_ATOMIC_SMIN_X2", []>;
+//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "FLAT_ATOMIC_UMIN_X2", []>;
+//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "FLAT_ATOMIC_SMAX_X2", []>;
+//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "FLAT_ATOMIC_UMAX_X2", []>;
+//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "FLAT_ATOMIC_AND_X2", []>;
+//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "FLAT_ATOMIC_OR_X2", []>;
+//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "FLAT_ATOMIC_XOR_X2", []>;
+//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "FLAT_ATOMIC_INC_X2", []>;
+//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "FLAT_ATOMIC_DEC_X2", []>;
+//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "FLAT_ATOMIC_FCMPSWAP_X2", []>;
+//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "FLAT_ATOMIC_FMIN_X2", []>;
+//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "FLAT_ATOMIC_FMAX_X2", []>;
+
+} // End HasFlatAddressSpace predicate
+//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
} // End iSCI
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
+ PatFrag flat_ld> :
+ Pat <(vt (flat_ld i64:$ptr)),
+ (Instr_ADDR64 $ptr)
+>;
+
+def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
+
+class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
+ Pat <(st vt:$value, i64:$ptr),
+ (Instr $value, $ptr)
+ >;
+
+def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
+def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
+def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
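+
+// With the patterns above, a flat i32 store such as
+//   store i32 %x, i32 addrspace(4)* %ptr
+// should select to FLAT_STORE_DWORD (register assignment is left to the
+// allocator); the load patterns work the same way for flat loads.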
/********** ====================== **********/
/********** Indirect adressing **********/
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
bool HaveKill = false;
bool NeedM0 = false;
bool NeedWQM = false;
+ bool NeedFlat = false;
unsigned Depth = 0;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
NeedWQM = true;
}
+ // Flat uses m0 in case it needs to access LDS.
+ if (TII->isFLAT(MI.getOpcode())) {
+ NeedM0 = true;
+ NeedFlat = true;
+ }
+
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
case AMDGPU::V_INTERP_MOV_F32:
NeedWQM = true;
break;
-
}
}
}
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}
+ // FIXME: This seems inappropriate to do here.
+ if (NeedFlat && MFI->IsKernel) {
+ // Insert the prologue initializing the SGPRs pointing to the scratch space
+ // for flat accesses.
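+    // A rough sketch of the emitted sequence (the immediates depend on the
+    // function's frame):
+    //   s_movk_i32 flat_scratch_lo, <scratch offset in 256-byte units>
+    //   s_movk_i32 flat_scratch_hi, <per-thread scratch size in bytes>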
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+ // TODO: What to use with function calls?
+
+    // FIXME: This only reports the stack size used in the scratch buffer; it
+    // does not account for scratch used through registers.
+ uint64_t StackSizeBytes = FrameInfo->getStackSize();
+
+ int IndirectBegin
+ = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
+ // Convert register index to 256-byte unit.
+ uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
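+    // For example, with 4 bytes per register, an IndirectBegin of 64 is a
+    // byte offset of 4 * 64 = 256, i.e. one 256-byte unit.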
+
+    assert((StackSizeBytes < 0xffff) && (StackOffset < 0xffff) &&
+           "Stack limits should fit in 16 bits");
+
+ // Initialize the flat scratch register pair.
+ // TODO: Can we use one s_mov_b64 here?
+
+    // Offset is in units of 256 bytes.
+ MachineBasicBlock &MBB = MF.front();
+ DebugLoc NoDL;
+ MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
+ const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
+
+ BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
+ .addImm(StackOffset);
+
+ // Documentation says size is "per-thread scratch size in bytes"
+ BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
+ .addImm(StackSizeBytes);
+ }
+
return true;
}
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+ Reserved.set(AMDGPU::FLAT_SCR);
return Reserved;
}
default: llvm_unreachable("Invalid SubIdx for VCC");
}
break;
+
+ case AMDGPU::FLAT_SCR:
+ switch (Channel) {
+ case 0:
+ return AMDGPU::FLAT_SCR_LO;
+ case 1:
+ return AMDGPU::FLAT_SCR_HI;
+ default:
+ llvm_unreachable("Invalid SubIdx for FLAT_SCR");
+ }
+ break;
+
+ case AMDGPU::EXEC:
+ switch (Channel) {
+ case 0:
+ return AMDGPU::EXEC_LO;
+ case 1:
+ return AMDGPU::EXEC_HI;
+ default:
+ llvm_unreachable("Invalid SubIdx for EXEC");
+ }
+ break;
}
unsigned Index = getHWRegIndex(Reg);
def SCC : SIReg<"SCC", 253>;
def M0 : SIReg <"M0", 124>;
+def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256 bytes.
+def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
+
+// Pair to indicate location of scratch space for flat accesses.
+def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 104;
+}
+
// SGPR registers
foreach Index = 0-101 in {
def SGPR#Index : SIReg <"SGPR"#Index, Index>;
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
- (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI)
+ (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
>;
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
- (add SGPR_64, VCCReg, EXECReg)
+ (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
>;
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
--- /dev/null
+; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+
+; Optimizations are disabled (-O0) so that generic pointer accesses are not
+; specialized away by optimizations added in the future.
+
+
+; CHECK-LABEL: @branch_use_flat_i32:
+; CHECK: FLAT_STORE_DWORD {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, [M0, FLAT_SCRATCH]
+; CHECK: S_ENDPGM
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+ %cmp = icmp ne i32 %c, 0
+ br i1 %cmp, label %local, label %global
+
+local:
+ %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+ br label %end
+
+global:
+ %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+ br label %end
+
+end:
+ %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+ store i32 %x, i32 addrspace(4)* %fptr, align 4
+; %val = load i32 addrspace(4)* %fptr, align 4
+; store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+
+; These testcases might become useless once optimizations that remove
+; generic pointers are added.
+
+; CHECK-LABEL: @store_flat_i32:
+; CHECK: V_MOV_B32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: FLAT_STORE_DWORD v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+ %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+ store i32 %x, i32 addrspace(4)* %fptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @store_flat_i64:
+; CHECK: FLAT_STORE_DWORDX2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+ %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+ store i64 %x, i64 addrspace(4)* %fptr, align 8
+ ret void
+}
+
+; CHECK-LABEL: @store_flat_v4i32:
+; CHECK: FLAT_STORE_DWORDX4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+ %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+ store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+ ret void
+}
+
+; CHECK-LABEL: @store_flat_trunc_i16:
+; CHECK: FLAT_STORE_SHORT
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+ %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+ %y = trunc i32 %x to i16
+ store i16 %y, i16 addrspace(4)* %fptr, align 2
+ ret void
+}
+
+; CHECK-LABEL: @store_flat_trunc_i8:
+; CHECK: FLAT_STORE_BYTE
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+ %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+ %y = trunc i32 %x to i8
+ store i8 %y, i8 addrspace(4)* %fptr, align 2
+ ret void
+}
+
+
+
+; CHECK-LABEL: @load_flat_i32:
+; CHECK: FLAT_LOAD_DWORD
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+ %fload = load i32 addrspace(4)* %fptr, align 4
+ store i32 %fload, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: @load_flat_i64:
+; CHECK: FLAT_LOAD_DWORDX2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+ %fload = load i64 addrspace(4)* %fptr, align 4
+ store i64 %fload, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; CHECK-LABEL: @load_flat_v4i32:
+; CHECK: FLAT_LOAD_DWORDX4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+ %fload = load <4 x i32> addrspace(4)* %fptr, align 4
+ store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i8:
+; CHECK: FLAT_LOAD_SBYTE
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+ %fload = load i8 addrspace(4)* %fptr, align 4
+ %ext = sext i8 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i8:
+; CHECK: FLAT_LOAD_UBYTE
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+ %fload = load i8 addrspace(4)* %fptr, align 4
+ %ext = zext i8 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i16:
+; CHECK: FLAT_LOAD_SSHORT
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+ %fload = load i16 addrspace(4)* %fptr, align 4
+ %ext = sext i16 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i16:
+; CHECK: FLAT_LOAD_USHORT
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+ %fload = load i16 addrspace(4)* %fptr, align 4
+ %ext = zext i16 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+
+; TODO: This should not be zero when registers are used for small
+; scratch allocations again.
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: @store_flat_scratch:
+; CHECK: S_MOVK_I32 flat_scratch_lo, 0
+; CHECK-NO-PROMOTE: S_MOVK_I32 flat_scratch_hi, 40
+; CHECK-PROMOTE: S_MOVK_I32 flat_scratch_hi, 0
+; CHECK: FLAT_STORE_DWORD
+; CHECK: S_BARRIER
+; CHECK: FLAT_LOAD_DWORD
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+ %alloca = alloca i32, i32 9, align 4
+ %x = call i32 @llvm.r600.read.tidig.x() #3
+ %pptr = getelementptr i32* %alloca, i32 %x
+ %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+ store i32 %x, i32 addrspace(4)* %fptr
+ ; Dummy call
+ call void @llvm.AMDGPU.barrier.local() #1
+ %reload = load i32 addrspace(4)* %fptr, align 4
+ store i32 %reload, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.r600.read.tidig.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
+attributes #3 = { nounwind readnone }