AMDGPU: Add 32-bit constant address space

author Marek Olsak <marek.olsak@amd.com>

Wed, 7 Feb 2018 16:01:00 +0000 (16:01 +0000)

committer Marek Olsak <marek.olsak@amd.com>

Wed, 7 Feb 2018 16:01:00 +0000 (16:01 +0000)
author Marek Olsak <marek.olsak@amd.com>
Wed, 7 Feb 2018 16:01:00 +0000 (16:01 +0000)
committer Marek Olsak <marek.olsak@amd.com>
Wed, 7 Feb 2018 16:01:00 +0000 (16:01 +0000)
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst

index ff22f2c..21e9308 100644 (file)
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -285,6 +285,7 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR).
       3                  Local (group/LDS) Local (group/LDS) Local (group/LDS) Local (group/LDS)
       4                  Generic (Flat)    Region (GDS)      Region (GDS)      Constant
       5                  Region (GDS)      Private (Scratch) Private (Scratch) Private (Scratch)
+     6                  Constant 32-bit   Constant 32-bit   Constant 32-bit   Constant 32-bit
       ================== ================= ================= ================= =================
  
  Current Default
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h

index 0ddc43a..0b590c3 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -224,6 +224,9 @@ struct AMDGPUAS {
      GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
      CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
      LOCAL_ADDRESS = 3,    ///< Address space for local memory.
+
+    CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+
      /// Address space for direct addressible parameter memory (CONST0)
      PARAM_D_ADDRESS = 6,
      /// Address space for indirect addressible parameter memory (VTX1)
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp

index 392b011..fa52bbb 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
                                              bool OrLocal) {
    const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
  
-  if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+  if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
+      Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
      return true;
    }
  
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

index b17b671..0c30f05 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -466,7 +466,8 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  }
  
  bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst  &I) {
-  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
        canWidenScalarExtLoad(I)) {
      IRBuilder<> Builder(&I);
      Builder.SetCurrentDebugLocation(I.getDebugLoc());
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

index 440f8b2..192d4b0 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -162,6 +162,7 @@ private:
  
    bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                          bool &Imm) const;
+  SDValue Expand32BitAddress(SDValue Addr) const;
    bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                    bool &Imm) const;
    bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -636,7 +637,8 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
    if (!N->readMem())
      return false;
    if (CbId == -1)
-    return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
+    return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+           N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
  
    return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
  }
@@ -1438,19 +1440,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
    return true;
  }
  
+SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
+  if (Addr.getValueType() != MVT::i32)
+    return Addr; // Only i32 addresses are widened; others pass through.
+
+  // Widen the i32 address to i64; the high half is not necessarily zero.
+  SDLoc SL(Addr);
+
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned AddrHiVal = Info->get32BitAddressHighBits(); // 0 => zero-extend.
+  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
+
+  const SDValue Ops[] = {
+    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+    Addr, // Low 32 bits (sub0).
+    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+            0), // High 32 bits (sub1), materialized by S_MOV_B32.
+    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+  };
+
+  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+                                        Ops), 0);
+}
+
  bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset, bool &Imm) const {
    SDLoc SL(Addr);
+
    if (CurDAG->isBaseWithConstantOffset(Addr)) {
      SDValue N0 = Addr.getOperand(0);
      SDValue N1 = Addr.getOperand(1);
  
      if (SelectSMRDOffset(N1, Offset, Imm)) {
-      SBase = N0;
+      SBase = Expand32BitAddress(N0);
        return true;
      }
    }
-  SBase = Addr;
+  SBase = Expand32BitAddress(Addr);
    Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
    Imm = true;
    return true;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

index b7f65c2..7cb6ef0 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -229,6 +229,9 @@ static bool isInstrUniform(const MachineInstr &MI) {
        isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
      return true;
  
+  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+    return true;
+
    const Instruction *I = dyn_cast<Instruction>(Ptr);
    return I && I->getMetadata("amdgpu.uniform");
  }
@@ -293,7 +296,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
    if (!I.hasOneMemOperand())
      return false;
  
-  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+      (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
      return false;
  
    if (!isInstrUniform(I))
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index b5d43af..1af1e10 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -266,7 +266,7 @@ static StringRef computeDataLayout(const Triple &TT) {
  
    // 32-bit private, local, region, and 32-bit constant (p6) pointers. 64-bit
    // global, constant and flat.
-    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
+    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32"
           "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
  }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

index 3ad099c..00ff030 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -237,6 +237,7 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
    AMDGPUAS AS = ST->getAMDGPUAS();
    if (AddrSpace == AS.GLOBAL_ADDRESS ||
        AddrSpace == AS.CONSTANT_ADDRESS ||
+      AddrSpace == AS.CONSTANT_ADDRESS_32BIT ||
        AddrSpace == AS.FLAT_ADDRESS)
      return 128;
    if (AddrSpace == AS.LOCAL_ADDRESS ||
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 83fe7e3..6361c2c 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -900,7 +900,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
    if (AS == AMDGPUASI.GLOBAL_ADDRESS)
      return isLegalGlobalAddressingMode(AM);
  
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
      // If the offset isn't a multiple of 4, it probably isn't going to be
      // correctly aligned.
      // FIXME: Can we get the real alignment here?
@@ -1023,7 +1024,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
      // If we have an uniform constant load, it still requires using a slow
      // buffer instruction if unaligned.
      if (IsFast) {
-      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
+      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
+                 AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
          (Align % 4 == 0) : true;
      }
  
@@ -1066,7 +1068,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
  static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
    return AS == AMDGPUASI.GLOBAL_ADDRESS ||
           AS == AMDGPUASI.FLAT_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS;
+         AS == AMDGPUASI.CONSTANT_ADDRESS ||
+         AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
  }
  
  bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -4008,13 +4011,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
  
  bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
    const Triple &TT = getTargetMachine().getTargetTriple();
-  return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
           AMDGPU::shouldEmitConstantsToTextSection(TT);
  }
  
  bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
    return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-              GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
           !shouldEmitFixup(GV) &&
           !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
  }
@@ -4391,7 +4396,8 @@ bool
  SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    // We can fold offsets for anything that doesn't require a GOT relocation.
    return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-              GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
           !shouldEmitGOTReloc(GA->getGlobal());
  }
  
@@ -4444,6 +4450,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
    const GlobalValue *GV = GSD->getGlobal();
  
    if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+      GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
        GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
        // FIXME: It isn't correct to rely on the type of the pointer. This should
        // be removed when address space 0 is 64-bit.
@@ -5378,7 +5385,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
           AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
  
    unsigned NumElements = MemVT.getVectorNumElements();
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
      if (isMemOpUniform(Load))
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
@@ -5386,7 +5394,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
      // loads.
      //
    }
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUASI.GLOBAL_ADDRESS) {
      if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
          !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
        return SDValue();
@@ -5395,7 +5405,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
      // loads.
      //
    }
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUASI.GLOBAL_ADDRESS ||
        AS == AMDGPUASI.FLAT_ADDRESS) {
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

index 888d8f9..2534ad0 100644 (file)
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,7 +47,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
      WorkItemIDZ(false),
      ImplicitBufferPtr(false),
      ImplicitArgPtr(false),
-    GITPtrHigh(0xffffffff) {
+    GITPtrHigh(0xffffffff),
+    HighBitsOf32BitAddress(0) {
    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    const Function &F = MF.getFunction();
    FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -164,6 +165,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
    StringRef S = A.getValueAsString();
    if (!S.empty())
      S.consumeInteger(0, GITPtrHigh);
+
+  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
+  S = A.getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, HighBitsOf32BitAddress);
  }
  
  unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h

index 63875c5..6eed4fc 100644 (file)
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -186,6 +186,8 @@ private:
    // current hardware only allows a 16 bit value.
    unsigned GITPtrHigh;
  
+  unsigned HighBitsOf32BitAddress;
+
    MCPhysReg getNextUserSGPR() const {
      assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
      return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -411,6 +413,10 @@ public:
      return GITPtrHigh;
    }
  
+  unsigned get32BitAddressHighBits() const {
+    return HighBitsOf32BitAddress; // Upper 32 bits for widened 32-bit addresses.
+  }
+
    unsigned getNumUserSGPRs() const {
      return NumUserSGPRs;
    }
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td

index 8f34798..7ee0af0 100644 (file)
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -223,7 +223,8 @@ def S_MEMREALTIME   : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
  def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
    auto Ld = cast<LoadSDNode>(N);
    return Ld->getAlignment() >= 4  &&
-    ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+    (((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+       Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
      (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
      !Ld->isVolatile() &&
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

index 50311c2..0367ce7 100644 (file)
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -447,7 +447,8 @@ bool isGlobalSegment(const GlobalValue *GV) {
  }
  
  bool isReadOnlySegment(const GlobalValue *GV) {
-  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+         GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  
  bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -916,6 +917,9 @@ bool isUniformMMO(const MachineMemOperand *MMO) {
        isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
      return true;
  
+  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+    return true;
+
    if (const Argument *Arg = dyn_cast<Argument>(Ptr))
      return isArgPassedInSGPR(Arg);
  
diff --git a/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/test/CodeGen/AMDGPU/constant-address-space-32bit.ll

new file mode 100644 (file)

index 0000000..61ad224
--- /dev/null
+++ b/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -0,0 +1,288 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
+
+; GCN-LABEL: {{^}}load_i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
+define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr i32, i32 addrspace(6)* %p1, i64 2
+  %r0 = load i32, i32 addrspace(6)* %p0
+  %r1 = load i32, i32 addrspace(6)* %gep1
+  %r = add i32 %r0, %r1
+  %r2 = bitcast i32 %r to float
+  ret float %r2
+}
+
+; GCN-LABEL: {{^}}load_v2i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
+define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <2 x i32>, <2 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0
+  %r1 = load <2 x i32>, <2 x i32> addrspace(6)* %gep1
+  %r = add <2 x i32> %r0, %r1
+  %r2 = bitcast <2 x i32> %r to <2 x float>
+  ret <2 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_v4i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
+define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0
+  %r1 = load <4 x i32>, <4 x i32> addrspace(6)* %gep1
+  %r = add <4 x i32> %r0, %r1
+  %r2 = bitcast <4 x i32> %r to <4 x float>
+  ret <4 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_v8i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
+define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <8 x i32>, <8 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0
+  %r1 = load <8 x i32>, <8 x i32> addrspace(6)* %gep1
+  %r = add <8 x i32> %r0, %r1
+  %r2 = bitcast <8 x i32> %r to <8 x float>
+  ret <8 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_v16i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
+define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <16 x i32>, <16 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0
+  %r1 = load <16 x i32>, <16 x i32> addrspace(6)* %gep1
+  %r = add <16 x i32> %r0, %r1
+  %r2 = bitcast <16 x i32> %r to <16 x float>
+  ret <16 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
+define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr float, float addrspace(6)* %p1, i64 2
+  %r0 = load float, float addrspace(6)* %p0
+  %r1 = load float, float addrspace(6)* %gep1
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+; GCN-LABEL: {{^}}load_v2float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
+define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <2 x float>, <2 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0
+  %r1 = load <2 x float>, <2 x float> addrspace(6)* %gep1
+  %r = fadd <2 x float> %r0, %r1
+  ret <2 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_v4float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
+define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0
+  %r1 = load <4 x float>, <4 x float> addrspace(6)* %gep1
+  %r = fadd <4 x float> %r0, %r1
+  ret <4 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_v8float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
+define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <8 x float>, <8 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0
+  %r1 = load <8 x float>, <8 x float> addrspace(6)* %gep1
+  %r = fadd <8 x float> %r0, %r1
+  ret <8 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_v16float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
+define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <16 x float>, <16 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0
+  %r1 = load <16 x float>, <16 x float> addrspace(6)* %gep1
+  %r = fadd <16 x float> %r0, %r1
+  ret <16 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_i32_hi0:
+; GCN: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hi0(i32 addrspace(6)* inreg %p) #1 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_i32_hi1:
+; GCN: s_mov_b32 s1, 1
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hi1(i32 addrspace(6)* inreg %p) #2 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_i32_hiffff8000:
+; GCN: s_movk_i32 s1, 0x8000
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hiffff8000(i32 addrspace(6)* inreg %p) #3 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_i32_hifffffff0:
+; GCN: s_mov_b32 s1, -16
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_sampler:
+; GCN: v_readfirstlane_b32
+; GCN-NEXT: v_readfirstlane_b32
+; SI: s_nop
+; GCN-NEXT: s_load_dwordx8
+; GCN-NEXT: s_load_dwordx4
+; GCN: image_sample
+define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+main_body:
+  %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
+  %23 = bitcast float %22 to i32
+  %24 = shl i32 %23, 1
+  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24, !amdgpu.uniform !0
+  %26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0
+  %27 = shl i32 %23, 2
+  %28 = or i32 %27, 3
+  %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*
+  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28, !amdgpu.uniform !0
+  %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0
+  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
+  %33 = extractelement <4 x float> %32, i32 0
+  %34 = extractelement <4 x float> %32, i32 1
+  %35 = extractelement <4 x float> %32, i32 2
+  %36 = extractelement <4 x float> %32, i32 3
+  %37 = bitcast float %4 to i32
+  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
+  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
+  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
+  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
+  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
+  %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
+  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
+}
+
+; GCN-LABEL: {{^}}load_sampler_nouniform:
+; GCN: v_readfirstlane_b32
+; GCN-NEXT: v_readfirstlane_b32
+; SI: s_nop
+; GCN-NEXT: s_load_dwordx8
+; GCN-NEXT: s_load_dwordx4
+; GCN: image_sample
+define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+main_body:
+  %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
+  %23 = bitcast float %22 to i32
+  %24 = shl i32 %23, 1
+  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24
+  %26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0
+  %27 = shl i32 %23, 2
+  %28 = or i32 %27, 3
+  %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*
+  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28
+  %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0
+  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
+  %33 = extractelement <4 x float> %32, i32 0
+  %34 = extractelement <4 x float> %32, i32 1
+  %35 = extractelement <4 x float> %32, i32 2
+  %36 = extractelement <4 x float> %32, i32 3
+  %37 = bitcast float %4 to i32
+  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
+  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
+  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
+  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
+  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
+  %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
+  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7
+
+
+!0 = !{}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-32bit-address-high-bits"="0" }
+attributes #2 = { nounwind "amdgpu-32bit-address-high-bits"="1" }
+attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" }
+attributes #4 = { nounwind "amdgpu-32bit-address-high-bits"="0xfffffff0" }
+attributes #5 = { "InitialPSInputAddr"="45175" }
+attributes #6 = { nounwind readnone speculatable }
+attributes #7 = { nounwind readonly }
+attributes #8 = { nounwind readnone }
author	Marek Olsak <marek.olsak@amd.com>
	Wed, 7 Feb 2018 16:01:00 +0000 (16:01 +0000)
committer	Marek Olsak <marek.olsak@amd.com>
	Wed, 7 Feb 2018 16:01:00 +0000 (16:01 +0000)
docs/AMDGPUUsage.rst		patch \| blob \| history
lib/Target/AMDGPU/AMDGPU.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIMachineFunctionInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIMachineFunctionInfo.h		patch \| blob \| history
lib/Target/AMDGPU/SMInstructions.td		patch \| blob \| history
lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/constant-address-space-32bit.ll	[new file with mode: 0644]	patch \| blob