AMDGPU: More bits of frame index are known to be zero

author Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 27 Feb 2016 20:26:57 +0000 (20:26 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 27 Feb 2016 20:26:57 +0000 (20:26 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 27 Feb 2016 20:26:57 +0000 (20:26 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 27 Feb 2016 20:26:57 +0000 (20:26 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td

index 0975f97..af12ce7 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -198,14 +198,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
  def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
  def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
  
-
-def FeatureEnableHugeScratchBuffer : SubtargetFeature<
-  "huge-scratch-buffer",
-  "EnableHugeScratchBuffer",
-  "true",
-  "Enable scratch buffer sizes greater than 128 GB"
->;
-
  def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
    "EnableVGPRSpilling",
    "true",
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

index fed46fe..9631915 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -84,7 +84,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
        GCN1Encoding(false), GCN3Encoding(false), CIInsts(false),
        HasSMemRealTime(false), Has16BitInsts(false),
        LDSBankCount(0),
-      IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
+      IsaVersion(ISAVersion0_0_0),
        EnableSIScheduler(false), FrameLowering(nullptr),
        InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
  
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h

index b9874f8..3922894 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -93,7 +93,6 @@ private:
    bool FeatureDisable;
    int LDSBankCount;
    unsigned IsaVersion;
-  bool EnableHugeScratchBuffer;
    bool EnableSIScheduler;
  
    std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
@@ -293,10 +292,6 @@ public:
      return false;
    }
  
-  bool enableHugeScratchBuffer() const {
-    return EnableHugeScratchBuffer;
-  }
-
    bool enableSIScheduler() const {
      return EnableSIScheduler;
    }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 46b73f7..d6a69da 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1178,25 +1178,35 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
    unsigned FrameIndex = FINode->getIndex();
  
-  // A FrameIndex node represents a 32-bit offset into scratch memory.  If
-  // the high bit of a frame index offset were to be set, this would mean
-  // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
-  // scratch buffer, with 64 being the number of threads per wave.
+  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+  // high bit of a frame index offset were to be set, this would mean that it
+  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
+  // buffer, with 64 being the number of threads per wave.
    //
-  // If we know the machine uses less than 128GB of scratch, then we can
-  // amrk the high bit of the FrameIndex node as known zero,
-  // which is important, because it means in most situations we can
-  // prove that values derived from FrameIndex nodes are non-negative.
-  // This enables us to take advantage of more addressing modes when
-  // accessing scratch buffers, since for scratch reads/writes, the register
-  // offset must always be positive.
+  // The maximum private allocation for the entire GPU is 4G, and we are
+  // concerned with the largest the index could ever be for an individual
+  // workitem. This will occur with the minmum dispatch size. If a program
+  // requires more, the dispatch size will be reduced.
+  //
+  // With this limit, we can mark the high bit of the FrameIndex node as known
+  // zero, which is important, because it means in most situations we can prove
+  // that values derived from FrameIndex nodes are non-negative. This enables us
+  // to take advantage of more addressing modes when accessing scratch buffers,
+  // since for scratch reads/writes, the register offset must always be
+  // positive.
  
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  if (Subtarget->enableHugeScratchBuffer())
-    return TFI;
+  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+
+  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+  // granularity. It is probably a full wave.
+  uint64_t MinGranularity = 32;
  
+  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
    return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                    DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+                     DAG.getValueType(ExtVT));
  }
  
  bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll

index 6366697..7251f0c 100644 (file)
--- a/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/test/CodeGen/AMDGPU/private-element-size.ll
@@ -33,9 +33,9 @@
  ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
  
  ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
  define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -99,10 +99,14 @@ entry:
  ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
  ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
  
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
  define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll

index d43de47..c992eb8 100644 (file)
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,7 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
  
  ; When a frame index offset is more than 12-bits, make sure we don't store
  ; it in mubuf's offset field.
@@ -102,8 +100,7 @@ entry:
  }
  
  ; GCN-LABEL: @pos_vaddr_offse
-; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
-; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
  define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
  entry:
    %array = alloca [8192 x i32]
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 27 Feb 2016 20:26:57 +0000 (20:26 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 27 Feb 2016 20:26:57 +0000 (20:26 +0000)
lib/Target/AMDGPU/AMDGPU.td		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUSubtarget.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUSubtarget.h		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/private-element-size.ll		patch \| blob \| history
test/CodeGen/AMDGPU/scratch-buffer.ll		patch \| blob \| history