OSDN Git Service

[AMDGPU] Add options for waitcnt pass debugging; add instr count in debug output.
author: Mark Searles <m.c.searles@gmail.com>
Thu, 7 Dec 2017 20:36:39 +0000 (20:36 +0000)
committer: Mark Searles <m.c.searles@gmail.com>
Thu, 7 Dec 2017 20:36:39 +0000 (20:36 +0000)
-amdgpu-waitcnt-forcezero={1|0}  Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-amdgpu-waitcnt-forceexp=<n>  Force emission of an s_waitcnt expcnt(0) before the first <n> instrs
-amdgpu-waitcnt-forcelgkm=<n> Force emission of an s_waitcnt lgkmcnt(0) before the first <n> instrs
-amdgpu-waitcnt-forcevm=<n>   Force emission of an s_waitcnt vmcnt(0) before the first <n> instrs

Differential Revision: https://reviews.llvm.org/D40091

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320084 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/AMDGPU/SIInsertWaitcnts.cpp
test/CodeGen/AMDGPU/waitcnt-debug.mir [new file with mode: 0644]

index 6bbe597..ea6391a 100644 (file)
 
 using namespace llvm;
 
+static cl::opt<unsigned> ForceZeroFlag( // Debug knob; runOnMachineFunction copies it into the pass's ForceZero member.
+  "amdgpu-waitcnt-forcezero",
+  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
+  cl::init(0), cl::Hidden); // Defaults to 0 (off); cl::Hidden keeps it out of the regular -help listing.
+
+static cl::opt<unsigned> ForceExpFlag( // Debug knob; runOnMachineFunction copies it into ForceSwaitcnt[EXP_CNT].
+  "amdgpu-waitcnt-forceexp",
+  cl::desc("Force emit a s_waitcnt expcnt(0) before the first <n> instrs"),
+  cl::init(0), cl::Hidden); // Defaults to 0, i.e. no forced expcnt waits.
+
+static cl::opt<unsigned> ForceLgkmFlag( // Debug knob; runOnMachineFunction copies it into ForceSwaitcnt[LGKM_CNT].
+  "amdgpu-waitcnt-forcelgkm",
+  cl::desc("Force emit a s_waitcnt lgkmcnt(0) before the first <n> instrs"),
+  cl::init(0), cl::Hidden); // Defaults to 0, i.e. no forced lgkmcnt waits.
+
+static cl::opt<unsigned> ForceVmFlag( // Debug knob; runOnMachineFunction copies it into ForceSwaitcnt[VM_CNT].
+  "amdgpu-waitcnt-forcevm",
+  cl::desc("Force emit a s_waitcnt vmcnt(0) before the first <n> instrs"),
+  cl::init(0), cl::Hidden); // Defaults to 0, i.e. no forced vmcnt waits.
+
 namespace {
 
 // Class of object that encapsulates latest instruction counter score
@@ -373,6 +393,10 @@ private:
 
   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
 
+  int32_t InstCnt = 0; // Running instruction number; pre-incremented when insertWaitcntInBlock prints "Instr<N>: " in its DEBUG output.
+  bool ForceZero = false; // Overwritten from the -amdgpu-waitcnt-forcezero option at the start of runOnMachineFunction.
+  int32_t ForceSwaitcnt[NUM_INST_CNTS]; // Per-counter count of waits still to force; no in-class initializer, filled from the force{vm,exp,lgkm} options in runOnMachineFunction.
+
 public:
   static char ID;
 
@@ -397,6 +421,14 @@ public:
         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
   }
 
+  bool ForceEmit() const { // Returns true while at least one counter still has a positive ForceSwaitcnt entry.
+    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; // Walk every counter type from VM_CNT up to (excluding) NUM_INST_CNTS.
+         T = (enum InstCounterType)(T + 1)) // Enums have no built-in ++, so step the integer value and cast back.
+      if (ForceSwaitcnt[T] > 0)
+        return true;
+    return false; // No counter is currently being forced.
+  }
+
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
                                            BlockWaitcntBrackets *ScoreBrackets);
@@ -1023,9 +1055,6 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     } // End of for loop that looks at all dest operands.
   }
 
-  // TODO: Tie force zero to a compiler triage option.
-  bool ForceZero = false;
-
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
   // occurs before the instruction. Doing it here prevents any additional
   // S_WAITCNTs from being emitted if the instruction was marked as
@@ -1058,7 +1087,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
   }
 
   // Does this operand processing indicate s_wait counter update?
-  if (EmitSwaitcnt) {
+  if (EmitSwaitcnt || ForceEmit()) {
     int CntVal[NUM_INST_CNTS];
 
     bool UseDefaultWaitcntStrategy = true;
@@ -1099,7 +1128,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     }
 
     // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitSwaitcnt != 0) {
+    if (EmitSwaitcnt != 0 || ForceEmit()) {
       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
       if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
@@ -1135,11 +1164,31 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
         CompilerGeneratedWaitcntSet.insert(SWaitInst);
       }
 
+      if (!EmitSwaitcnt) {
+        for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+             T = (enum InstCounterType)(T + 1)) {
+          if (ForceSwaitcnt[T] > 0 ) {
+            DEBUG(dbgs() << "ForceSwaitcnt[" << T << "]: "
+                  << ForceSwaitcnt[T] << '\n';);
+          }
+        }
+      }
+
       const MachineOperand &Op =
           MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
-              IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
+              IV,
+              (ForceSwaitcnt[VM_CNT]   > 0) ? 0 : CntVal[VM_CNT],
+              (ForceSwaitcnt[EXP_CNT]  > 0) ? 0 : CntVal[EXP_CNT],
+              (ForceSwaitcnt[LGKM_CNT] > 0) ? 0 : CntVal[LGKM_CNT]));
       SWaitInst->addOperand(MF, Op);
 
+      if (!EmitSwaitcnt) {
+        for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+             T = (enum InstCounterType)(T + 1)) {
+          --ForceSwaitcnt[T];
+        }
+      }
+
       if (CntVal[EXP_CNT] == 0) {
         ScoreBrackets->setMixedExpTypes(false);
       }
@@ -1518,7 +1567,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
 
   DEBUG({
-    dbgs() << "Block" << Block.getNumber();
+    dbgs() << "*** Block" << Block.getNumber() << " ***";
     ScoreBrackets->dump();
   });
 
@@ -1591,7 +1640,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       DEBUG({ SWaitInst->print(dbgs() << '\n'); });
     }
     DEBUG({
-      Inst.print(dbgs());
+      dbgs() << "Instr" << ++InstCnt << ": " << Inst;
       ScoreBrackets->dump();
     });
 
@@ -1696,6 +1745,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   AMDGPUASI = ST->getAMDGPUAS();
 
+  ForceZero = ForceZeroFlag;
+  ForceSwaitcnt[VM_CNT] = ForceVmFlag;
+  ForceSwaitcnt[EXP_CNT] = ForceExpFlag;
+  ForceSwaitcnt[LGKM_CNT] = ForceLgkmFlag;
+
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
diff --git a/test/CodeGen/AMDGPU/waitcnt-debug.mir b/test/CodeGen/AMDGPU/waitcnt-debug.mir
new file mode 100644 (file)
index 0000000..86430fc
--- /dev/null
@@ -0,0 +1,41 @@
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcelgkm=1 -o - %s | FileCheck -check-prefixes=GCN,LGKM %s
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forceexp=2 -o - %s | FileCheck -check-prefixes=GCN,EXP %s
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcevm=3 -o - %s | FileCheck -check-prefixes=GCN,VM %s
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcezero=1 -amdgpu-waitcnt-forcevm=2 -o - %s | FileCheck -check-prefixes=GCN,ZERO %s
+
+# Check that the waitcnt pass debugging options (-amdgpu-waitcnt-forcezero, -forceexp, -forcelgkm, -forcevm) force insertion of S_WAITCNT instructions as expected.
+
+...
+# GCN-LABEL: name: waitcnt-debug
+# LGKM: S_WAITCNT 127
+# LGKM-NEXT: S_NOP 0
+# LGKM-NEXT: S_NOP 0
+
+# EXP: S_WAITCNT 3855
+# EXP-NEXT: S_NOP 0
+# EXP-NEXT: S_WAITCNT 3855
+# EXP-NEXT: S_NOP 0
+
+# VM: S_WAITCNT 3952
+# VM-NEXT: S_NOP 0
+# VM-NEXT: S_WAITCNT 3952
+# VM-NEXT: S_NOP 0
+# VM-NEXT: S_WAITCNT 3952
+# VM-NEXT: S_NOP 0
+
+# ZERO: S_WAITCNT 0
+# ZERO-NEXT: S_WAITCNT 0
+# ZERO-NEXT: S_NOP 0
+# ZERO-NEXT: S_WAITCNT 0
+# ZERO-NEXT: S_NOP 0
+
+name:            waitcnt-debug
+liveins:
+body:             |
+  bb.0:
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+...
+