[AMDGPU] Insert PS early exit at end of control flow

author Carl Ritson <carl.ritson@amd.com>

Fri, 3 Jul 2020 03:25:33 +0000 (12:25 +0900)

committer Carl Ritson <carl.ritson@amd.com>

Fri, 3 Jul 2020 05:04:34 +0000 (14:04 +0900)
author Carl Ritson <carl.ritson@amd.com>
Fri, 3 Jul 2020 03:25:33 +0000 (12:25 +0900)
committer Carl Ritson <carl.ritson@amd.com>
Fri, 3 Jul 2020 05:04:34 +0000 (14:04 +0900)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp

index d4371ef..052db5f 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -239,7 +239,16 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
        for (auto LiveIn : MBB.liveins())
          SplitBB->addLiveIn(LiveIn);
        MBB.addSuccessor(SplitBB);
-      MDT->addNewBlock(SplitBB, &MBB);
+
+      // Update dominator tree
+      using DomTreeT = DomTreeBase<MachineBasicBlock>;
+      SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+      for (MachineBasicBlock *Succ : SplitBB->successors()) {
+        DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+        DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+      }
+      DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+      MDT->getBase().applyUpdates(DTUpdates);
      }
  
      MBB.addSuccessor(EarlyExitBlock);
@@ -447,6 +456,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
          break;
        }
  
+      case AMDGPU::SI_KILL_CLEANUP:
+        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          KillInstrs.push_back(&MI);
+        } else {
+          MI.eraseFromParent();
+        }
+        break;
+
        default:
          break;
        }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td

index 2b053f8..ec37837 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -379,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
  defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
  defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
  
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
  let Defs = [EXEC,VCC] in
  def SI_ILLEGAL_COPY : SPseudoInstSI <
    (outs unknown:$dst), (ins unknown:$src),
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

index 1e90e6b..36d52ac 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -89,8 +89,10 @@ private:
    MachineRegisterInfo *MRI = nullptr;
    SetVector<MachineInstr*> LoweredEndCf;
    DenseSet<Register> LoweredIf;
+  SmallSet<MachineInstr *, 16> NeedsKillCleanup;
  
    const TargetRegisterClass *BoolRC = nullptr;
+  bool InsertKillCleanups;
    unsigned AndOpc;
    unsigned OrOpc;
    unsigned XorOpc;
@@ -111,6 +113,8 @@ private:
  
    void combineMasks(MachineInstr &MI);
  
+  void process(MachineInstr &MI);
+
    // Skip to the next instruction, ignoring debug instructions, and trivial
    // block boundaries (blocks that have one (typically fallthrough) successor,
    // and the successor has one predecessor.
@@ -160,36 +164,36 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  
  char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
  
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
-                       const SIInstrInfo *TII) {
-  Register SaveExecReg = MI.getOperand(0).getReg();
-  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
-  if (U == MRI->use_instr_nodbg_end() ||
-      std::next(U) != MRI->use_instr_nodbg_end() ||
-      U->getOpcode() != AMDGPU::SI_END_CF)
-    return false;
-
-  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
-  // if there is any such terminator simplififcations are not safe.
-  auto SMBB = MI.getParent();
-  auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
    DenseSet<const MachineBasicBlock*> Visited;
-  SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
-                                              SMBB->succ_end());
+  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+                                               Begin->succ_end());
  
    while (!Worklist.empty()) {
      MachineBasicBlock *MBB = Worklist.pop_back_val();
  
-    if (MBB == EMBB || !Visited.insert(MBB).second)
+    if (MBB == End || !Visited.insert(MBB).second)
        continue;
-    for(auto &Term : MBB->terminators())
+    for (auto &Term : MBB->terminators())
        if (TII->isKillTerminator(Term.getOpcode()))
-        return false;
+        return true;
  
      Worklist.append(MBB->succ_begin(), MBB->succ_end());
    }
  
+  return false;
+}
+
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+  Register SaveExecReg = MI.getOperand(0).getReg();
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+  if (U == MRI->use_instr_nodbg_end() ||
+      std::next(U) != MRI->use_instr_nodbg_end() ||
+      U->getOpcode() != AMDGPU::SI_END_CF)
+    return false;
+
    return true;
  }
  
@@ -207,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
    // If there is only one use of save exec register and that use is SI_END_CF,
    // we can optimize SI_IF by returning the full saved exec mask instead of
    // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI, TII);
+  bool SimpleIf = isSimpleIf(MI, MRI);
+
+  if (InsertKillCleanups) {
+    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+    // flag the associated SI_END_CF for insertion of a kill cleanup.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+      MachineOperand &NextExec = UseMI->getOperand(0);
+      Register NextExecReg = NextExec.getReg();
+      if (NextExec.isDead()) {
+        assert(!SimpleIf);
+        break;
+      }
+      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+    }
+    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+        NeedsKillCleanup.insert(&*UseMI);
+        SimpleIf = false;
+      }
+    }
+  } else if (SimpleIf) {
+    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+    // If there is any such terminator, simplifications are not safe.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+  }
  
    // Add an implicit def of exec to discourage scheduling VALU after this which
    // will interfere with trying to form s_and_saveexec_b64 later.
@@ -427,6 +459,8 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
  
      auto E = B->end();
      for ( ; It != E; ++It) {
+      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+        continue;
        if (TII->mayReadEXEC(*MRI, *It))
          break;
      }
@@ -461,8 +495,18 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  
    LoweredEndCf.insert(NewMI);
  
-  if (LIS)
+  // If this ends control flow which contains kills (as flagged in emitIf)
+  // then insert an SI_KILL_CLEANUP immediately following the exec mask
+  // manipulation.  This can be lowered to early termination if appropriate.
+  MachineInstr *CleanUpMI = nullptr;
+  if (NeedsKillCleanup.count(&MI))
+    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+  if (LIS) {
      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    if (CleanUpMI)
+      LIS->InsertMachineInstrInMaps(*CleanUpMI);
+  }
  
    MI.eraseFromParent();
  
@@ -553,6 +597,56 @@ void SILowerControlFlow::optimizeEndCf() {
    }
  }
  
+void SILowerControlFlow::process(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::iterator I(MI);
+  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_IF:
+    emitIf(MI);
+    break;
+
+  case AMDGPU::SI_ELSE:
+    emitElse(MI);
+    break;
+
+  case AMDGPU::SI_IF_BREAK:
+    emitIfBreak(MI);
+    break;
+
+  case AMDGPU::SI_LOOP:
+    emitLoop(MI);
+    break;
+
+  case AMDGPU::SI_END_CF:
+    emitEndCf(MI);
+    break;
+
+  default:
+    assert(false && "Attempt to process unsupported instruction");
+    break;
+  }
+
+  MachineBasicBlock::iterator Next;
+  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MaskMI = *I;
+    switch (MaskMI.getOpcode()) {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+      // Cleanup bit manipulations on exec mask
+      combineMasks(MaskMI);
+      break;
+    default:
+      I = MBB.end();
+      break;
+    }
+  }
+}
+
  bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    TII = ST.getInstrInfo();
@@ -562,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    LIS = getAnalysisIfAvailable<LiveIntervals>();
    MRI = &MF.getRegInfo();
    BoolRC = TRI->getBoolRC();
+  InsertKillCleanups =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
  
    if (ST.isWave32()) {
      AndOpc = AMDGPU::S_AND_B32;
@@ -583,62 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
      Exec = AMDGPU::EXEC;
    }
  
+  SmallVector<MachineInstr *, 32> Worklist;
+
    MachineFunction::iterator NextBB;
    for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
         BI != BE; BI = NextBB) {
      NextBB = std::next(BI);
      MachineBasicBlock &MBB = *BI;
  
-    MachineBasicBlock::iterator I, Next, Last;
-
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
        Next = std::next(I);
        MachineInstr &MI = *I;
  
        switch (MI.getOpcode()) {
        case AMDGPU::SI_IF:
-        emitIf(MI);
+        process(MI);
          break;
  
        case AMDGPU::SI_ELSE:
-        emitElse(MI);
-        break;
-
        case AMDGPU::SI_IF_BREAK:
-        emitIfBreak(MI);
-        break;
-
        case AMDGPU::SI_LOOP:
-        emitLoop(MI);
-        break;
-
        case AMDGPU::SI_END_CF:
-        emitEndCf(MI);
+        // Only build worklist if SI_IF instructions must be processed first.
+        if (InsertKillCleanups)
+          Worklist.push_back(&MI);
+        else
+          process(MI);
          break;
  
-      case AMDGPU::S_AND_B64:
-      case AMDGPU::S_OR_B64:
-      case AMDGPU::S_AND_B32:
-      case AMDGPU::S_OR_B32:
-        // Cleanup bit manipulations on exec mask
-        combineMasks(MI);
-        Last = I;
-        continue;
-
        default:
-        Last = I;
-        continue;
+        break;
        }
-
-      // Replay newly inserted code to combine masks
-      Next = (Last == MBB.end()) ? MBB.begin() : Last;
      }
    }
  
+  for (MachineInstr *MI : Worklist)
+    process(*MI);
+
    optimizeEndCf();
  
    LoweredEndCf.clear();
    LoweredIf.clear();
+  NeedsKillCleanup.clear();
  
    return true;
  }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll

index a2358f3..172e6bf 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -61,9 +61,11 @@ loop:
    br label %loop
  }
  
-; In case there's an epilog, we shouldn't have to do this.
+; Check that the epilog is the final block
  ; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
  define amdgpu_ps float @return_nonvoid(float %0) #0 {
  main_body:
    %cmp = fcmp olt float %0, 1.000000e+01
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

index fee3158..f178259 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -470,7 +470,11 @@ bb9:                                              ; preds = %bb4
  }
  
  ; CHECK-LABEL: {{^}}cbranch_kill:
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: ; %bb.{{[0-9]+}}: ; %export
+; CHECK-NEXT: s_or_b64
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: exp null off, off, off, off done vm
  define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
  .entry:
    %val0 = extractelement <2 x float> %1, i32 0
author	Carl Ritson <carl.ritson@amd.com>
	Fri, 3 Jul 2020 03:25:33 +0000 (12:25 +0900)
committer	Carl Ritson <carl.ritson@amd.com>
	Fri, 3 Jul 2020 05:04:34 +0000 (14:04 +0900)
llvm/lib/Target/AMDGPU/SIInsertSkips.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll		patch \| blob \| history