//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <new>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

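// CFStack models the R600 hardware control-flow stack while the pass walks a
// function.  Structured control flow (if/else and loops) consumes stack space
// on the GPU: WQM pushes take a full entry, other pushes take sub-entries of
// varying size (see getSubEntrySize).  The deepest point reached is kept in
// MaxStackSize so the pass can report it as the STACK_SIZE of the shader.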
struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries = 0;
  unsigned CurrentSubEntries = 0;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

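// Some R600-family parts need a software work-around when an ALU clause that
// pushes the stack runs while the stack is already in use: Cayman mishandles
// CF_ALU_PUSH_BEFORE inside nested loops (loop depth > 1), and parts with the
// CF ALU bug mishandle several CF_ALU_* opcodes once enough sub-entries are
// live.  When this returns true, the caller emits an explicit CF_PUSH_EG and
// downgrades the instruction to a plain CF_ALU (see runOnMachineFunction).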
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying this
      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
      // resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

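// A full stack entry is consumed for every ENTRY item plus one for every
// group of four sub-entry units currently live.  For example, with
// CurrentEntries == 2 and CurrentSubEntries == 5, alignTo(5, 4) / 4 == 2,
// so the stack is four entries deep at this point.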
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
                                             // See comment in
                                             // CFStack::getSubEntrySize()
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII = nullptr;
  const R600RegisterInfo *TRI = nullptr;
  unsigned MaxFetchInst;
  const R600Subtarget *ST = nullptr;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

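  // Map one of the pass's generic ControlFlowInstruction values to the
  // concrete opcode for the current subtarget: R600/R700 and Evergreen-class
  // parts use different encodings, and Cayman additionally has its own
  // CF_END.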
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    // Initialize to 0 (no register) so an instruction without register
    // operands cannot alias a clause member by accident.
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
                                          E = MI.operands_end();
         I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    // The instruction fits in the clause if its source does not read a
    // 128-bit register that an earlier clause member already wrote.
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

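  // Collect a run of consecutive texture- or vertex-fetch instructions
  // starting at I into one fetch clause, headed by a new CF_TC or CF_VC
  // instruction.  The clause is cut when it reaches the subtarget's fetch
  // clause size limit, when the cache kind changes, or when a fetch would
  // read a register written earlier in the same clause.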
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }

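  // Rewrite every ALU_LITERAL_X source of MI to one of the four per-group
  // literal slots (X/Y/Z/W).  Identical immediates share a slot; each new
  // literal claims the next free one, and an instruction group can hold at
  // most four.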
  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          llvm::find_if(Lits, [&](MachineOperand *val) {
            return val->isImm() && (val->getImm() == Imm);
          });

      // Get corresponding Operand
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse existing literal reg
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate new literal reg
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

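  // Gather the ALU instructions (or bundles) that follow the CF_ALU clause
  // head at I into one ALU clause.  Bundles are unbundled, literal uses are
  // assigned to literal slots, and the literal values are materialized as
  // AMDGPU::LITERALS pseudo instructions carrying two values each.  Operand 7
  // of the clause head holds the instruction count, so it is set to the final
  // clause size minus one.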
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
            MILit.addImm(Literals[i]->getImm());
        } else {
            MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                   Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }

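  // Once the final CF address (CfCount) of a clause is known, patch the
  // clause head's address operand and splice the clause body to its emission
  // point.  Each fetch instruction occupies two CF-sized words, hence the
  // 2 * size() adjustment below; ALU instructions occupy one word each.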
  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       const DebugLoc &DL, ClauseFile &Clause,
                       unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
                     ClauseFile &Clause, unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

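  // Branch-like CF instructions are emitted with a placeholder in operand 0;
  // once the real target address is known, these helpers add it to whatever
  // offset the placeholder already carries.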
  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

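      // Walk the block and turn each control-flow pseudo into its hardware
      // form.  CfCount is the running address (in CF words) of the next CF
      // instruction: JUMP/ELSE targets are back-patched when the matching
      // ENDIF is reached, and loop BREAK/CONTINUE targets when the matching
      // ENDLOOP is reached, via CounterPropagateAddr.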
      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);

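          // Intentional fall-through: in both cases the instruction still
          // heads an ALU clause and is handled like CF_ALU below.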
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, DL, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, DL, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
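      // An ENDIF whose region ends on a CF_ALU clause does not need a
      // separate POP: replace that CF_ALU with CF_ALU_POP_AFTER (same
      // operands) so the pop is folded into the clause itself.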
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->CFStackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  StringRef getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

FunctionPass *llvm::createR600ControlFlowFinalizer() {
  return new R600ControlFlowFinalizer();
}