Subzero: Update for LLVM 3.9 (trunk).

[android-x86/external-swiftshader.git] / src / IceCfgNode.cpp
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp

index d269ee2..78e3d18 100644 (file)
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -6,124 +6,182 @@
  // License. See LICENSE.TXT for details.
  //
  //===----------------------------------------------------------------------===//
-//
-// This file implements the CfgNode class, including the complexities
-// of instruction insertion and in-edge calculation.
-//
+///
+/// \file
+/// \brief Implements the CfgNode class, including the complexities of
+/// instruction insertion and in-edge calculation.
+///
  //===----------------------------------------------------------------------===//
  
-#include "assembler.h"
-#include "IceCfg.h"
  #include "IceCfgNode.h"
+
+#include "IceAssembler.h"
+#include "IceCfg.h"
+#include "IceGlobalInits.h"
  #include "IceInst.h"
+#include "IceInstVarIter.h"
  #include "IceLiveness.h"
  #include "IceOperand.h"
  #include "IceTargetLowering.h"
  
  namespace Ice {
  
-CfgNode::CfgNode(Cfg *Func, SizeT LabelNumber)
-    : Func(Func), Number(LabelNumber), NameIndex(Cfg::IdentifierIndexInvalid),
-      HasReturn(false), NeedsPlacement(false), InstCountEstimate(0) {}
+// Adds an instruction to either the Phi list or the regular instruction list.
+// Validates that all Phis are added before all regular instructions.
+void CfgNode::appendInst(Inst *Instr) {
+  ++InstCountEstimate;
  
-// Returns the name the node was created with.  If no name was given,
-// it synthesizes a (hopefully) unique name.
-IceString CfgNode::getName() const {
-  if (NameIndex >= 0)
-    return Func->getIdentifierName(NameIndex);
-  return "__" + std::to_string(getIndex());
-}
+  if (BuildDefs::wasm()) {
+    if (llvm::isa<InstSwitch>(Instr) || llvm::isa<InstBr>(Instr)) {
+      for (auto *N : Instr->getTerminatorEdges()) {
+        N->addInEdge(this);
+        addOutEdge(N);
+      }
+    }
+  }
  
-// Adds an instruction to either the Phi list or the regular
-// instruction list.  Validates that all Phis are added before all
-// regular instructions.
-void CfgNode::appendInst(Inst *Inst) {
-  ++InstCountEstimate;
-  if (InstPhi *Phi = llvm::dyn_cast<InstPhi>(Inst)) {
+  if (auto *Phi = llvm::dyn_cast<InstPhi>(Instr)) {
      if (!Insts.empty()) {
        Func->setError("Phi instruction added to the middle of a block");
        return;
      }
      Phis.push_back(Phi);
    } else {
-    Insts.push_back(Inst);
+    Insts.push_back(Instr);
    }
  }
  
-// Renumbers the non-deleted instructions in the node.  This needs to
-// be done in preparation for live range analysis.  The instruction
-// numbers in a block must be monotonically increasing.  The range of
-// instruction numbers in a block, from lowest to highest, must not
-// overlap with the range of any other block.
+namespace {
+template <typename List> void removeDeletedAndRenumber(List *L, Cfg *Func) {
+  const bool DoDelete =
+      BuildDefs::minimal() || !getFlags().getKeepDeletedInsts();
+  auto I = L->begin(), E = L->end(), Next = I;
+  for (++Next; I != E; I = Next++) {
+    if (DoDelete && I->isDeleted()) {
+      L->erase(I);
+    } else {
+      I->renumber(Func);
+    }
+  }
+}
+} // end of anonymous namespace
+
  void CfgNode::renumberInstructions() {
    InstNumberT FirstNumber = Func->getNextInstNumber();
-  for (Inst &I : Phis)
-    I.renumber(Func);
-  for (Inst &I : Insts)
-    I.renumber(Func);
+  removeDeletedAndRenumber(&Phis, Func);
+  removeDeletedAndRenumber(&Insts, Func);
    InstCountEstimate = Func->getNextInstNumber() - FirstNumber;
  }
  
-// When a node is created, the OutEdges are immediately known, but the
-// InEdges have to be built up incrementally.  After the CFG has been
-// constructed, the computePredecessors() pass finalizes it by
-// creating the InEdges list.
+// When a node is created, the OutEdges are immediately known, but the InEdges
+// have to be built up incrementally. After the CFG has been constructed, the
+// computePredecessors() pass finalizes it by creating the InEdges list.
  void CfgNode::computePredecessors() {
-  OutEdges = Insts.rbegin()->getTerminatorEdges();
    for (CfgNode *Succ : OutEdges)
      Succ->InEdges.push_back(this);
  }
  
-// This does part 1 of Phi lowering, by creating a new dest variable
-// for each Phi instruction, replacing the Phi instruction's dest with
-// that variable, and adding an explicit assignment of the old dest to
-// the new dest.  For example,
+void CfgNode::computeSuccessors() {
+  OutEdges.clear();
+  InEdges.clear();
+  assert(!Insts.empty());
+  OutEdges = Insts.rbegin()->getTerminatorEdges();
+}
+
+// Ensure each Phi instruction in the node is consistent with respect to control
+// flow.  For each predecessor, there must be a phi argument with that label.
+// If a phi argument's label doesn't appear in the predecessor list (which can
+// happen as a result of e.g. unreachable node elimination), its value is
+// modified to be zero, to maintain consistency in liveness analysis.  This
+// allows us to remove some dead control flow without a major rework of the phi
+// instructions.  We don't check that phi arguments with the same label have the
+// same value.
+void CfgNode::enforcePhiConsistency() {
+  for (Inst &Instr : Phis) {
+    auto *Phi = llvm::cast<InstPhi>(&Instr);
+    // We do a simple O(N^2) algorithm to check for consistency. Even so, it
+    // shows up as only about 0.2% of the total translation time. But if
+    // necessary, we could improve the complexity by using a hash table to
+    // count how many times each node is referenced in the Phi instruction, and
+    // how many times each node is referenced in the incoming edge list, and
+    // compare the two for equality.
+    for (SizeT i = 0; i < Phi->getSrcSize(); ++i) {
+      CfgNode *Label = Phi->getLabel(i);
+      bool Found = false;
+      for (CfgNode *InNode : getInEdges()) {
+        if (InNode == Label) {
+          Found = true;
+          break;
+        }
+      }
+      if (!Found) {
+        // Predecessor was unreachable, so if (impossibly) the control flow
+        // enters from that predecessor, the value should be zero.
+        Phi->clearOperandForTarget(Label);
+      }
+    }
+    for (CfgNode *InNode : getInEdges()) {
+      bool Found = false;
+      for (SizeT i = 0; i < Phi->getSrcSize(); ++i) {
+        CfgNode *Label = Phi->getLabel(i);
+        if (InNode == Label) {
+          Found = true;
+          break;
+        }
+      }
+      if (!Found)
+        llvm::report_fatal_error("Phi error: missing label for incoming edge");
+    }
+  }
+}
+
+// This does part 1 of Phi lowering, by creating a new dest variable for each
+// Phi instruction, replacing the Phi instruction's dest with that variable,
+// and adding an explicit assignment of the old dest to the new dest. For
+// example,
  //   a=phi(...)
  // changes to
  //   "a_phi=phi(...); a=a_phi".
  //
-// This is in preparation for part 2 which deletes the Phi
-// instructions and appends assignment instructions to predecessor
-// blocks.  Note that this transformation preserves SSA form.
+// This is in preparation for part 2 which deletes the Phi instructions and
+// appends assignment instructions to predecessor blocks. Note that this
+// transformation preserves SSA form.
  void CfgNode::placePhiLoads() {
    for (Inst &I : Phis) {
-    auto Phi = llvm::dyn_cast<InstPhi>(&I);
+    auto *Phi = llvm::dyn_cast<InstPhi>(&I);
      Insts.insert(Insts.begin(), Phi->lower(Func));
    }
  }
  
-// This does part 2 of Phi lowering.  For each Phi instruction at each
-// out-edge, create a corresponding assignment instruction, and add
-// all the assignments near the end of this block.  They need to be
-// added before any branch instruction, and also if the block ends
-// with a compare instruction followed by a branch instruction that we
-// may want to fuse, it's better to insert the new assignments before
-// the compare instruction. The tryOptimizedCmpxchgCmpBr() method
-// assumes this ordering of instructions.
+// This does part 2 of Phi lowering. For each Phi instruction at each out-edge,
+// create a corresponding assignment instruction, and add all the assignments
+// near the end of this block. They need to be added before any branch
+// instruction, and also if the block ends with a compare instruction followed
+// by a branch instruction that we may want to fuse, it's better to insert the
+// new assignments before the compare instruction. The
+// tryOptimizedCmpxchgCmpBr() method assumes this ordering of instructions.
  //
-// Note that this transformation takes the Phi dest variables out of
-// SSA form, as there may be assignments to the dest variable in
-// multiple blocks.
+// Note that this transformation takes the Phi dest variables out of SSA form,
+// as there may be assignments to the dest variable in multiple blocks.
  void CfgNode::placePhiStores() {
    // Find the insertion point.
    InstList::iterator InsertionPoint = Insts.end();
-  // Every block must end in a terminator instruction, and therefore
-  // must have at least one instruction, so it's valid to decrement
-  // InsertionPoint (but assert just in case).
+  // Every block must end in a terminator instruction, and therefore must have
+  // at least one instruction, so it's valid to decrement InsertionPoint (but
+  // assert just in case).
    assert(InsertionPoint != Insts.begin());
    --InsertionPoint;
-  // Confirm that InsertionPoint is a terminator instruction.  Calling
-  // getTerminatorEdges() on a non-terminator instruction will cause
-  // an llvm_unreachable().
+  // Confirm that InsertionPoint is a terminator instruction. Calling
+  // getTerminatorEdges() on a non-terminator instruction will cause an
+  // llvm_unreachable().
    (void)InsertionPoint->getTerminatorEdges();
    // SafeInsertionPoint is always immediately before the terminator
-  // instruction.  If the block ends in a compare and conditional
-  // branch, it's better to place the Phi store before the compare so
-  // as not to interfere with compare/branch fusing.  However, if the
-  // compare instruction's dest operand is the same as the new
-  // assignment statement's source operand, this can't be done due to
-  // data dependences, so we need to fall back to the
-  // SafeInsertionPoint.  To illustrate:
+  // instruction. If the block ends in a compare and conditional branch, it's
+  // better to place the Phi store before the compare so as not to interfere
+  // with compare/branch fusing. However, if the compare instruction's dest
+  // operand is the same as the new assignment statement's source operand, this
+  // can't be done due to data dependences, so we need to fall back to the
+  // SafeInsertionPoint. To illustrate:
    //   ; <label>:95
    //   %97 = load i8* %96, align 1
    //   %98 = icmp ne i8 %97, 0
@@ -142,9 +200,8 @@ void CfgNode::placePhiStores() {
    //   %100 = %100_phi
    //   %101 = %101_phi
    //
-  // TODO(stichnot): It may be possible to bypass this whole
-  // SafeInsertionPoint mechanism.  If a source basic block ends in a
-  // conditional branch:
+  // TODO(stichnot): It may be possible to bypass this whole SafeInsertionPoint
+  // mechanism. If a source basic block ends in a conditional branch:
    //   labelSource:
    //   ...
    //   br i1 %foo, label %labelTrue, label %labelFalse
@@ -154,18 +211,18 @@ void CfgNode::placePhiStores() {
    // then we actually know the constant i1 value of the Phi operand:
    //   labelTrue:
    //   %bar = phi i1 [ true, %labelSource ], ...
-  // It seems that this optimization should be done by clang or opt,
-  // but we could also do it here.
+  // It seems that this optimization should be done by clang or opt, but we
+  // could also do it here.
    InstList::iterator SafeInsertionPoint = InsertionPoint;
-  // Keep track of the dest variable of a compare instruction, so that
-  // we insert the new instruction at the SafeInsertionPoint if the
-  // compare's dest matches the Phi-lowered assignment's source.
+  // Keep track of the dest variable of a compare instruction, so that we
+  // insert the new instruction at the SafeInsertionPoint if the compare's dest
+  // matches the Phi-lowered assignment's source.
    Variable *CmpInstDest = nullptr;
-  // If the current insertion point is at a conditional branch
-  // instruction, and the previous instruction is a compare
-  // instruction, then we move the insertion point before the compare
-  // instruction so as not to interfere with compare/branch fusing.
-  if (InstBr *Branch = llvm::dyn_cast<InstBr>(InsertionPoint)) {
+  // If the current insertion point is at a conditional branch instruction, and
+  // the previous instruction is a compare instruction, then we move the
+  // insertion point before the compare instruction so as not to interfere with
+  // compare/branch fusing.
+  if (auto *Branch = llvm::dyn_cast<InstBr>(InsertionPoint)) {
      if (!Branch->isUnconditional()) {
        if (InsertionPoint != Insts.begin()) {
          --InsertionPoint;
@@ -183,12 +240,12 @@ void CfgNode::placePhiStores() {
    for (CfgNode *Succ : OutEdges) {
      // Consider every Phi instruction at the out-edge.
      for (Inst &I : Succ->Phis) {
-      auto Phi = llvm::dyn_cast<InstPhi>(&I);
+      auto *Phi = llvm::dyn_cast<InstPhi>(&I);
        Operand *Operand = Phi->getOperandForTarget(this);
        assert(Operand);
        Variable *Dest = I.getDest();
        assert(Dest);
-      InstAssign *NewInst = InstAssign::create(Func, Dest, Operand);
+      auto *NewInst = InstAssign::create(Func, Dest, Operand);
        if (CmpInstDest == Operand)
          Insts.insert(SafeInsertionPoint, NewInst);
        else
@@ -203,282 +260,328 @@ void CfgNode::deletePhis() {
      I.setDeleted();
  }
  
-// Splits the edge from Pred to this node by creating a new node and
-// hooking up the in and out edges appropriately.  (The EdgeIndex
-// parameter is only used to make the new node's name unique when
-// there are multiple edges between the same pair of nodes.)  The new
-// node's instruction list is initialized to the empty list, with no
-// terminator instruction.  If there are multiple edges from Pred to
-// this node, only one edge is split, and the particular choice of
-// edge is undefined.  This could happen with a switch instruction, or
-// a conditional branch that weirdly has both branches to the same
-// place.  TODO(stichnot,kschimpf): Figure out whether this is legal
-// in the LLVM IR or the PNaCl bitcode, and if so, we need to
-// establish a strong relationship among the ordering of Pred's
-// out-edge list, this node's in-edge list, and the Phi instruction's
-// operand list.
+// Splits the edge from Pred to this node by creating a new node and hooking up
+// the in and out edges appropriately. (The EdgeIndex parameter is only used to
+// make the new node's name unique when there are multiple edges between the
+// same pair of nodes.) The new node's instruction list is initialized to the
+// empty list, with no terminator instruction. There must not be multiple edges
+// from Pred to this node so all Inst::getTerminatorEdges implementations must
+// not contain duplicates.
  CfgNode *CfgNode::splitIncomingEdge(CfgNode *Pred, SizeT EdgeIndex) {
    CfgNode *NewNode = Func->makeNode();
-  if (ALLOW_DUMP)
+  // Depth is the minimum as it works if both are the same, but if one is
+  // outside the loop and the other is inside, the new node should be placed
+  // outside and not be executed multiple times within the loop.
+  NewNode->setLoopNestDepth(
+      std::min(getLoopNestDepth(), Pred->getLoopNestDepth()));
+  if (BuildDefs::dump())
      NewNode->setName("split_" + Pred->getName() + "_" + getName() + "_" +
                       std::to_string(EdgeIndex));
-  // The new node is added to the end of the node list, and will later
-  // need to be sorted into a reasonable topological order.
+  // The new node is added to the end of the node list, and will later need to
+  // be sorted into a reasonable topological order.
    NewNode->setNeedsPlacement(true);
    // Repoint Pred's out-edge.
    bool Found = false;
-  for (auto I = Pred->OutEdges.begin(), E = Pred->OutEdges.end();
-       !Found && I != E; ++I) {
-    if (*I == this) {
-      *I = NewNode;
+  for (CfgNode *&I : Pred->OutEdges) {
+    if (I == this) {
+      I = NewNode;
        NewNode->InEdges.push_back(Pred);
        Found = true;
+      break;
      }
    }
    assert(Found);
+  (void)Found;
    // Repoint this node's in-edge.
    Found = false;
-  for (auto I = InEdges.begin(), E = InEdges.end(); !Found && I != E; ++I) {
-    if (*I == Pred) {
-      *I = NewNode;
+  for (CfgNode *&I : InEdges) {
+    if (I == Pred) {
+      I = NewNode;
        NewNode->OutEdges.push_back(this);
        Found = true;
+      break;
      }
    }
    assert(Found);
-  // Repoint a suitable branch instruction's target and return.
+  (void)Found;
+  // Repoint all suitable branch instructions' target and return.
    Found = false;
-  for (Inst &I : reverse_range(Pred->getInsts())) {
-    if (!I.isDeleted() && I.repointEdge(this, NewNode))
-      return NewNode;
-  }
-  // This should be unreachable, so the assert will fail.
+  for (Inst &I : Pred->getInsts())
+    if (!I.isDeleted() && I.repointEdges(this, NewNode))
+      Found = true;
    assert(Found);
+  (void)Found;
    return NewNode;
  }
  
  namespace {
  
-// Helper function used by advancedPhiLowering().
-bool sameVarOrReg(const Variable *Var, const Operand *Opnd) {
-  if (Var == Opnd)
+// Helpers for advancedPhiLowering().
+
+class PhiDesc {
+  PhiDesc() = delete;
+  PhiDesc(const PhiDesc &) = delete;
+  PhiDesc &operator=(const PhiDesc &) = delete;
+
+public:
+  PhiDesc(InstPhi *Phi, Variable *Dest) : Phi(Phi), Dest(Dest) {}
+  PhiDesc(PhiDesc &&) = default;
+  InstPhi *Phi = nullptr;
+  Variable *Dest = nullptr;
+  Operand *Src = nullptr;
+  bool Processed = false;
+  size_t NumPred = 0; // number of entries whose Src is this Dest
+  int32_t Weight = 0; // preference for topological order
+};
+using PhiDescList = llvm::SmallVector<PhiDesc, 32>;
+
+// Always pick NumPred=0 over NumPred>0.
+constexpr int32_t WeightNoPreds = 8;
+// Prefer Src as a register because the register might free up.
+constexpr int32_t WeightSrcIsReg = 4;
+// Prefer Dest not as a register because the register stays free longer.
+constexpr int32_t WeightDestNotReg = 2;
+// Prefer NumPred=1 over NumPred>1.  This is used as a tiebreaker when a
+// dependency cycle must be broken so that hopefully only one temporary
+// assignment has to be added to break the cycle.
+constexpr int32_t WeightOnePred = 1;
+
+bool sameVarOrReg(TargetLowering *Target, const Variable *Var1,
+                  const Operand *Opnd) {
+  if (Var1 == Opnd)
      return true;
-  if (const auto Var2 = llvm::dyn_cast<Variable>(Opnd)) {
-    if (Var->hasReg() && Var->getRegNum() == Var2->getRegNum())
-      return true;
+  const auto *Var2 = llvm::dyn_cast<Variable>(Opnd);
+  if (Var2 == nullptr)
+    return false;
+
+  // If either operand lacks a register, they cannot be the same.
+  if (!Var1->hasReg())
+    return false;
+  if (!Var2->hasReg())
+    return false;
+
+  const auto RegNum1 = Var1->getRegNum();
+  const auto RegNum2 = Var2->getRegNum();
+  // Quick common-case check.
+  if (RegNum1 == RegNum2)
+    return true;
+
+  assert(Target->getAliasesForRegister(RegNum1)[RegNum2] ==
+         Target->getAliasesForRegister(RegNum2)[RegNum1]);
+  return Target->getAliasesForRegister(RegNum1)[RegNum2];
+}
+
+// Update NumPred for all Phi assignments using Var as their Dest variable.
+// Also update Weight if NumPred dropped from 2 to 1, or 1 to 0.
+void updatePreds(PhiDescList &Desc, TargetLowering *Target, Variable *Var) {
+  for (PhiDesc &Item : Desc) {
+    if (!Item.Processed && sameVarOrReg(Target, Var, Item.Dest)) {
+      --Item.NumPred;
+      if (Item.NumPred == 1) {
+        // If NumPred changed from 2 to 1, add in WeightOnePred.
+        Item.Weight += WeightOnePred;
+      } else if (Item.NumPred == 0) {
+        // If NumPred changed from 1 to 0, subtract WeightOnePred and add in
+        // WeightNoPreds.
+        Item.Weight += (WeightNoPreds - WeightOnePred);
+      }
+    }
    }
-  return false;
  }
  
  } // end of anonymous namespace
  
-// This the "advanced" version of Phi lowering for a basic block, in
-// contrast to the simple version that lowers through assignments
-// involving temporaries.
+// This is the "advanced" version of Phi lowering for a basic block, in contrast
+// to the simple version that lowers through assignments involving temporaries.
  //
-// All Phi instructions in a basic block are conceptually executed in
-// parallel.  However, if we lower Phis early and commit to a
-// sequential ordering, we may end up creating unnecessary
-// interferences which lead to worse register allocation.  Delaying
-// Phi scheduling until after register allocation can help unless
-// there are no free registers for shuffling registers or stack slots
+// All Phi instructions in a basic block are conceptually executed in parallel.
+// However, if we lower Phis early and commit to a sequential ordering, we may
+// end up creating unnecessary interferences which lead to worse register
+// allocation. Delaying Phi scheduling until after register allocation can help
+// unless there are no free registers for shuffling registers or stack slots
  // and spilling becomes necessary.
  //
-// The advanced Phi lowering starts by finding a topological sort of
-// the Phi instructions, where "A=B" comes before "B=C" due to the
-// anti-dependence on B.  If a topological sort is not possible due to
-// a cycle, the cycle is broken by introducing a non-parallel
-// temporary.  For example, a cycle arising from a permutation like
-// "A=B;B=C;C=A" can become "T=A;A=B;B=C;C=T".  All else being equal,
-// prefer to schedule assignments with register-allocated Src operands
-// earlier, in case that register becomes free afterwards, and prefer
-// to schedule assignments with register-allocated Dest variables
-// later, to keep that register free for longer.
+// The advanced Phi lowering starts by finding a topological sort of the Phi
+// instructions, where "A=B" comes before "B=C" due to the anti-dependence on
+// B. Preexisting register assignments are considered in the topological sort.
+// If a topological sort is not possible due to a cycle, the cycle is broken by
+// introducing a non-parallel temporary. For example, a cycle arising from a
+// permutation like "A=B;B=C;C=A" can become "T=A;A=B;B=C;C=T". All else being
+// equal, prefer to schedule assignments with register-allocated Src operands
+// earlier, in case that register becomes free afterwards, and prefer to
+// schedule assignments with register-allocated Dest variables later, to keep
+// that register free for longer.
+//
+// Once the ordering is determined, the Cfg edge is split and the assignment
+// list is lowered by the target lowering layer. Since the assignment lowering
+// may create new infinite-weight temporaries, a follow-on register allocation
+// pass will be needed. To prepare for this, liveness (including live range
+// calculation) of the split nodes needs to be calculated, and liveness of the
+// original node needs to be updated to "undo" the effects of the phi
+// assignments.
+//
+// The specific placement of the new node within the Cfg node list is deferred
+// until later, including after empty node contraction.
  //
-// Once the ordering is determined, the Cfg edge is split and the
-// assignment list is lowered by the target lowering layer.  The
-// specific placement of the new node within the Cfg node list is
-// deferred until later, including after empty node contraction.
+// After phi assignments are lowered across all blocks, another register
+// allocation pass is run, focusing only on pre-colored and infinite-weight
+// variables, similar to Om1 register allocation (except without the need to
+// specially compute these variables' live ranges, since they have already been
+// precisely calculated). The register allocator in this mode needs the ability
+// to forcibly spill and reload registers in case none are naturally available.
  void CfgNode::advancedPhiLowering() {
    if (getPhis().empty())
      return;
  
-  // Count the number of non-deleted Phi instructions.
-  struct PhiDesc {
-    InstPhi *Phi;
-    Variable *Dest;
-    Operand *Src;
-    bool Processed;
-    size_t NumPred; // number of entries whose Src is this Dest
-    int32_t Weight; // preference for topological order
-  };
-  llvm::SmallVector<PhiDesc, 32> Desc(getPhis().size());
-
-  size_t NumPhis = 0;
+  PhiDescList Desc;
+
    for (Inst &I : Phis) {
-    auto Inst = llvm::dyn_cast<InstPhi>(&I);
-    if (!Inst->isDeleted()) {
-      Desc[NumPhis].Phi = Inst;
-      Desc[NumPhis].Dest = Inst->getDest();
-      ++NumPhis;
+    auto *Phi = llvm::dyn_cast<InstPhi>(&I);
+    if (!Phi->isDeleted()) {
+      Variable *Dest = Phi->getDest();
+      Desc.emplace_back(Phi, Dest);
+      // Undo the effect of the phi instruction on this node's live-in set by
+      // marking the phi dest variable as live on entry.
+      SizeT VarNum = Func->getLiveness()->getLiveIndex(Dest->getIndex());
+      assert(!Func->getLiveness()->getLiveIn(this)[VarNum]);
+      Func->getLiveness()->getLiveIn(this)[VarNum] = true;
+      Phi->setDeleted();
      }
    }
-  if (NumPhis == 0)
+  if (Desc.empty())
      return;
  
+  TargetLowering *Target = Func->getTarget();
    SizeT InEdgeIndex = 0;
    for (CfgNode *Pred : InEdges) {
      CfgNode *Split = splitIncomingEdge(Pred, InEdgeIndex++);
-    AssignList Assignments;
-    SizeT Remaining = NumPhis;
+    SizeT Remaining = Desc.size();
  
      // First pass computes Src and initializes NumPred.
-    for (size_t I = 0; I < NumPhis; ++I) {
-      Variable *Dest = Desc[I].Dest;
-      Operand *Src = Desc[I].Phi->getOperandForTarget(Pred);
-      Desc[I].Src = Src;
-      Desc[I].Processed = false;
-      Desc[I].NumPred = 0;
-      // Cherry-pick any trivial assignments, so that they don't
-      // contribute to the running complexity of the topological sort.
-      if (sameVarOrReg(Dest, Src)) {
-        Desc[I].Processed = true;
+    for (PhiDesc &Item : Desc) {
+      Variable *Dest = Item.Dest;
+      Operand *Src = Item.Phi->getOperandForTarget(Pred);
+      Item.Src = Src;
+      Item.Processed = false;
+      Item.NumPred = 0;
+      // Cherry-pick any trivial assignments, so that they don't contribute to
+      // the running complexity of the topological sort.
+      if (sameVarOrReg(Target, Dest, Src)) {
+        Item.Processed = true;
          --Remaining;
          if (Dest != Src)
-          // If Dest and Src are syntactically the same, don't bother
-          // adding the assignment, because in all respects it would
-          // be redundant, and if Dest/Src are on the stack, the
-          // target lowering may naively decide to lower it using a
-          // temporary register.
-          Assignments.push_back(InstAssign::create(Func, Dest, Src));
+          // If Dest and Src are syntactically the same, don't bother adding
+          // the assignment, because in all respects it would be redundant, and
+          // if Dest/Src are on the stack, the target lowering may naively
+          // decide to lower it using a temporary register.
+          Split->appendInst(InstAssign::create(Func, Dest, Src));
        }
      }
-    // Second pass computes NumPred by comparing every pair of Phi
-    // instructions.
-    for (size_t I = 0; I < NumPhis; ++I) {
-      if (Desc[I].Processed)
+    // Second pass computes NumPred by comparing every pair of Phi instructions.
+    for (PhiDesc &Item : Desc) {
+      if (Item.Processed)
          continue;
-      const Variable *Dest = Desc[I].Dest;
-      for (size_t J = 0; J < NumPhis; ++J) {
-        if (Desc[J].Processed)
+      const Variable *Dest = Item.Dest;
+      for (PhiDesc &Item2 : Desc) {
+        if (Item2.Processed)
            continue;
-        if (I != J) {
-          // There shouldn't be two Phis with the same Dest variable
-          // or register.
-          assert(!sameVarOrReg(Dest, Desc[J].Dest));
-        }
-        const Operand *Src = Desc[J].Src;
-        if (sameVarOrReg(Dest, Src))
-          ++Desc[I].NumPred;
+        // There shouldn't be two different Phis with the same Dest variable or
+        // register.
+        assert((&Item == &Item2) || !sameVarOrReg(Target, Dest, Item2.Dest));
+        if (sameVarOrReg(Target, Dest, Item2.Src))
+          ++Item.NumPred;
        }
      }
  
      // Another pass to compute initial Weight values.
-
-    // Always pick NumPred=0 over NumPred>0.
-    const int32_t WeightNoPreds = 4;
-    // Prefer Src as a register because the register might free up.
-    const int32_t WeightSrcIsReg = 2;
-    // Prefer Dest not as a register because the register stays free
-    // longer.
-    const int32_t WeightDestNotReg = 1;
-
-    for (size_t I = 0; I < NumPhis; ++I) {
-      if (Desc[I].Processed)
+    for (PhiDesc &Item : Desc) {
+      if (Item.Processed)
          continue;
        int32_t Weight = 0;
-      if (Desc[I].NumPred == 0)
+      if (Item.NumPred == 0)
          Weight += WeightNoPreds;
-      if (auto Var = llvm::dyn_cast<Variable>(Desc[I].Src))
+      if (Item.NumPred == 1)
+        Weight += WeightOnePred;
+      if (auto *Var = llvm::dyn_cast<Variable>(Item.Src))
          if (Var->hasReg())
            Weight += WeightSrcIsReg;
-      if (!Desc[I].Dest->hasReg())
+      if (!Item.Dest->hasReg())
          Weight += WeightDestNotReg;
-      Desc[I].Weight = Weight;
+      Item.Weight = Weight;
      }
  
-    // Repeatedly choose and process the best candidate in the
-    // topological sort, until no candidates remain.  This
-    // implementation is O(N^2) where N is the number of Phi
-    // instructions, but with a small constant factor compared to a
-    // likely implementation of O(N) topological sort.
+    // Repeatedly choose and process the best candidate in the topological sort,
+    // until no candidates remain. This implementation is O(N^2) where N is the
+    // number of Phi instructions, but with a small constant factor compared to
+    // a likely implementation of O(N) topological sort.
      for (; Remaining; --Remaining) {
-      size_t BestIndex = 0;
        int32_t BestWeight = -1;
+      PhiDesc *BestItem = nullptr;
        // Find the best candidate.
-      for (size_t I = 0; I < NumPhis; ++I) {
-        if (Desc[I].Processed)
+      for (PhiDesc &Item : Desc) {
+        if (Item.Processed)
            continue;
-        int32_t Weight = 0;
-        Weight = Desc[I].Weight;
+        const int32_t Weight = Item.Weight;
          if (Weight > BestWeight) {
-          BestIndex = I;
+          BestItem = &Item;
            BestWeight = Weight;
          }
        }
        assert(BestWeight >= 0);
-      assert(Desc[BestIndex].NumPred <= 1);
-      Variable *Dest = Desc[BestIndex].Dest;
-      Operand *Src = Desc[BestIndex].Src;
-      assert(!sameVarOrReg(Dest, Src));
+      Variable *Dest = BestItem->Dest;
+      Operand *Src = BestItem->Src;
+      assert(!sameVarOrReg(Target, Dest, Src));
        // Break a cycle by introducing a temporary.
-      if (Desc[BestIndex].NumPred) {
+      while (BestItem->NumPred > 0) {
          bool Found = false;
-        // If the target instruction "A=B" is part of a cycle, find
-        // the "X=A" assignment in the cycle because it will have to
-        // be rewritten as "X=tmp".
-        for (size_t J = 0; !Found && J < NumPhis; ++J) {
-          if (Desc[J].Processed)
+        // If the target instruction "A=B" is part of a cycle, find the "X=A"
+        // assignment in the cycle because it will have to be rewritten as
+        // "X=tmp".
+        for (PhiDesc &Item : Desc) {
+          if (Item.Processed)
              continue;
-          Operand *OtherSrc = Desc[J].Src;
-          if (Desc[J].NumPred && sameVarOrReg(Dest, OtherSrc)) {
+          Operand *OtherSrc = Item.Src;
+          if (Item.NumPred && sameVarOrReg(Target, Dest, OtherSrc)) {
              SizeT VarNum = Func->getNumVariables();
              Variable *Tmp = Func->makeVariable(OtherSrc->getType());
-            if (ALLOW_DUMP)
+            if (BuildDefs::dump())
                Tmp->setName(Func, "__split_" + std::to_string(VarNum));
-            Assignments.push_back(InstAssign::create(Func, Tmp, OtherSrc));
-            Desc[J].Src = Tmp;
+            Split->appendInst(InstAssign::create(Func, Tmp, OtherSrc));
+            Item.Src = Tmp;
+            updatePreds(Desc, Target, llvm::cast<Variable>(OtherSrc));
              Found = true;
+            break;
            }
          }
          assert(Found);
+        (void)Found;
        }
        // Now that a cycle (if any) has been broken, create the actual
        // assignment.
-      Assignments.push_back(InstAssign::create(Func, Dest, Src));
-      // Update NumPred for all Phi assignments using this Phi's Src
-      // as their Dest variable.  Also update Weight if NumPred
-      // dropped from 1 to 0.
-      if (auto Var = llvm::dyn_cast<Variable>(Src)) {
-        for (size_t I = 0; I < NumPhis; ++I) {
-          if (Desc[I].Processed)
-            continue;
-          if (sameVarOrReg(Var, Desc[I].Dest)) {
-            if (--Desc[I].NumPred == 0)
-              Desc[I].Weight += WeightNoPreds;
-          }
-        }
-      }
-      Desc[BestIndex].Processed = true;
+      Split->appendInst(InstAssign::create(Func, Dest, Src));
+      if (auto *Var = llvm::dyn_cast<Variable>(Src))
+        updatePreds(Desc, Target, Var);
+      BestItem->Processed = true;
      }
+    Split->appendInst(InstBr::create(Func, this));
  
-    Func->getTarget()->lowerPhiAssignments(Split, Assignments);
-
-    // Renumber the instructions to be monotonically increasing so
-    // that addNode() doesn't assert when multi-definitions are added
-    // out of order.
-    Split->renumberInstructions();
+    Split->genCode();
      Func->getVMetadata()->addNode(Split);
+    // Validate to be safe.  All items should be marked as processed, and have
+    // no predecessors.
+    if (BuildDefs::asserts()) {
+      for (PhiDesc &Item : Desc) {
+        (void)Item;
+        assert(Item.Processed);
+        assert(Item.NumPred == 0);
+      }
+    }
    }
-
-  for (Inst &I : Phis)
-    I.setDeleted();
  }
  
-// Does address mode optimization.  Pass each instruction to the
-// TargetLowering object.  If it returns a new instruction
-// (representing the optimized address mode), then insert the new
-// instruction and delete the old.
+// Does address mode optimization. Pass each instruction to the TargetLowering
+// object. If it returns a new instruction (representing the optimized address
+// mode), then insert the new instruction and delete the old.
  void CfgNode::doAddressOpt() {
    TargetLowering *Target = Func->getTarget();
    LoweringContext &Context = Target->getContext();
@@ -488,31 +591,36 @@ void CfgNode::doAddressOpt() {
    }
  }
  
-void CfgNode::doNopInsertion() {
+void CfgNode::doNopInsertion(RandomNumberGenerator &RNG) {
    TargetLowering *Target = Func->getTarget();
    LoweringContext &Context = Target->getContext();
    Context.init(this);
+  Context.setInsertPoint(Context.getCur());
+  // Do not insert nop in bundle locked instructions.
+  bool PauseNopInsertion = false;
    while (!Context.atEnd()) {
-    Target->doNopInsertion();
+    if (llvm::isa<InstBundleLock>(Context.getCur())) {
+      PauseNopInsertion = true;
+    } else if (llvm::isa<InstBundleUnlock>(Context.getCur())) {
+      PauseNopInsertion = false;
+    }
+    if (!PauseNopInsertion)
+      Target->doNopInsertion(RNG);
      // Ensure Cur=Next, so that the nops are inserted before the current
      // instruction rather than after.
-    Context.advanceNext();
      Context.advanceCur();
+    Context.advanceNext();
    }
-  // Insert before all instructions.
-  Context.setInsertPoint(getInsts().begin());
-  Context.advanceNext();
-  Context.advanceCur();
-  Target->doNopInsertion();
  }
  
-// Drives the target lowering.  Passes the current instruction and the
-// next non-deleted instruction for target lowering.
+// Drives the target lowering. Passes the current instruction and the next
+// non-deleted instruction for target lowering.
  void CfgNode::genCode() {
    TargetLowering *Target = Func->getTarget();
    LoweringContext &Context = Target->getContext();
    // Lower the regular instructions.
    Context.init(this);
+  Target->initNodeForLowering(this);
    while (!Context.atEnd()) {
      InstList::iterator Orig = Context.getCur();
      if (llvm::isa<InstRet>(*Orig))
@@ -521,6 +629,7 @@ void CfgNode::genCode() {
      // Ensure target lowering actually moved the cursor.
      assert(Context.getCur() != Orig);
    }
+  Context.availabilityReset();
    // Do preliminary lowering of the Phi instructions.
    Target->prelowerPhis();
  }
@@ -541,47 +650,56 @@ void CfgNode::livenessLightweight() {
    }
  }
  
-// Performs liveness analysis on the block.  Returns true if the
-// incoming liveness changed from before, false if it stayed the same.
-// (If it changes, the node's predecessors need to be processed
-// again.)
+// Performs liveness analysis on the block. Returns true if the incoming
+// liveness changed from before, false if it stayed the same. (If it changes,
+// the node's predecessors need to be processed again.)
  bool CfgNode::liveness(Liveness *Liveness) {
-  SizeT NumVars = Liveness->getNumVarsInNode(this);
-  LivenessBV Live(NumVars);
+  const SizeT NumVars = Liveness->getNumVarsInNode(this);
+  const SizeT NumGlobalVars = Liveness->getNumGlobalVars();
+  LivenessBV &Live = Liveness->getScratchBV();
+  Live.clear();
+
    LiveBeginEndMap *LiveBegin = nullptr;
    LiveBeginEndMap *LiveEnd = nullptr;
-  // Mark the beginning and ending of each variable's live range
-  // with the sentinel instruction number 0.
+  // Mark the beginning and ending of each variable's live range with the
+  // sentinel instruction number 0.
    if (Liveness->getMode() == Liveness_Intervals) {
      LiveBegin = Liveness->getLiveBegin(this);
      LiveEnd = Liveness->getLiveEnd(this);
      LiveBegin->clear();
      LiveEnd->clear();
-    // Guess that the number of live ranges beginning is roughly the
-    // number of instructions, and same for live ranges ending.
+    // Guess that the number of live ranges beginning is roughly the number of
+    // instructions, and same for live ranges ending.
      LiveBegin->reserve(getInstCountEstimate());
      LiveEnd->reserve(getInstCountEstimate());
    }
+
    // Initialize Live to be the union of all successors' LiveIn.
    for (CfgNode *Succ : OutEdges) {
-    Live |= Liveness->getLiveIn(Succ);
+    const LivenessBV &LiveIn = Liveness->getLiveIn(Succ);
+    assert(LiveIn.empty() || LiveIn.size() == NumGlobalVars);
+    Live |= LiveIn;
      // Mark corresponding argument of phis in successor as live.
      for (Inst &I : Succ->Phis) {
-      auto Phi = llvm::dyn_cast<InstPhi>(&I);
+      if (I.isDeleted())
+        continue;
+      auto *Phi = llvm::cast<InstPhi>(&I);
        Phi->livenessPhiOperand(Live, this, Liveness);
      }
    }
+  assert(Live.empty() || Live.size() == NumGlobalVars);
    Liveness->getLiveOut(this) = Live;
  
+  // Expand Live so it can hold locals in addition to globals.
+  Live.resize(NumVars);
    // Process regular instructions in reverse order.
    for (Inst &I : reverse_range(Insts)) {
      if (I.isDeleted())
        continue;
      I.liveness(I.getNumber(), Live, Liveness, LiveBegin, LiveEnd);
    }
-  // Process phis in forward order so that we can override the
-  // instruction number to be that of the earliest phi instruction in
-  // the block.
+  // Process phis in forward order so that we can override the instruction
+  // number to be that of the earliest phi instruction in the block.
    SizeT NumNonDeadPhis = 0;
    InstNumberT FirstPhiNumber = Inst::NumberSentinel;
    for (Inst &I : Phis) {
@@ -593,35 +711,33 @@ bool CfgNode::liveness(Liveness *Liveness) {
        ++NumNonDeadPhis;
    }
  
-  // When using the sparse representation, after traversing the
-  // instructions in the block, the Live bitvector should only contain
-  // set bits for global variables upon block entry.  We validate this
-  // by shrinking the Live vector and then testing it against the
-  // pre-shrunk version.  (The shrinking is required, but the
-  // validation is not.)
-  LivenessBV LiveOrig = Live;
-  Live.resize(Liveness->getNumGlobalVars());
-  // Non-global arguments in the entry node are allowed to be live on
-  // entry.
-  bool IsEntry = (Func->getEntryNode() == this);
-  if (!(IsEntry || Live == LiveOrig)) {
-    // This is a fatal liveness consistency error.  Print some
-    // diagnostics and abort.
-    Ostream &Str = Func->getContext()->getStrDump();
-    Func->resetCurrentNode();
-    Str << "LiveOrig-Live =";
-    for (SizeT i = Live.size(); i < LiveOrig.size(); ++i) {
-      if (LiveOrig.test(i)) {
-        Str << " ";
-        Liveness->getVariable(i, this)->dump(Func);
+  // When using the sparse representation, after traversing the instructions in
+  // the block, the Live bitvector should only contain set bits for global
+  // variables upon block entry.  We validate this by testing the upper bits of
+  // the Live bitvector.
+  if (Live.find_next(NumGlobalVars) != -1) {
+    if (BuildDefs::dump()) {
+      // This is a fatal liveness consistency error. Print some diagnostics and
+      // abort.
+      Ostream &Str = Func->getContext()->getStrDump();
+      Func->resetCurrentNode();
+      Str << "Invalid Live =";
+      for (SizeT i = NumGlobalVars; i < Live.size(); ++i) {
+        if (Live.test(i)) {
+          Str << " ";
+          Liveness->getVariable(i, this)->dump(Func);
+        }
        }
+      Str << "\n";
      }
-    Str << "\n";
-    llvm_unreachable("Fatal inconsistency in liveness analysis");
+    llvm::report_fatal_error("Fatal inconsistency in liveness analysis");
    }
+  // Now truncate Live to prevent LiveIn from growing.
+  Live.resize(NumGlobalVars);
  
    bool Changed = false;
    LivenessBV &LiveIn = Liveness->getLiveIn(this);
+  assert(LiveIn.empty() || LiveIn.size() == NumGlobalVars);
    // Add in current LiveIn
    Live |= LiveIn;
    // Check result, set LiveIn=Live
@@ -634,37 +750,86 @@ bool CfgNode::liveness(Liveness *Liveness) {
    return Changed;
  }
  
-// Once basic liveness is complete, compute actual live ranges.  It is
-// assumed that within a single basic block, a live range begins at
-// most once and ends at most once.  This is certainly true for pure
-// SSA form.  It is also true once phis are lowered, since each
-// assignment to the phi-based temporary is in a different basic
-// block, and there is a single read that ends the live in the basic
-// block that contained the actual phi instruction.
+// Validate the integrity of the live ranges in this block.  If there are any
+// errors, it prints details and returns false.  On success, it returns true.
+bool CfgNode::livenessValidateIntervals(Liveness *Liveness) const {
+  if (!BuildDefs::asserts())
+    return true;
+
+  // Verify there are no duplicates.
+  auto ComparePair =
+      [](const LiveBeginEndMapEntry &A, const LiveBeginEndMapEntry &B) {
+        return A.first == B.first;
+      };
+  LiveBeginEndMap &MapBegin = *Liveness->getLiveBegin(this);
+  LiveBeginEndMap &MapEnd = *Liveness->getLiveEnd(this);
+  if (std::adjacent_find(MapBegin.begin(), MapBegin.end(), ComparePair) ==
+          MapBegin.end() &&
+      std::adjacent_find(MapEnd.begin(), MapEnd.end(), ComparePair) ==
+          MapEnd.end())
+    return true;
+
+  // There is definitely a liveness error.  All paths from here return false.
+  if (!BuildDefs::dump())
+    return false;
+
+  // Print all the errors.
+  if (BuildDefs::dump()) {
+    GlobalContext *Ctx = Func->getContext();
+    OstreamLocker L(Ctx);
+    Ostream &Str = Ctx->getStrDump();
+    if (Func->isVerbose()) {
+      Str << "Live range errors in the following block:\n";
+      dump(Func);
+    }
+    for (auto Start = MapBegin.begin();
+         (Start = std::adjacent_find(Start, MapBegin.end(), ComparePair)) !=
+         MapBegin.end();
+         ++Start) {
+      auto Next = Start + 1;
+      Str << "Duplicate LR begin, block " << getName() << ", instructions "
+          << Start->second << " & " << Next->second << ", variable "
+          << Liveness->getVariable(Start->first, this)->getName() << "\n";
+    }
+    for (auto Start = MapEnd.begin();
+         (Start = std::adjacent_find(Start, MapEnd.end(), ComparePair)) !=
+         MapEnd.end();
+         ++Start) {
+      auto Next = Start + 1;
+      Str << "Duplicate LR end,   block " << getName() << ", instructions "
+          << Start->second << " & " << Next->second << ", variable "
+          << Liveness->getVariable(Start->first, this)->getName() << "\n";
+    }
+  }
+
+  return false;
+}
+
+// Once basic liveness is complete, compute actual live ranges. It is assumed
+// that within a single basic block, a live range begins at most once and ends
+// at most once. This is certainly true for pure SSA form. It is also true once
+// phis are lowered, since each assignment to the phi-based temporary is in a
+// different basic block, and there is a single read that ends the live range
+// in the basic block that contained the actual phi instruction.
  void CfgNode::livenessAddIntervals(Liveness *Liveness, InstNumberT FirstInstNum,
                                     InstNumberT LastInstNum) {
    TimerMarker T1(TimerStack::TT_liveRange, Func);
  
-  SizeT NumVars = Liveness->getNumVarsInNode(this);
-  LivenessBV &LiveIn = Liveness->getLiveIn(this);
-  LivenessBV &LiveOut = Liveness->getLiveOut(this);
+  const SizeT NumVars = Liveness->getNumVarsInNode(this);
+  const LivenessBV &LiveIn = Liveness->getLiveIn(this);
+  const LivenessBV &LiveOut = Liveness->getLiveOut(this);
    LiveBeginEndMap &MapBegin = *Liveness->getLiveBegin(this);
    LiveBeginEndMap &MapEnd = *Liveness->getLiveEnd(this);
    std::sort(MapBegin.begin(), MapBegin.end());
    std::sort(MapEnd.begin(), MapEnd.end());
-  // Verify there are no duplicates.
-  struct ComparePair {
-    bool operator()(const LiveBeginEndMapEntry &A,
-                    const LiveBeginEndMapEntry &B) {
-      return A.first == B.first;
-    }
-  };
-  assert(std::adjacent_find(MapBegin.begin(), MapBegin.end(), ComparePair()) ==
-         MapBegin.end());
-  assert(std::adjacent_find(MapEnd.begin(), MapEnd.end(), ComparePair()) ==
-         MapEnd.end());
-
-  LivenessBV LiveInAndOut = LiveIn;
+
+  if (!livenessValidateIntervals(Liveness)) {
+    llvm::report_fatal_error("livenessAddIntervals: Liveness error");
+    return;
+  }
+
+  LivenessBV &LiveInAndOut = Liveness->getScratchBV();
+  LiveInAndOut = LiveIn;
    LiveInAndOut &= LiveOut;
  
    // Iterate in parallel across the sorted MapBegin[] and MapEnd[].
@@ -674,30 +839,27 @@ void CfgNode::livenessAddIntervals(Liveness *Liveness, InstNumberT FirstInstNum,
      SizeT i1 = IBB == IBE ? NumVars : IBB->first;
      SizeT i2 = IEB == IEE ? NumVars : IEB->first;
      SizeT i = std::min(i1, i2);
-    // i1 is the Variable number of the next MapBegin entry, and i2 is
-    // the Variable number of the next MapEnd entry.  If i1==i2, then
-    // the Variable's live range begins and ends in this block.  If
-    // i1<i2, then i1's live range begins at instruction IBB->second
-    // and extends through the end of the block.  If i1>i2, then i2's
-    // live range begins at the first instruction of the block and
-    // ends at IEB->second.  In any case, we choose the lesser of i1
-    // and i2 and proceed accordingly.
+    // i1 is the Variable number of the next MapBegin entry, and i2 is the
+    // Variable number of the next MapEnd entry. If i1==i2, then the Variable's
+    // live range begins and ends in this block. If i1<i2, then i1's live range
+    // begins at instruction IBB->second and extends through the end of the
+    // block. If i1>i2, then i2's live range begins at the first instruction of
+    // the block and ends at IEB->second. In any case, we choose the lesser of
+    // i1 and i2 and proceed accordingly.
      InstNumberT LB = i == i1 ? IBB->second : FirstInstNum;
      InstNumberT LE = i == i2 ? IEB->second : LastInstNum + 1;
  
      Variable *Var = Liveness->getVariable(i, this);
-    if (!Var->getIgnoreLiveness()) {
-      if (LB > LE) {
-        Var->addLiveRange(FirstInstNum, LE, 1);
-        Var->addLiveRange(LB, LastInstNum + 1, 1);
-        // Assert that Var is a global variable by checking that its
-        // liveness index is less than the number of globals.  This
-        // ensures that the LiveInAndOut[] access is valid.
-        assert(i < Liveness->getNumGlobalVars());
-        LiveInAndOut[i] = false;
-      } else {
-        Var->addLiveRange(LB, LE, 1);
-      }
+    if (LB > LE) {
+      Var->addLiveRange(FirstInstNum, LE);
+      Var->addLiveRange(LB, LastInstNum + 1);
+      // Assert that Var is a global variable by checking that its liveness
+      // index is less than the number of globals. This ensures that the
+      // LiveInAndOut[] access is valid.
+      assert(i < Liveness->getNumGlobalVars());
+      LiveInAndOut[i] = false;
+    } else {
+      Var->addLiveRange(LB, LE);
      }
      if (i == i1)
        ++IBB;
@@ -708,13 +870,14 @@ void CfgNode::livenessAddIntervals(Liveness *Liveness, InstNumberT FirstInstNum,
    for (int i = LiveInAndOut.find_first(); i != -1;
         i = LiveInAndOut.find_next(i)) {
      Variable *Var = Liveness->getVariable(i, this);
-    Var->addLiveRange(FirstInstNum, LastInstNum + 1, 1);
+    if (Liveness->getRangeMask(Var->getIndex()))
+      Var->addLiveRange(FirstInstNum, LastInstNum + 1);
    }
  }
  
  // If this node contains only deleted instructions, and ends in an
-// unconditional branch, contract the node by repointing all its
-// in-edges to its successor.
+// unconditional branch, contract the node by repointing all its in-edges to
+// its successor.
  void CfgNode::contractIfEmpty() {
    if (InEdges.empty())
      return;
@@ -727,43 +890,58 @@ void CfgNode::contractIfEmpty() {
      else if (!I.isRedundantAssign())
        return;
    }
+  // Make sure there is actually a successor to repoint in-edges to.
+  if (OutEdges.empty())
+    return;
+  assert(hasSingleOutEdge());
+  // Don't try to delete a self-loop.
+  if (OutEdges[0] == this)
+    return;
+  // Make sure the node actually contains (ends with) an unconditional branch.
+  if (Branch == nullptr)
+    return;
+
    Branch->setDeleted();
-  assert(OutEdges.size() == 1);
-  // Repoint all this node's in-edges to this node's successor, unless
-  // this node's successor is actually itself (in which case the
-  // statement "OutEdges.front()->InEdges.push_back(Pred)" could
-  // invalidate the iterator over this->InEdges).
-  if (OutEdges.front() != this) {
+  CfgNode *Successor = OutEdges.front();
+  // Repoint all this node's in-edges to this node's successor, unless this
+  // node's successor is actually itself (in which case the statement
+  // "Successor->InEdges.push_back(Pred)" could invalidate the iterator
+  // over this->InEdges).
+  if (Successor != this) {
      for (CfgNode *Pred : InEdges) {
-      for (auto I = Pred->OutEdges.begin(), E = Pred->OutEdges.end(); I != E;
-           ++I) {
-        if (*I == this) {
-          *I = OutEdges.front();
-          OutEdges.front()->InEdges.push_back(Pred);
+      for (CfgNode *&I : Pred->OutEdges) {
+        if (I == this) {
+          I = Successor;
+          Successor->InEdges.push_back(Pred);
          }
        }
        for (Inst &I : Pred->getInsts()) {
          if (!I.isDeleted())
-          I.repointEdge(this, OutEdges.front());
+          I.repointEdges(this, Successor);
        }
      }
+
+    // Remove the in-edge to the successor to allow node reordering to make
+    // better decisions. For example it's more helpful to place a node after a
+    // reachable predecessor than an unreachable one (like the one we just
+    // contracted).
+    Successor->InEdges.erase(
+        std::find(Successor->InEdges.begin(), Successor->InEdges.end(), this));
    }
    InEdges.clear();
-  // Don't bother removing the single out-edge, which would also
-  // require finding the corresponding in-edge in the successor and
-  // removing it.
  }
  
  void CfgNode::doBranchOpt(const CfgNode *NextNode) {
    TargetLowering *Target = Func->getTarget();
-  // Check every instruction for a branch optimization opportunity.
-  // It may be more efficient to iterate in reverse and stop after the
-  // first opportunity, unless there is some target lowering where we
-  // have the possibility of multiple such optimizations per block
-  // (currently not the case for x86 lowering).
-  for (Inst &I : Insts) {
+  // Find the first opportunity for branch optimization (which will be the last
+  // instruction in the block) and stop. This is sufficient unless there is
+  // some target lowering where we have the possibility of multiple
+  // optimizations per block. Take care with switch lowering as there are
+  // multiple unconditional branches and only the last can be deleted.
+  for (Inst &I : reverse_range(Insts)) {
      if (!I.isDeleted()) {
        Target->doBranchOpt(&I, NextNode);
+      return;
      }
    }
  }
@@ -775,67 +953,94 @@ namespace {
  // Helper functions for emit().
  
  void emitRegisterUsage(Ostream &Str, const Cfg *Func, const CfgNode *Node,
-                       bool IsLiveIn, std::vector<SizeT> &LiveRegCount) {
-  if (!ALLOW_DUMP)
+                       bool IsLiveIn, CfgVector<SizeT> &LiveRegCount) {
+  if (!BuildDefs::dump())
      return;
    Liveness *Liveness = Func->getLiveness();
    const LivenessBV *Live;
+  const auto StackReg = Func->getTarget()->getStackReg();
+  const auto FrameOrStackReg = Func->getTarget()->getFrameOrStackReg();
    if (IsLiveIn) {
      Live = &Liveness->getLiveIn(Node);
-    Str << "\t\t\t\t# LiveIn=";
+    Str << "\t\t\t\t/* LiveIn=";
    } else {
      Live = &Liveness->getLiveOut(Node);
-    Str << "\t\t\t\t# LiveOut=";
+    Str << "\t\t\t\t/* LiveOut=";
    }
    if (!Live->empty()) {
-    bool First = true;
+    CfgVector<Variable *> LiveRegs;
      for (SizeT i = 0; i < Live->size(); ++i) {
-      if ((*Live)[i]) {
-        Variable *Var = Liveness->getVariable(i, Node);
-        if (Var->hasReg()) {
-          if (IsLiveIn)
-            ++LiveRegCount[Var->getRegNum()];
-          if (!First)
-            Str << ",";
-          First = false;
-          Var->emit(Func);
-        }
-      }
+      if (!(*Live)[i])
+        continue;
+      Variable *Var = Liveness->getVariable(i, Node);
+      if (!Var->hasReg())
+        continue;
+      const auto RegNum = Var->getRegNum();
+      if (RegNum == StackReg || RegNum == FrameOrStackReg)
+        continue;
+      if (IsLiveIn)
+        ++LiveRegCount[RegNum];
+      LiveRegs.push_back(Var);
+    }
+    // Sort the variables by regnum so they are always printed in a familiar
+    // order.
+    std::sort(LiveRegs.begin(), LiveRegs.end(),
+              [](const Variable *V1, const Variable *V2) {
+                return unsigned(V1->getRegNum()) < unsigned(V2->getRegNum());
+              });
+    bool First = true;
+    for (Variable *Var : LiveRegs) {
+      if (!First)
+        Str << ",";
+      First = false;
+      Var->emit(Func);
      }
    }
-  Str << "\n";
+  Str << " */\n";
  }
  
-void emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr,
-                         std::vector<SizeT> &LiveRegCount) {
-  if (!ALLOW_DUMP)
-    return;
-  bool First = true;
+/// Returns true if some text was emitted - in which case the caller definitely
+/// needs to emit a newline character.
+bool emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr,
+                         CfgVector<SizeT> &LiveRegCount) {
+  bool Printed = false;
+  if (!BuildDefs::dump())
+    return Printed;
    Variable *Dest = Instr->getDest();
-  if (Dest && Dest->hasReg())
+  // Normally we increment the live count for the dest register. But we
+  // shouldn't if the instruction's IsDestRedefined flag is set, because this
+  // means that the target lowering created this instruction as a non-SSA
+  // assignment; i.e., a different, previous instruction started the dest
+  // variable's live range.
+  if (!Instr->isDestRedefined() && Dest && Dest->hasReg())
      ++LiveRegCount[Dest->getRegNum()];
-  for (SizeT I = 0; I < Instr->getSrcSize(); ++I) {
-    Operand *Src = Instr->getSrc(I);
-    SizeT NumVars = Src->getNumVars();
-    for (SizeT J = 0; J < NumVars; ++J) {
-      const Variable *Var = Src->getVar(J);
-      if (Var->hasReg()) {
-        if (Instr->isLastUse(Var) && --LiveRegCount[Var->getRegNum()] == 0) {
-          if (First)
-            Str << " \t# END=";
-          else
-            Str << ",";
-          Var->emit(Func);
-          First = false;
-        }
-      }
+  FOREACH_VAR_IN_INST(Var, *Instr) {
+    bool ShouldReport = Instr->isLastUse(Var);
+    if (ShouldReport && Var->hasReg()) {
+      // Don't report end of live range until the live count reaches 0.
+      SizeT NewCount = --LiveRegCount[Var->getRegNum()];
+      if (NewCount)
+        ShouldReport = false;
+    }
+    if (ShouldReport) {
+      if (Printed)
+        Str << ",";
+      else
+        Str << " \t/* END=";
+      Var->emit(Func);
+      Printed = true;
      }
    }
+  if (Printed)
+    Str << " */";
+  return Printed;
  }
  
  void updateStats(Cfg *Func, const Inst *I) {
-  // Update emitted instruction count, plus fill/spill count for
-  // Variable operands without a physical register.
+  if (!BuildDefs::dump())
+    return;
+  // Update emitted instruction count, plus fill/spill count for Variable
+  // operands without a physical register.
    if (uint32_t Count = I->getEmitInstCount()) {
      Func->getContext()->statsUpdateEmitted(Count);
      if (Variable *Dest = I->getDest()) {
@@ -843,7 +1048,7 @@ void updateStats(Cfg *Func, const Inst *I) {
          Func->getContext()->statsUpdateFills();
      }
      for (SizeT S = 0; S < I->getSrcSize(); ++S) {
-      if (Variable *Src = llvm::dyn_cast<Variable>(I->getSrc(S))) {
+      if (auto *Src = llvm::dyn_cast<Variable>(I->getSrc(S))) {
          if (!Src->hasReg())
            Func->getContext()->statsUpdateSpills();
        }
@@ -854,17 +1059,36 @@ void updateStats(Cfg *Func, const Inst *I) {
  } // end of anonymous namespace
  
  void CfgNode::emit(Cfg *Func) const {
-  if (!ALLOW_DUMP)
+  if (!BuildDefs::dump())
      return;
    Func->setCurrentNode(this);
    Ostream &Str = Func->getContext()->getStrEmit();
    Liveness *Liveness = Func->getLiveness();
-  bool DecorateAsm =
-      Liveness && Func->getContext()->getFlags().getDecorateAsm();
+  const bool DecorateAsm = Liveness && getFlags().getDecorateAsm();
    Str << getAsmName() << ":\n";
-  std::vector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters());
-  if (DecorateAsm)
-    emitRegisterUsage(Str, Func, this, true, LiveRegCount);
+  // LiveRegCount keeps track of the number of currently live variables that
+  // each register is assigned to. Normally that would be only 0 or 1, but the
+  // register allocator's AllowOverlap inference allows it to be greater than 1
+  // for short periods.
+  CfgVector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters());
+  if (DecorateAsm) {
+    constexpr bool IsLiveIn = true;
+    emitRegisterUsage(Str, Func, this, IsLiveIn, LiveRegCount);
+    if (getInEdges().size()) {
+      Str << "\t\t\t\t/* preds=";
+      bool First = true;
+      for (CfgNode *I : getInEdges()) {
+        if (!First)
+          Str << ",";
+        First = false;
+        Str << "$" << I->getName();
+      }
+      Str << " */\n";
+    }
+    if (getLoopNestDepth()) {
+      Str << "\t\t\t\t/* loop depth=" << getLoopNestDepth() << " */\n";
+    }
+  }
  
    for (const Inst &I : Phis) {
      if (I.isDeleted())
@@ -876,50 +1100,280 @@ void CfgNode::emit(Cfg *Func) const {
      if (I.isDeleted())
        continue;
      if (I.isRedundantAssign()) {
+      // Usually, redundant assignments end the live range of the src variable
+      // and begin the live range of the dest variable, with no net effect on
+      // the liveness of their register. However, if the register allocator
+      // infers the AllowOverlap condition, then this may be a redundant
+      // assignment that does not end the src variable's live range, in which
+      // case the active variable count for that register needs to be bumped.
+      // That normally would have happened as part of emitLiveRangesEnded(),
+      // but that isn't called for redundant assignments.
        Variable *Dest = I.getDest();
-      if (DecorateAsm && Dest->hasReg() && !I.isLastUse(I.getSrc(0)))
+      if (DecorateAsm && Dest->hasReg()) {
          ++LiveRegCount[Dest->getRegNum()];
+        if (I.isLastUse(I.getSrc(0)))
+          --LiveRegCount[llvm::cast<Variable>(I.getSrc(0))->getRegNum()];
+      }
        continue;
      }
      I.emit(Func);
+    bool Printed = false;
      if (DecorateAsm)
-      emitLiveRangesEnded(Str, Func, &I, LiveRegCount);
-    Str << "\n";
+      Printed = emitLiveRangesEnded(Str, Func, &I, LiveRegCount);
+    if (Printed || llvm::isa<InstTarget>(&I))
+      Str << "\n";
      updateStats(Func, &I);
    }
-  if (DecorateAsm)
-    emitRegisterUsage(Str, Func, this, false, LiveRegCount);
+  if (DecorateAsm) {
+    constexpr bool IsLiveIn = false;
+    emitRegisterUsage(Str, Func, this, IsLiveIn, LiveRegCount);
+  }
  }
  
+// Helper class for emitIAS().
+namespace {
+class BundleEmitHelper {
+  BundleEmitHelper() = delete;
+  BundleEmitHelper(const BundleEmitHelper &) = delete;
+  BundleEmitHelper &operator=(const BundleEmitHelper &) = delete;
+
+public:
+  BundleEmitHelper(Assembler *Asm, const InstList &Insts)
+      : Asm(Asm), End(Insts.end()), BundleLockStart(End),
+        BundleSize(1 << Asm->getBundleAlignLog2Bytes()),
+        BundleMaskLo(BundleSize - 1), BundleMaskHi(~BundleMaskLo) {}
+  // Check whether we're currently within a bundle_lock region.
+  bool isInBundleLockRegion() const { return BundleLockStart != End; }
+  // Check whether the current bundle_lock region has the align_to_end option.
+  bool isAlignToEnd() const {
+    assert(isInBundleLockRegion());
+    return llvm::cast<InstBundleLock>(getBundleLockStart())->getOption() ==
+           InstBundleLock::Opt_AlignToEnd;
+  }
+  bool isPadToEnd() const {
+    assert(isInBundleLockRegion());
+    return llvm::cast<InstBundleLock>(getBundleLockStart())->getOption() ==
+           InstBundleLock::Opt_PadToEnd;
+  }
+  // Check whether the entire bundle_lock region falls within the same bundle.
+  bool isSameBundle() const {
+    assert(isInBundleLockRegion());
+    return SizeSnapshotPre == SizeSnapshotPost ||
+           (SizeSnapshotPre & BundleMaskHi) ==
+               ((SizeSnapshotPost - 1) & BundleMaskHi);
+  }
+  // Get the bundle alignment of the first instruction of the bundle_lock
+  // region.
+  intptr_t getPreAlignment() const {
+    assert(isInBundleLockRegion());
+    return SizeSnapshotPre & BundleMaskLo;
+  }
+  // Get the bundle alignment of the first instruction past the bundle_lock
+  // region.
+  intptr_t getPostAlignment() const {
+    assert(isInBundleLockRegion());
+    return SizeSnapshotPost & BundleMaskLo;
+  }
+  // Get the iterator pointing to the bundle_lock instruction, e.g. to roll
+  // back the instruction iteration to that point.
+  InstList::const_iterator getBundleLockStart() const {
+    assert(isInBundleLockRegion());
+    return BundleLockStart;
+  }
+  // Set up bookkeeping when the bundle_lock instruction is first processed.
+  void enterBundleLock(InstList::const_iterator I) {
+    assert(!isInBundleLockRegion());
+    BundleLockStart = I;
+    SizeSnapshotPre = Asm->getBufferSize();
+    Asm->setPreliminary(true);
+    assert(isInBundleLockRegion());
+  }
+  // Update bookkeeping when the bundle_unlock instruction is processed.
+  void enterBundleUnlock() {
+    assert(isInBundleLockRegion());
+    SizeSnapshotPost = Asm->getBufferSize();
+  }
+  // Update bookkeeping when we are completely finished with the bundle_lock
+  // region.
+  void leaveBundleLockRegion() { BundleLockStart = End; }
+  // Check whether the instruction sequence fits within the current bundle, and
+  // if not, add nop padding to the end of the current bundle.
+  void padToNextBundle() {
+    assert(isInBundleLockRegion());
+    if (!isSameBundle()) {
+      intptr_t PadToNextBundle = BundleSize - getPreAlignment();
+      Asm->padWithNop(PadToNextBundle);
+      SizeSnapshotPre += PadToNextBundle;
+      SizeSnapshotPost += PadToNextBundle;
+      assert((Asm->getBufferSize() & BundleMaskLo) == 0);
+      assert(Asm->getBufferSize() == SizeSnapshotPre);
+    }
+  }
+  // If align_to_end is specified, add padding such that the instruction
+  // sequence ends precisely at a bundle boundary.
+  void padForAlignToEnd() {
+    assert(isInBundleLockRegion());
+    if (isAlignToEnd()) {
+      if (intptr_t Offset = getPostAlignment()) {
+        Asm->padWithNop(BundleSize - Offset);
+        SizeSnapshotPre = Asm->getBufferSize();
+      }
+    }
+  }
+  // If pad_to_end is specified, add padding such that the first instruction
+  // after the instruction sequence starts at a bundle boundary.
+  void padForPadToEnd() {
+    assert(isInBundleLockRegion());
+    if (isPadToEnd()) {
+      if (intptr_t Offset = getPostAlignment()) {
+        Asm->padWithNop(BundleSize - Offset);
+        SizeSnapshotPre = Asm->getBufferSize();
+      }
+    }
+  } // Update bookkeeping when rolling back for the second pass.
+  void rollback() {
+    assert(isInBundleLockRegion());
+    Asm->setBufferSize(SizeSnapshotPre);
+    Asm->setPreliminary(false);
+  }
+
+private:
+  Assembler *const Asm;
+  // End is a sentinel value such that BundleLockStart==End implies that we are
+  // not in a bundle_lock region.
+  const InstList::const_iterator End;
+  InstList::const_iterator BundleLockStart;
+  const intptr_t BundleSize;
+  // Masking with BundleMaskLo identifies an address's bundle offset.
+  const intptr_t BundleMaskLo;
+  // Masking with BundleMaskHi identifies an address's bundle.
+  const intptr_t BundleMaskHi;
+  intptr_t SizeSnapshotPre = 0;
+  intptr_t SizeSnapshotPost = 0;
+};
+
+} // end of anonymous namespace
+
  // Emits this node through the integrated assembler (IAS). Binds the node
  // label, emits any non-deleted Phis, then emits the regular instructions,
  // skipping deleted instructions and redundant assignments. Without
  // sandboxing each instruction is emitted exactly once. With sandboxing,
  // every bundle_lock region (explicit, or implicit around a single
  // instruction) is emitted twice: a preliminary pass, then a rollback plus
  // nop padding, then the final pass.
  void CfgNode::emitIAS(Cfg *Func) const {
    Func->setCurrentNode(this);
    Assembler *Asm = Func->getAssembler<>();
-  Asm->BindCfgNodeLabel(getIndex());
+  // TODO(stichnot): When sandboxing, defer binding the node label until just
+  // before the first instruction is emitted, to reduce the chance that a
+  // padding nop is a branch target.
+  Asm->bindCfgNodeLabel(this);
    for (const Inst &I : Phis) {
      if (I.isDeleted())
        continue;
      // Emitting a Phi instruction should cause an error.
      I.emitIAS(Func);
    }
-  for (const Inst &I : Insts) {
-    if (I.isDeleted())
+
+  // Do the simple emission if not sandboxed.
+  if (!getFlags().getUseSandboxing()) {
+    for (const Inst &I : Insts) {
+      if (!I.isDeleted() && !I.isRedundantAssign()) {
+        I.emitIAS(Func);
+        updateStats(Func, &I);
+      }
+    }
+    return;
+  }
+
+  // The remainder of the function handles emission with sandboxing. There are
+  // explicit bundle_lock regions delimited by bundle_lock and bundle_unlock
+  // instructions. All other instructions are treated as an implicit
+  // one-instruction bundle_lock region. Emission is done twice for each
+  // bundle_lock region. The first pass is a preliminary pass, after which we
+  // can figure out what nop padding is needed, then roll back, and make the
+  // final pass.
+  //
+  // Ideally, the first pass would be speculative and the second pass would
+  // only be done if nop padding were needed, but the structure of the
+  // integrated assembler makes it hard to roll back the state of label
+  // bindings, label links, and relocation fixups. Instead, the first pass just
+  // disables all mutation of that state.
+
+  BundleEmitHelper Helper(Asm, Insts);
+  InstList::const_iterator End = Insts.end();
+  // Retrying indicates that we had to roll back to the bundle_lock instruction
+  // to apply padding before the bundle_lock sequence.
+  bool Retrying = false;
+  for (InstList::const_iterator I = Insts.begin(); I != End; ++I) {
+    if (I->isDeleted() || I->isRedundantAssign())
        continue;
-    if (I.isRedundantAssign())
+
+    if (llvm::isa<InstBundleLock>(I)) {
+      // Set up the initial bundle_lock state. This should not happen while
+      // retrying, because the retry rolls back to the instruction following
+      // the bundle_lock instruction.
+      assert(!Retrying);
+      Helper.enterBundleLock(I);
        continue;
-    I.emitIAS(Func);
-    updateStats(Func, &I);
+    }
+
+    if (llvm::isa<InstBundleUnlock>(I)) {
+      Helper.enterBundleUnlock();
+      if (Retrying) {
+        // Make sure all instructions are in the same bundle.
+        assert(Helper.isSameBundle());
+        // If align_to_end is specified, make sure the next instruction begins
+        // the bundle.
+        assert(!Helper.isAlignToEnd() || Helper.getPostAlignment() == 0);
+        Helper.padForPadToEnd();
+        Helper.leaveBundleLockRegion();
+        Retrying = false;
+      } else {
+        // This is the first pass, so roll back for the retry pass.
+        Helper.rollback();
+        // Pad to the next bundle if the instruction sequence crossed a bundle
+        // boundary.
+        Helper.padToNextBundle();
+        // Insert additional padding to make AlignToEnd work.
+        Helper.padForAlignToEnd();
+        // Prepare for the retry pass after padding is done.
+        Retrying = true;
+        // Presumably this is the bundle_lock instruction itself; the loop's
+        // ++I then resumes emission at the instruction that follows it.
+        I = Helper.getBundleLockStart();
+      }
+      continue;
+    }
+
+    // I points to a non bundle_lock/bundle_unlock instruction.
+    if (Helper.isInBundleLockRegion()) {
+      I->emitIAS(Func);
+      // Only update stats during the final pass.
+      if (Retrying)
+        updateStats(Func, iteratorToInst(I));
+    } else {
+      // Treat it as though there were an implicit bundle_lock and
+      // bundle_unlock wrapping the instruction.
+      Helper.enterBundleLock(I);
+      // First emission: preliminary pass used only to size the instruction.
+      I->emitIAS(Func);
+      Helper.enterBundleUnlock();
+      Helper.rollback();
+      Helper.padToNextBundle();
+      // Second emission: final pass, so stats are recorded here.
+      I->emitIAS(Func);
+      updateStats(Func, iteratorToInst(I));
+      Helper.leaveBundleLockRegion();
+    }
    }
+
+  // Don't allow bundle locking across basic blocks, to keep the backtracking
+  // mechanism simple.
+  assert(!Helper.isInBundleLockRegion());
+  assert(!Retrying);
  }
  
  void CfgNode::dump(Cfg *Func) const {
-  if (!ALLOW_DUMP)
+  if (!BuildDefs::dump())
      return;
    Func->setCurrentNode(this);
    Ostream &Str = Func->getContext()->getStrDump();
    Liveness *Liveness = Func->getLiveness();
-  if (Func->isVerbose(IceV_Instructions)) {
+  if (Func->isVerbose(IceV_Instructions) || Func->isVerbose(IceV_Loop))
      Str << getName() << ":\n";
-  }
+  // Dump the loop nest depth
+  if (Func->isVerbose(IceV_Loop))
+    Str << "    // LoopNestDepth = " << getLoopNestDepth() << "\n";
    // Dump list of predecessor nodes.
    if (Func->isVerbose(IceV_Preds) && !InEdges.empty()) {
      Str << "    // preds = ";
@@ -933,23 +1387,23 @@ void CfgNode::dump(Cfg *Func) const {
      Str << "\n";
    }
    // Dump the live-in variables.
-  LivenessBV LiveIn;
-  if (Liveness)
-    LiveIn = Liveness->getLiveIn(this);
-  if (Func->isVerbose(IceV_Liveness) && !LiveIn.empty()) {
-    Str << "    // LiveIn:";
-    for (SizeT i = 0; i < LiveIn.size(); ++i) {
-      if (LiveIn[i]) {
-        Variable *Var = Liveness->getVariable(i, this);
-        Str << " %" << Var->getName(Func);
-        if (Func->isVerbose(IceV_RegOrigins) && Var->hasReg()) {
-          Str << ":"
-              << Func->getTarget()->getRegName(Var->getRegNum(),
-                                               Var->getType());
+  if (Func->isVerbose(IceV_Liveness)) {
+    if (Liveness != nullptr && !Liveness->getLiveIn(this).empty()) {
+      const LivenessBV &LiveIn = Liveness->getLiveIn(this);
+      Str << "    // LiveIn:";
+      for (SizeT i = 0; i < LiveIn.size(); ++i) {
+        if (LiveIn[i]) {
+          Variable *Var = Liveness->getVariable(i, this);
+          Str << " %" << Var->getName();
+          if (Func->isVerbose(IceV_RegOrigins) && Var->hasReg()) {
+            Str << ":"
+                << Func->getTarget()->getRegName(Var->getRegNum(),
+                                                 Var->getType());
+          }
          }
        }
+      Str << "\n";
      }
-    Str << "\n";
    }
    // Dump each instruction.
    if (Func->isVerbose(IceV_Instructions)) {
@@ -959,23 +1413,23 @@ void CfgNode::dump(Cfg *Func) const {
        I.dumpDecorated(Func);
    }
    // Dump the live-out variables.
-  LivenessBV LiveOut;
-  if (Liveness)
-    LiveOut = Liveness->getLiveOut(this);
-  if (Func->isVerbose(IceV_Liveness) && !LiveOut.empty()) {
-    Str << "    // LiveOut:";
-    for (SizeT i = 0; i < LiveOut.size(); ++i) {
-      if (LiveOut[i]) {
-        Variable *Var = Liveness->getVariable(i, this);
-        Str << " %" << Var->getName(Func);
-        if (Func->isVerbose(IceV_RegOrigins) && Var->hasReg()) {
-          Str << ":"
-              << Func->getTarget()->getRegName(Var->getRegNum(),
-                                               Var->getType());
+  if (Func->isVerbose(IceV_Liveness)) {
+    if (Liveness != nullptr && !Liveness->getLiveOut(this).empty()) {
+      const LivenessBV &LiveOut = Liveness->getLiveOut(this);
+      Str << "    // LiveOut:";
+      for (SizeT i = 0; i < LiveOut.size(); ++i) {
+        if (LiveOut[i]) {
+          Variable *Var = Liveness->getVariable(i, this);
+          Str << " %" << Var->getName();
+          if (Func->isVerbose(IceV_RegOrigins) && Var->hasReg()) {
+            Str << ":"
+                << Func->getTarget()->getRegName(Var->getRegNum(),
+                                                 Var->getType());
+          }
          }
        }
+      Str << "\n";
      }
-    Str << "\n";
    }
    // Dump list of successor nodes.
    if (Func->isVerbose(IceV_Succs)) {
@@ -991,4 +1445,31 @@ void CfgNode::dump(Cfg *Func) const {
    }
  }
  
+// Prepends to this node's instruction list an intrinsic call to
+// llvm.nacl.atomic.rmw.i64 that performs an atomic add of 1 on the symbol
+// named by Var, with acquire-release memory ordering. The call's result goes
+// to a fresh i64 variable that is not otherwise used here.
+void CfgNode::profileExecutionCount(VariableDeclaration *Var) {
+  GlobalContext *Ctx = Func->getContext();
+  GlobalString RMW_I64 = Ctx->getGlobalString("llvm.nacl.atomic.rmw.i64");
+
+  // The intrinsic is looked up by name; both asserts require that it is found.
+  bool BadIntrinsic = false;
+  const Intrinsics::FullIntrinsicInfo *Info =
+      Ctx->getIntrinsicsInfo().find(RMW_I64, BadIntrinsic);
+  assert(!BadIntrinsic);
+  assert(Info != nullptr);
+
+  Operand *RMWI64Name = Ctx->getConstantExternSym(RMW_I64);
+  constexpr RelocOffsetT Offset = 0;
+  Constant *Counter = Ctx->getConstantSym(Offset, Var->getName());
+  Constant *AtomicRMWOp = Ctx->getConstantInt32(Intrinsics::AtomicAdd);
+  Constant *One = Ctx->getConstantInt64(1);
+  Constant *OrderAcquireRelease =
+      Ctx->getConstantInt32(Intrinsics::MemoryOrderAcquireRelease);
+
+  // NOTE(review): 5 is passed to create() but only four arguments are added
+  // below. Presumably this is an argument-count/capacity parameter; confirm
+  // whether the extra slot is intentional.
+  auto *Instr = InstIntrinsicCall::create(
+      Func, 5, Func->makeVariable(IceType_i64), RMWI64Name, Info->Info);
+  Instr->addArg(AtomicRMWOp);
+  Instr->addArg(Counter);
+  Instr->addArg(One);
+  Instr->addArg(OrderAcquireRelease);
+  // push_front places the counter update ahead of the node's existing
+  // instructions.
+  Insts.push_front(Instr);
+}
+
  } // end of namespace Ice