From 72374d2610ac17b98d435887244d2c61746d77dc Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Fri, 29 Jan 2016 16:11:18 +0000
Subject: [PATCH] [ScheduleDAGInstrs::buildSchedGraph()] Handling of memory
 dependecies rewritten.

The buildSchedGraph() was in need of reworking as the AA features had been
added on top of earlier code. It was very difficult to understand, and buggy.
There had been found cases where scheduling dependencies had actually been
missed (see r228686).

AliasChain, RejectMemNodes, adjustChainDeps() and iterateChainSucc() have
been removed. There are instead now just the four maps from Value to SUs, which
have been renamed to Stores, Loads, NonAliasStores and NonAliasLoads.

An unknown store used to become the AliasChain, but now becomes a store mapped
to 'unknownValue' (in Stores). What used to be PendingLoads is instead the
list of SUs mapped to 'unknownValue' in Loads.

RejectMemNodes and adjustChainDeps() used to be a safety-net for everything.
The SU maps were sometimes cleared and SUs were put in RejectMemNodes, where
adjustChainDeps() would look. Instead of this, a more straight forward approach
is used in maintaining the SU maps without clearing them and simply letting
them grow over time. Instead of the cutt-off in adjustChainDeps() search, a
reduction of maps will be done if needed (see below).

Each SUnit either becomes the BarrierChain, or is put into one of the maps. For
each SUnit encountered, all the information about previous ones are still
available until a new BarrierChain is set, at which point the maps are cleared.

For huge regions, the algorithm becomes slow, therefore the maps will get
reduced at a threshold (current default is 1000 nodes), by a fraction (default 1/2).
These values can be tuned by use of CL options in case some test case shows that
they need to be changed (-dag-maps-huge-region and -dag-maps-reduction-size).

There has not been any considerable change observed in output quality or compile
time. There may now be more DAG edges inserted than before (i.e. if A->B->C,
then A->C is not needed). However, in a comparison run there were fewer total
calls to AA, and a somewhat improved compile time, which means this seems to
be not a problem.

http://reviews.llvm.org/D8705
Reviewers: Hal Finkel, Andy Trick.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@259201 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/PseudoSourceValue.h |   4 +
 include/llvm/CodeGen/ScheduleDAG.h       |  11 +
 include/llvm/CodeGen/ScheduleDAGInstrs.h |  70 ++-
 lib/CodeGen/ScheduleDAGInstrs.cpp        | 723 ++++++++++++++++---------------
 4 files changed, 457 insertions(+), 351 deletions(-)

diff --git a/include/llvm/CodeGen/PseudoSourceValue.h b/include/llvm/CodeGen/PseudoSourceValue.h
index f67552030db..c3f6fde9fb3 100644
--- a/include/llvm/CodeGen/PseudoSourceValue.h
+++ b/include/llvm/CodeGen/PseudoSourceValue.h
@@ -27,6 +27,8 @@ class MachineMemOperand;
 class raw_ostream;
 
 raw_ostream &operator<<(raw_ostream &OS, const MachineMemOperand &MMO);
+class PseudoSourceValue;
+raw_ostream &operator<<(raw_ostream &OS, const PseudoSourceValue* PSV);
 
 /// Special value supplied for machine level alias analysis. It indicates that
 /// a memory access references the functions stack frame (e.g., a spill slot),
@@ -45,6 +47,8 @@ public:
 
 private:
   PSVKind Kind;
+  friend raw_ostream &llvm::operator<<(raw_ostream &OS,
+                                       const PseudoSourceValue* PSV);
 
   friend class MachineMemOperand; // For printCustom().
 
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index bda9dbd51ff..6469cabd3de 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -396,6 +396,17 @@ namespace llvm {
     /// specified node.
     bool addPred(const SDep &D, bool Required = true);
 
+    /// addPredBarrier - This adds a barrier edge to SU by calling
+    /// addPred(), with latency 0 generally or latency 1 for a store
+    /// followed by a load.
+    bool addPredBarrier(SUnit *SU) {
+      SDep Dep(SU, SDep::Barrier);
+      unsigned TrueMemOrderLatency =
+        ((SU->getInstr()->mayStore() && this->getInstr()->mayLoad()) ? 1 : 0);
+      Dep.setLatency(TrueMemOrderLatency);
+      return addPred(Dep);
+    }
+
     /// removePred - This removes the specified edge as a pred of the current
     /// node if it exists.  It also removes the current node as a successor of
     /// the specified node.
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 70537d1895f..a83a5158689 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -15,12 +15,14 @@
 #ifndef LLVM_CODEGEN_SCHEDULEDAGINSTRS_H
 #define LLVM_CODEGEN_SCHEDULEDAGINSTRS_H
 
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SparseMultiSet.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include <list>
 
 namespace llvm {
   class MachineFrameInfo;
@@ -84,6 +86,10 @@ namespace llvm {
   typedef SparseMultiSet<VReg2SUnitOperIdx, VirtReg2IndexFunctor>
     VReg2SUnitOperIdxMultiMap;
 
+  typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+  typedef SmallVector<PointerIntPair<ValueType, 1, bool>, 4>
+          UnderlyingObjectsVector;
+
   /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
   /// MachineInstrs.
   class ScheduleDAGInstrs : public ScheduleDAG {
@@ -149,10 +155,66 @@ namespace llvm {
     /// Tracks the last instructions in this region using each virtual register.
     VReg2SUnitOperIdxMultiMap CurrentVRegUses;
 
-    /// PendingLoads - Remember where unknown loads are after the most recent
-    /// unknown store, as we iterate. As with Defs and Uses, this is here
-    /// to minimize construction/destruction.
-    std::vector<SUnit *> PendingLoads;
+    AliasAnalysis *AAForDep;
+
+    /// Remember a generic side-effecting instruction as we proceed.
+    /// No other SU ever gets scheduled around it (except in the special
+    /// case of a huge region that gets reduced).
+    SUnit *BarrierChain;
+
+  public:
+
+    /// A list of SUnits, used in Value2SUsMap, during DAG construction.
+    /// Note: to gain speed it might be worth investigating an optimized
+    /// implementation of this data structure, such as a singly linked list
+    /// with a memory pool (SmallVector was tried but slow and SparseSet is not
+    /// applicable).
+    typedef std::list<SUnit *> SUList;
+  protected:
+    /// A map from ValueType to SUList, used during DAG construction,
+    /// as a means of remembering which SUs depend on which memory
+    /// locations.
+    class Value2SUsMap;
+
+    /// Remove in FIFO order some SUs from huge maps.
+    void reduceHugeMemNodeMaps(Value2SUsMap &stores,
+                               Value2SUsMap &loads, unsigned N);
+
+    /// Add a chain edge between SUa and SUb, but only if both AliasAnalysis
+    /// and Target fail to deny the dependency.
+    void addChainDependency(SUnit *SUa, SUnit *SUb,
+                            unsigned Latency = 0);
+
+    /// Add dependencies as needed from all SUs in list to SU.
+    void addChainDependencies(SUnit *SU, SUList &sus, unsigned Latency) {
+      for (auto *su : sus)
+        addChainDependency(SU, su, Latency);
+    }
+
+    /// Add dependencies as needed from all SUs in map, to SU.
+    void addChainDependencies(SUnit *SU, Value2SUsMap &Val2SUsMap);
+
+    /// Add dependencies as needed to SU, from all SUs mapped to V.
+    void addChainDependencies(SUnit *SU, Value2SUsMap &Val2SUsMap,
+                              ValueType V);
+
+    /// Add barrier chain edges from all SUs in map, and then clear
+    /// the map. This is equivalent to insertBarrierChain(), but
+    /// optimized for the common case where the new BarrierChain (a
+    /// global memory object) has a higher NodeNum than all SUs in
+    /// map. It is assumed BarrierChain has been set before calling
+    /// this.
+    void addBarrierChain(Value2SUsMap &map);
+
+    /// Insert a barrier chain in a huge region, far below current
+    /// SU. Add barrier chain edges from all SUs in map with higher
+    /// NodeNums than this new BarrierChain, and remove them from
+    /// map. It is assumed BarrierChain has been set before calling
+    /// this.
+    void insertBarrierChain(Value2SUsMap &map);
+
+    /// For an unanalyzable memory access, this Value is used in maps.
+    UndefValue *UnknownValue;
 
     /// DbgValues - Remember instruction that precedes DBG_VALUE.
     /// These are generated by buildSchedGraph but persist so they can be
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 00a0b0fc33a..37695f12e3b 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/ADT/IntEqClasses.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -28,6 +27,8 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -50,12 +51,42 @@ static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
     cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
 
+// Note: the two options below might be used in tuning compile time vs
+// output quality. Setting HugeRegion so large that it will never be
+// reached means best-effort, but may be slow.
+
+// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
+// together hold this many SUs, a reduction of maps will be done.
+static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
+    cl::init(1000), cl::desc("The limit to use while constructing the DAG "
+                             "prior to scheduling, at which point a trade-off "
+                             "is made to avoid excessive compile time."));
+
+static cl::opt<unsigned> ReductionSize("dag-maps-reduction-size", cl::Hidden,
+    cl::desc("A huge scheduling region will have maps reduced by this many "
+	     "nodes at a time. Defaults to HugeRegion / 2."));
+
+static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  dbgs() << "{ ";
+  for (auto *su : L) {
+    dbgs() << "SU(" << su->NodeNum << ")";
+    if (su != L.back())
+      dbgs() << ", ";
+  }
+  dbgs() << "}\n";
+#endif
+}
+
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo *mli,
                                      bool RemoveKillFlags)
     : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
       RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
-      TrackLaneMasks(false), FirstDbgValue(nullptr) {
+      TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr),
+      UnknownValue(UndefValue::get(
+                     Type::getVoidTy(mf.getFunction()->getContext()))),
+      FirstDbgValue(nullptr) {
   DbgValues.clear();
 
   const TargetSubtargetInfo &ST = mf.getSubtarget();
@@ -121,10 +152,6 @@ static void getUnderlyingObjects(const Value *V,
   } while (!Working.empty());
 }
 
-typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
-typedef SmallVector<PointerIntPair<ValueType, 1, bool>, 4>
-UnderlyingObjectsVector;
-
 /// getUnderlyingObjectsForInstr - If this machine instr has memory reference
 /// information and it can be tracked to a normal reference to a known
 /// object, return the Value for that object.
@@ -544,41 +571,31 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI,
     return true;
   }
 
-  const Value *V = (*MI->memoperands_begin())->getValue();
-  if (!V)
+  if ((*MI->memoperands_begin())->getValue() == nullptr)
     return true;
 
-  SmallVector<Value *, 4> Objs;
-  getUnderlyingObjects(V, Objs, DL);
-  for (Value *V : Objs) {
-    // Does this pointer refer to a distinct and identifiable object?
-    if (!isIdentifiedObject(V))
-      return true;
-  }
-
   return false;
 }
 
 /// This returns true if the two MIs need a chain edge between them.
-/// If these are not even memory operations, we still may need
-/// chain deps between them. The question really is - could
-/// these two MIs be reordered during scheduling from memory dependency
-/// point of view.
+/// This is called on normal stores and loads.
 static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
                              const DataLayout &DL, MachineInstr *MIa,
                              MachineInstr *MIb) {
   const MachineFunction *MF = MIa->getParent()->getParent();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
-  // Cover a trivial case - no edge is need to itself.
-  if (MIa == MIb)
-    return false;
- 
+  assert ((MIa->mayStore() || MIb->mayStore()) &&
+          "Dependency checked between two loads");
+
+  // buildSchedGraph() will clear list of stores if not using AA,
+  // which means all stores have to be chained without AA.
+  if (!AA && MIa->mayStore() && MIb->mayStore())
+    return true;
+
   // Let the target decide if memory accesses cannot possibly overlap.
-  if ((MIa->mayLoad() || MIa->mayStore()) &&
-      (MIb->mayLoad() || MIb->mayStore()))
-    if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
-      return false;
+  if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
+    return false;
 
   // FIXME: Need to handle multiple memory operands to support all targets.
   if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
@@ -587,11 +604,6 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
   if (isUnsafeMemoryObject(MIa, MFI, DL) || isUnsafeMemoryObject(MIb, MFI, DL))
     return true;
 
-  // If we are dealing with two "normal" loads, we do not need an edge
-  // between them - they could be reordered.
-  if (!MIa->mayStore() && !MIb->mayStore())
-    return false;
-
   // To this point analysis is generic. From here on we do need AA.
   if (!AA)
     return true;
@@ -634,106 +646,15 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
   return (AAResult != NoAlias);
 }
 
-/// This recursive function iterates over chain deps of SUb looking for
-/// "latest" node that needs a chain edge to SUa.
-static unsigned iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
-                                 const DataLayout &DL, SUnit *SUa, SUnit *SUb,
-                                 SUnit *ExitSU, unsigned *Depth,
-                                 SmallPtrSetImpl<const SUnit *> &Visited) {
-  if (!SUa || !SUb || SUb == ExitSU)
-    return *Depth;
-
-  // Remember visited nodes.
-  if (!Visited.insert(SUb).second)
-      return *Depth;
-  // If there is _some_ dependency already in place, do not
-  // descend any further.
-  // TODO: Need to make sure that if that dependency got eliminated or ignored
-  // for any reason in the future, we would not violate DAG topology.
-  // Currently it does not happen, but makes an implicit assumption about
-  // future implementation.
-  //
-  // Independently, if we encounter node that is some sort of global
-  // object (like a call) we already have full set of dependencies to it
-  // and we can stop descending.
-  if (SUa->isSucc(SUb) ||
-      isGlobalMemoryObject(AA, SUb->getInstr()))
-    return *Depth;
-
-  // If we do need an edge, or we have exceeded depth budget,
-  // add that edge to the predecessors chain of SUb,
-  // and stop descending.
-  if (*Depth > 200 ||
-      MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) {
-    SUb->addPred(SDep(SUa, SDep::MayAliasMem));
-    return *Depth;
-  }
-  // Track current depth.
-  (*Depth)++;
-  // Iterate over memory dependencies only.
-  for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
-       I != E; ++I)
-    if (I->isNormalMemoryOrBarrier())
-      iterateChainSucc(AA, MFI, DL, SUa, I->getSUnit(), ExitSU, Depth, Visited);
-  return *Depth;
-}
-
-/// This function assumes that "downward" from SU there exist
-/// tail/leaf of already constructed DAG. It iterates downward and
-/// checks whether SU can be aliasing any node dominated
-/// by it.
-static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
-                            const DataLayout &DL, SUnit *SU, SUnit *ExitSU,
-                            std::set<SUnit *> &CheckList,
-                            unsigned LatencyToLoad) {
-  if (!SU)
-    return;
-
-  SmallPtrSet<const SUnit*, 16> Visited;
-  unsigned Depth = 0;
-
-  for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
-       I != IE; ++I) {
-    if (SU == *I)
-      continue;
-    if (MIsNeedChainEdge(AA, MFI, DL, SU->getInstr(), (*I)->getInstr())) {
-      SDep Dep(SU, SDep::MayAliasMem);
-      Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
-      (*I)->addPred(Dep);
-    }
-
-    // Iterate recursively over all previously added memory chain
-    // successors. Keep track of visited nodes.
-    for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
-         JE = (*I)->Succs.end(); J != JE; ++J)
-      if (J->isNormalMemoryOrBarrier())
-        iterateChainSucc(AA, MFI, DL, SU, J->getSUnit(), ExitSU, &Depth,
-                         Visited);
-  }
-}
-
-/// Check whether two objects need a chain edge, if so, add it
-/// otherwise remember the rejected SU.
-static inline void addChainDependency(AliasAnalysis *AA,
-                                      const MachineFrameInfo *MFI,
-                                      const DataLayout &DL, SUnit *SUa,
-                                      SUnit *SUb, std::set<SUnit *> &RejectList,
-                                      unsigned TrueMemOrderLatency = 0,
-                                      bool isNormalMemory = false) {
-  // If this is a false dependency,
-  // do not add the edge, but remember the rejected node.
-  if (MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) {
-    SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
-    Dep.setLatency(TrueMemOrderLatency);
+/// Check whether two objects need a chain edge and add it if needed.
+void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
+                                            unsigned Latency) {
+  if (MIsNeedChainEdge(AAForDep, MFI, MF.getDataLayout(), SUa->getInstr(),
+		       SUb->getInstr())) {
+    SDep Dep(SUa, SDep::MayAliasMem);
+    Dep.setLatency(Latency);
     SUb->addPred(Dep);
   }
-  else {
-    // Duplicate entries should be ignored.
-    RejectList.insert(SUb);
-    DEBUG(dbgs() << "\tReject chain dep between SU("
-          << SUa->NodeNum << ") and SU("
-          << SUb->NodeNum << ")\n");
-  }
 }
 
 /// Create an SUnit for each real instruction, numbered in top-down topological
@@ -832,6 +753,122 @@ void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
   }
 }
 
+class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
+
+  /// Current total number of SUs in map.
+  unsigned NumNodes;
+
+  /// 1 for loads, 0 for stores. (see comment in SUList)
+  unsigned TrueMemOrderLatency;
+public:
+
+  Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
+
+  /// To keep NumNodes up to date, insert() is used instead of
+  /// this operator w/ push_back().
+  ValueType &operator[](const SUList &Key) {
+    llvm_unreachable("Don't use. Use insert() instead."); };
+
+  /// Add SU to the SUList of V. If Map grows huge, reduce its size
+  /// by calling reduce().
+  void inline insert(SUnit *SU, ValueType V) {
+    MapVector::operator[](V).push_back(SU);
+    NumNodes++;
+  }
+
+  /// Clears the list of SUs mapped to V.
+  void inline clearList(ValueType V) {
+    iterator Itr = find(V);
+    if (Itr != end()) {
+      assert (NumNodes >= Itr->second.size());
+      NumNodes -= Itr->second.size();
+
+      Itr->second.clear();
+    }
+  }
+
+  /// Clears map from all contents.
+  void clear() {
+    MapVector<ValueType, SUList>::clear();
+    NumNodes = 0;
+  }
+
+  unsigned inline size() const { return NumNodes; }
+
+  /// Count the number of SUs in this map after a reduction.
+  void reComputeSize(void) {
+    NumNodes = 0;
+    for (auto &I : *this)
+      NumNodes += I.second.size();
+  }
+
+  unsigned inline getTrueMemOrderLatency() const {
+    return TrueMemOrderLatency;
+  }
+
+  void dump();
+};
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+                                             Value2SUsMap &Val2SUsMap) {
+  for (auto &I : Val2SUsMap)
+    addChainDependencies(SU, I.second,
+                         Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+                                             Value2SUsMap &Val2SUsMap,
+                                             ValueType V) {
+  Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
+  if (Itr != Val2SUsMap.end())
+    addChainDependencies(SU, Itr->second,
+                         Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
+  assert (BarrierChain != nullptr);
+
+  for (auto &I : map) {
+    SUList &sus = I.second;
+    for (auto *SU : sus)
+      SU->addPredBarrier(BarrierChain);
+  }
+  map.clear();
+}
+
+void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
+  assert (BarrierChain != nullptr);
+
+  // Go through all lists of SUs.
+  for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
+    Value2SUsMap::iterator CurrItr = I++;
+    SUList &sus = CurrItr->second;
+    SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
+    for (; SUItr != SUEE; ++SUItr) {
+      // Stop on BarrierChain or any instruction above it.
+      if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
+        break;
+
+      (*SUItr)->addPredBarrier(BarrierChain);
+    }
+
+    // Remove also the BarrierChain from list if present.
+    if (*SUItr == BarrierChain)
+      SUItr++;
+
+    // Remove all SUs that are now successors of BarrierChain.
+    if (SUItr != sus.begin())
+      sus.erase(sus.begin(), SUItr);
+  }
+
+  // Remove all entries with empty su lists.
+  map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
+      return (mapEntry.second.empty()); });
+
+  // Recompute the size of the map (NumNodes).
+  map.reComputeSize();
+}
+
 /// If RegPressure is non-null, compute register pressure as a side effect. The
 /// DAG builder is an efficient place to do it because it already visits
 /// operands.
@@ -843,7 +880,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   const TargetSubtargetInfo &ST = MF.getSubtarget();
   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                        : ST.useAA();
-  AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
+  AAForDep = UseAA ? AA : nullptr;
+
+  BarrierChain = nullptr;
 
   this->TrackLaneMasks = TrackLaneMasks;
   MISUnitMap.clear();
@@ -855,19 +894,30 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   if (PDiffs)
     PDiffs->init(SUnits.size());
 
-  // We build scheduling units by walking a block's instruction list from bottom
-  // to top.
-
-  // Remember where a generic side-effecting instruction is as we proceed.
-  SUnit *BarrierChain = nullptr, *AliasChain = nullptr;
-
-  // Memory references to specific known memory locations are tracked
-  // so that they can be given more precise dependencies. We track
-  // separately the known memory locations that may alias and those
-  // that are known not to alias
-  MapVector<ValueType, std::vector<SUnit *> > AliasMemDefs, NonAliasMemDefs;
-  MapVector<ValueType, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
-  std::set<SUnit*> RejectMemNodes;
+  // We build scheduling units by walking a block's instruction list
+  // from bottom to top.
+
+  // Each MIs' memory operand(s) is analyzed to a list of underlying
+  // objects. The SU is then inserted in the SUList(s) mapped from
+  // that Value(s). Each Value thus gets mapped to a list of SUs
+  // depending on it, defs and uses kept separately. Two SUs are
+  // non-aliasing to each other if they depend on different Values
+  // exclusively.
+  Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
+
+  // Certain memory accesses are known to not alias any SU in Stores
+  // or Loads, and have therefore their own 'NonAlias'
+  // domain. E.g. spill / reload instructions never alias LLVM I/R
+  // Values. It is assumed that this type of memory accesses always
+  // have a proper memory operand modelling, and are therefore never
+  // unanalyzable. This means they are non aliasing against all nodes
+  // in Stores and Loads, including the unanalyzable ones.
+  Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
+
+  // Always reduce a huge region with half of the elements, except
+  // when user sets this number explicitly.
+  if (ReductionSize.getNumOccurrences() == 0)
+    ReductionSize = (HugeRegion / 2);
 
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
@@ -962,221 +1012,123 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       ExitSU.addPred(Dep);
     }
 
-    // Add chain dependencies.
-    // Chain dependencies used to enforce memory order should have
-    // latency of 0 (except for true dependency of Store followed by
-    // aliased Load... we estimate that with a single cycle of latency
-    // assuming the hardware will bypass)
-    // Note that isStoreToStackSlot and isLoadFromStackSLot are not usable
-    // after stack slots are lowered to actual addresses.
-    // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
-    // produce more precise dependence information.
-    unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
+    // Add memory dependencies (Note: isStoreToStackSlot and
+    // isLoadFromStackSLot are not usable after stack slots are lowered to
+    // actual addresses).
+
+    // This is a barrier event that acts as a pivotal node in the DAG.
     if (isGlobalMemoryObject(AA, MI)) {
-      // Be conservative with these and add dependencies on all memory
-      // references, even those that are known to not alias.
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-             NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
-          I->second[i]->addPred(SDep(SU, SDep::Barrier));
-        }
-      }
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-             NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
-          SDep Dep(SU, SDep::Barrier);
-          Dep.setLatency(TrueMemOrderLatency);
-          I->second[i]->addPred(Dep);
-        }
-      }
-      // Add SU to the barrier chain.
+
+      // Become the barrier chain.
       if (BarrierChain)
-        BarrierChain->addPred(SDep(SU, SDep::Barrier));
+        BarrierChain->addPredBarrier(SU);
       BarrierChain = SU;
-      // This is a barrier event that acts as a pivotal node in the DAG,
-      // so it is safe to clear list of exposed nodes.
-      adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes,
-                      TrueMemOrderLatency);
-      RejectMemNodes.clear();
-      NonAliasMemDefs.clear();
-      NonAliasMemUses.clear();
-
-      // fall-through
-    new_alias_chain:
-      // Chain all possibly aliasing memory references through SU.
-      if (AliasChain) {
-        unsigned ChainLatency = 0;
-        if (AliasChain->getInstr()->mayLoad())
-          ChainLatency = TrueMemOrderLatency;
-        addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain,
-                           RejectMemNodes, ChainLatency);
-      }
-      AliasChain = SU;
-      for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                           PendingLoads[k], RejectMemNodes,
-                           TrueMemOrderLatency);
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-           AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                             I->second[i], RejectMemNodes);
-      }
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-           AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                             I->second[i], RejectMemNodes, TrueMemOrderLatency);
-      }
-      // This call must come after calls to addChainDependency() since it
-      // consumes the 'RejectMemNodes' list that addChainDependency() possibly
-      // adds to.
-      adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes,
-                      TrueMemOrderLatency);
-      PendingLoads.clear();
-      AliasMemDefs.clear();
-      AliasMemUses.clear();
-    } else if (MI->mayStore()) {
-      // Add dependence on barrier chain, if needed.
-      // There is no point to check aliasing on barrier event. Even if
-      // SU and barrier _could_ be reordered, they should not. In addition,
-      // we have lost all RejectMemNodes below barrier.
-      if (BarrierChain)
-        BarrierChain->addPred(SDep(SU, SDep::Barrier));
 
-      UnderlyingObjectsVector Objs;
-      getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout());
+      DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
+            << BarrierChain->NodeNum << ").\n";);
 
+      // Add dependencies against everything below it and clear maps.
+      addBarrierChain(Stores);
+      addBarrierChain(Loads);
+      addBarrierChain(NonAliasStores);
+      addBarrierChain(NonAliasLoads);
+
+      continue;
+    }
+
+    // If it's not a store or a variant load, we're done.
+    if (!MI->mayStore() && !(MI->mayLoad() && !MI->isInvariantLoad(AA)))
+      continue;
+
+    // Always add dependecy edge to BarrierChain if present.
+    if (BarrierChain)
+      BarrierChain->addPredBarrier(SU);
+
+    // Find the underlying objects for MI. The Objs vector is either
+    // empty, or filled with the Values of memory locations which this
+    // SU depends on. An empty vector means the memory location is
+    // unknown, and may alias anything except NonAlias nodes.
+    UnderlyingObjectsVector Objs;
+    getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout());
+
+    if (MI->mayStore()) {
       if (Objs.empty()) {
-        // Treat all other stores conservatively.
-        goto new_alias_chain;
+        // An unknown store depends on all stores and loads.
+        addChainDependencies(SU, Stores);
+        addChainDependencies(SU, NonAliasStores);
+        addChainDependencies(SU, Loads);
+        addChainDependencies(SU, NonAliasLoads);
+
+        // If we're not using AA, clear Stores map since all stores
+        // will be chained.
+        if (!AAForDep)
+          Stores.clear();
+
+	// Map this store to 'UnknownValue'.
+	Stores.insert(SU, UnknownValue);
+        continue;
       }
 
-      bool MayAlias = false;
-      for (UnderlyingObjectsVector::iterator K = Objs.begin(), KE = Objs.end();
-           K != KE; ++K) {
-        ValueType V = K->getPointer();
-        bool ThisMayAlias = K->getInt();
-        if (ThisMayAlias)
-          MayAlias = true;
-
-        // A store to a specific PseudoSourceValue. Add precise dependencies.
-        // Record the def in MemDefs, first adding a dep if there is
-        // an existing def.
-        MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-          ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-        MapVector<ValueType, std::vector<SUnit *> >::iterator IE =
-          ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
-        if (I != IE) {
-          for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-            addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                               I->second[i], RejectMemNodes, 0, true);
-
-          // If we're not using AA, then we only need one store per object.
-          if (!AAForDep)
-            I->second.clear();
-          I->second.push_back(SU);
-        } else {
-          if (ThisMayAlias) {
-            if (!AAForDep)
-              AliasMemDefs[V].clear();
-            AliasMemDefs[V].push_back(SU);
-          } else {
-            if (!AAForDep)
-              NonAliasMemDefs[V].clear();
-            NonAliasMemDefs[V].push_back(SU);
-          }
-        }
-        // Handle the uses in MemUses, if there are any.
-        MapVector<ValueType, std::vector<SUnit *> >::iterator J =
-          ((ThisMayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V));
-        MapVector<ValueType, std::vector<SUnit *> >::iterator JE =
-          ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
-        if (J != JE) {
-          for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                               J->second[i], RejectMemNodes,
-                               TrueMemOrderLatency, true);
-          J->second.clear();
-        }
+      // Add precise dependencies against all previously seen memory
+      // accesses mapped to the same Value(s).
+      for (auto &underlObj : Objs) {
+        ValueType V = underlObj.getPointer();
+        bool ThisMayAlias = underlObj.getInt();
+
+        Value2SUsMap &stores_ = (ThisMayAlias ? Stores : NonAliasStores);
+
+        // Add dependencies to previous stores and loads mapped to V.
+        addChainDependencies(SU, stores_, V);
+        addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
+
+        // If we're not using AA, then we only need one store per object.
+        if (!AAForDep)
+          stores_.clearList(V);
+
+        // Map this store to V.
+        stores_.insert(SU, V);
       }
-      if (MayAlias) {
-        // Add dependencies from all the PendingLoads, i.e. loads
-        // with no underlying object.
-        for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-          addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                             PendingLoads[k], RejectMemNodes,
-                             TrueMemOrderLatency);
-        // Add dependence on alias chain, if needed.
-        if (AliasChain)
-          addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain,
-                             RejectMemNodes);
+      // The store may have dependencies to unanalyzable loads and
+      // stores.
+      addChainDependencies(SU, Loads, UnknownValue);
+      addChainDependencies(SU, Stores, UnknownValue);
+    }
+    else { // SU is a load.
+      if (Objs.empty()) {
+        // An unknown load depends on all stores.
+        addChainDependencies(SU, Stores);
+        addChainDependencies(SU, NonAliasStores);
+
+        Loads.insert(SU, UnknownValue);
+        continue;
       }
-      // This call must come after calls to addChainDependency() since it
-      // consumes the 'RejectMemNodes' list that addChainDependency() possibly
-      // adds to.
-      adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes,
-                      TrueMemOrderLatency);
-    } else if (MI->mayLoad()) {
-      bool MayAlias = true;
-      if (MI->isInvariantLoad(AA)) {
-        // Invariant load, no chain dependencies needed!
-      } else {
-        UnderlyingObjectsVector Objs;
-        getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout());
-
-        if (Objs.empty()) {
-          // A load with no underlying object. Depend on all
-          // potentially aliasing stores.
-          for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-                 AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-              addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                                 I->second[i], RejectMemNodes);
-
-          PendingLoads.push_back(SU);
-          MayAlias = true;
-        } else {
-          MayAlias = false;
-        }
 
-        for (UnderlyingObjectsVector::iterator
-             J = Objs.begin(), JE = Objs.end(); J != JE; ++J) {
-          ValueType V = J->getPointer();
-          bool ThisMayAlias = J->getInt();
-
-          if (ThisMayAlias)
-            MayAlias = true;
-
-          // A load from a specific PseudoSourceValue. Add precise dependencies.
-          MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-            ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-          MapVector<ValueType, std::vector<SUnit *> >::iterator IE =
-            ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
-          if (I != IE)
-            for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-              addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
-                                 I->second[i], RejectMemNodes, 0, true);
-          if (ThisMayAlias)
-            AliasMemUses[V].push_back(SU);
-          else
-            NonAliasMemUses[V].push_back(SU);
-        }
-        // Add dependencies on alias and barrier chains, if needed.
-        if (MayAlias && AliasChain)
-          addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain,
-                             RejectMemNodes);
-        if (MayAlias)
-          // This call must come after calls to addChainDependency() since it
-          // consumes the 'RejectMemNodes' list that addChainDependency()
-          // possibly adds to.
-          adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU,
-                          RejectMemNodes, /*Latency=*/0);
-        if (BarrierChain)
-          BarrierChain->addPred(SDep(SU, SDep::Barrier));
+      for (auto &underlObj : Objs) {
+        ValueType V = underlObj.getPointer();
+        bool ThisMayAlias = underlObj.getInt();
+
+        // Add precise dependencies against all previously seen stores
+        // mapping to the same Value(s).
+        addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+
+        // Map this load to V.
+        (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
       }
+      // The load may have dependencies to unanalyzable stores.
+      addChainDependencies(SU, Stores, UnknownValue);
+    }
+
+    // Reduce maps if they grow huge.
+    if (Stores.size() + Loads.size() >= HugeRegion) {
+      DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
+      reduceHugeMemNodeMaps(Stores, Loads, ReductionSize);
+    }
+    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+      DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
+      reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, ReductionSize);
     }
   }
+
   if (DbgMI)
     FirstDbgValue = DbgMI;
 
@@ -1184,7 +1136,84 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   Uses.clear();
   CurrentVRegDefs.clear();
   CurrentVRegUses.clear();
-  PendingLoads.clear();
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
+  PSV->printCustom(OS);
+  return OS;
+}
+
+void ScheduleDAGInstrs::Value2SUsMap::dump() {
+  for (auto &Itr : *this) {
+    if (Itr.first.is<const Value*>()) {
+      const Value *V = Itr.first.get<const Value*>();
+      if (isa<UndefValue>(V))
+        dbgs() << "Unknown";
+      else
+        V->printAsOperand(dbgs());
+    }
+    else if (Itr.first.is<const PseudoSourceValue*>())
+      dbgs() <<  Itr.first.get<const PseudoSourceValue*>();
+    else
+      llvm_unreachable("Unknown Value type.");
+
+    dbgs() << " : ";
+    dumpSUList(Itr.second);
+  }
+}
+
+/// Reduce maps in FIFO order, by N SUs. This is better than turning
+/// every Nth memory SU into BarrierChain in buildSchedGraph(), since
+/// it avoids unnecessary edges between seen SUs above the new
+/// BarrierChain, and those below it.
+void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
+                                              Value2SUsMap &loads, unsigned N) {
+  DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
+        stores.dump();
+        dbgs() << "Loading SUnits:\n";
+        loads.dump());
+
+  // Insert all SU's NodeNums into a vector and sort it.
+  std::vector<unsigned> NodeNums;
+  NodeNums.reserve(stores.size() + loads.size());
+  for (auto &I : stores)
+    for (auto *SU : I.second)
+      NodeNums.push_back(SU->NodeNum);
+  for (auto &I : loads)
+    for (auto *SU : I.second)
+      NodeNums.push_back(SU->NodeNum);
+  std::sort(NodeNums.begin(), NodeNums.end());
+
+  // The N last elements in NodeNums will be removed, and the SU with
+  // the lowest NodeNum of them will become the new BarrierChain to
+  // let the not yet seen SUs have a dependency to the removed SUs.
+  assert (N <= NodeNums.size());
+  SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
+  if (BarrierChain) {
+    // The aliasing and non-aliasing maps reduce independently of each
+    // other, but share a common BarrierChain. Check if the
+    // newBarrierChain is above the former one. If it is not, it may
+    // introduce a loop to use newBarrierChain, so keep the old one.
+    if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
+      BarrierChain->addPredBarrier(newBarrierChain);
+      BarrierChain = newBarrierChain;
+      DEBUG(dbgs() << "Inserting new barrier chain: SU("
+            << BarrierChain->NodeNum << ").\n";);
+    }
+    else
+      DEBUG(dbgs() << "Keeping old barrier chain: SU("
+            << BarrierChain->NodeNum << ").\n";);
+  }
+  else
+    BarrierChain = newBarrierChain;
+
+  insertBarrierChain(stores);
+  insertBarrierChain(loads);
+
+  DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
+        stores.dump();
+        dbgs() << "Loading SUnits:\n";
+        loads.dump());
 }
 
 /// \brief Initialize register live-range state for updating kills.
-- 
2.11.0