//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TimeValue.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
#ifndef NDEBUG
// We only use this for a debug check.
#include <random>
cl::Hidden);
namespace {
-/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+
+/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
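+///
+/// For example, with the prefix "foo.sroa.0.", a value that would be named
+/// "load" becomes "foo.sroa.0.load", which makes debug IR dumps much easier
+/// to trace. (Illustrative name; SROA sets the prefix per rewritten alloca.)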
class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
std::string Prefix;
+
const Twine getNameWithPrefix(const Twine &Name) const {
return Name.isTriviallyEmpty() ? Name : Prefix + Name;
}
}
};
-/// \brief Provide a typedef for IRBuilder that drops names in release builds.
-using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-}
+/// Provide a type alias for IRBuilder that drops names in release builds.
+using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-namespace {
-/// \brief A used slice of an alloca.
+/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
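+///
+/// For example, for an `alloca [8 x i8]`, an i32 store at byte offset 4 is
+/// recorded as the unsplittable slice [4, 8) holding the store's
+/// pointer-operand use. (Illustrative; offsets come from the DataLayout.)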
class Slice {
- /// \brief The beginning offset of the range.
- uint64_t BeginOffset;
+ /// The beginning offset of the range.
+ uint64_t BeginOffset = 0;
- /// \brief The ending offset, not included in the range.
- uint64_t EndOffset;
+ /// The ending offset, not included in the range.
+ uint64_t EndOffset = 0;
- /// \brief Storage for both the use of this slice and whether it can be
+ /// Storage for both the use of this slice and whether it can be
/// split.
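+ /// (PointerIntPair packs the bool into a spare low bit of the Use pointer,
+ /// so each Slice stays compact and cheap to copy and sort.)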
PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
public:
- Slice() : BeginOffset(), EndOffset() {}
+ Slice() = default;
+
Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
: BeginOffset(BeginOffset), EndOffset(EndOffset),
UseAndIsSplittable(U, IsSplittable) {}
bool isDead() const { return getUse() == nullptr; }
void kill() { UseAndIsSplittable.setPointer(nullptr); }
- /// \brief Support for ordering ranges.
+ /// Support for ordering ranges.
///
/// This provides an ordering over ranges such that start offsets are
/// always increasing, and within equal start offsets, the end offsets are
return false;
}
- /// \brief Support comparison with a single offset to allow binary searches.
+ /// Support comparison with a single offset to allow binary searches.
friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
uint64_t RHSOffset) {
return LHS.beginOffset() < RHSOffset;
}
bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};
+
} // end anonymous namespace
namespace llvm {
+
template <typename T> struct isPodLike;
template <> struct isPodLike<Slice> { static const bool value = true; };
-}
-/// \brief Representation of the alloca slices.
+} // end namespace llvm
+
+/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// starting at a particular offset before splittable slices.
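+///
+/// For example, an alloca with an i32 store at offset 0 and a memcpy of
+/// bytes [0, 16) yields an unsplittable [0, 4) slice and a splittable
+/// [0, 16) slice, with the unsplittable slice ordered first. (Illustrative
+/// sketch of the ordering described above.)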
class llvm::sroa::AllocaSlices {
public:
- /// \brief Construct the slices of a particular alloca.
+ /// Construct the slices of a particular alloca.
AllocaSlices(const DataLayout &DL, AllocaInst &AI);
- /// \brief Test whether a pointer to the allocation escapes our analysis.
+ /// Test whether a pointer to the allocation escapes our analysis.
///
/// If this is true, the slices are never fully built and should be
/// ignored.
bool isEscaped() const { return PointerEscapingInstr; }
- /// \brief Support for iterating over the slices.
+ /// Support for iterating over the slices.
/// @{
- typedef SmallVectorImpl<Slice>::iterator iterator;
- typedef iterator_range<iterator> range;
+ using iterator = SmallVectorImpl<Slice>::iterator;
+ using range = iterator_range<iterator>;
+
iterator begin() { return Slices.begin(); }
iterator end() { return Slices.end(); }
- typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
- typedef iterator_range<const_iterator> const_range;
+ using const_iterator = SmallVectorImpl<Slice>::const_iterator;
+ using const_range = iterator_range<const_iterator>;
+
const_iterator begin() const { return Slices.begin(); }
const_iterator end() const { return Slices.end(); }
/// @}
- /// \brief Erase a range of slices.
+ /// Erase a range of slices.
void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
- /// \brief Insert new slices for this alloca.
+ /// Insert new slices for this alloca.
///
/// This moves the slices into the alloca's slices collection, and re-sorts
/// everything so that the usual ordering properties of the alloca's slices
int OldSize = Slices.size();
Slices.append(NewSlices.begin(), NewSlices.end());
auto SliceI = Slices.begin() + OldSize;
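+ // Sort just the appended tail, then merge it into the already-sorted
+ // prefix; this avoids re-sorting the entire slice vector.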
- std::sort(SliceI, Slices.end());
+ llvm::sort(SliceI, Slices.end());
std::inplace_merge(Slices.begin(), SliceI, Slices.end());
}
class partition_iterator;
iterator_range<partition_iterator> partitions();
- /// \brief Access the dead users for this alloca.
+ /// Access the dead users for this alloca.
ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
- /// \brief Access the dead operands referring to this alloca.
+ /// Access the dead operands referring to this alloca.
///
/// These are operands which cannot actually be used to refer to the
/// alloca as they are outside its range and the user doesn't correct for
private:
template <typename DerivedT, typename RetT = void> class BuilderBase;
class SliceBuilder;
+
friend class AllocaSlices::SliceBuilder;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// \brief Handle to alloca instruction to simplify method interfaces.
+ /// Handle to alloca instruction to simplify method interfaces.
AllocaInst &AI;
#endif
- /// \brief The instruction responsible for this alloca not having a known set
+ /// The instruction responsible for this alloca not having a known set
/// of slices.
///
/// When an instruction (potentially) escapes the pointer to the alloca, we
/// alloca. This will be null if the alloca slices are analyzed successfully.
Instruction *PointerEscapingInstr;
- /// \brief The slices of the alloca.
+ /// The slices of the alloca.
///
/// We store a vector of the slices formed by uses of the alloca here. This
/// vector is sorted by increasing begin offset, and then the unsplittable
/// details.
SmallVector<Slice, 8> Slices;
- /// \brief Instructions which will become dead if we rewrite the alloca.
+ /// Instructions which will become dead if we rewrite the alloca.
///
/// Note that these are not separated by slice. This is because we expect an
/// alloca to be completely rewritten or not rewritten at all. If rewritten,
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
- /// \brief Operands which will become dead if we rewrite the alloca.
+ /// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
/// undef when we rewrite the alloca. These show up in out-of-bounds inputs
SmallVector<Use *, 8> DeadOperands;
};
-/// \brief A partition of the slices.
+/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
friend class AllocaSlices;
friend class AllocaSlices::partition_iterator;
- typedef AllocaSlices::iterator iterator;
+ using iterator = AllocaSlices::iterator;
- /// \brief The beginning and ending offsets of the alloca for this
+ /// The beginning and ending offsets of the alloca for this
/// partition.
uint64_t BeginOffset, EndOffset;
- /// \brief The start end end iterators of this partition.
+ /// The start and end iterators of this partition.
iterator SI, SJ;
- /// \brief A collection of split slice tails overlapping the partition.
+ /// A collection of split slice tails overlapping the partition.
SmallVector<Slice *, 4> SplitTails;
- /// \brief Raw constructor builds an empty partition starting and ending at
+ /// Raw constructor builds an empty partition starting and ending at
/// the given iterator.
Partition(iterator SI) : SI(SI), SJ(SI) {}
public:
- /// \brief The start offset of this partition.
+ /// The start offset of this partition.
///
/// All of the contained slices start at or after this offset.
uint64_t beginOffset() const { return BeginOffset; }
- /// \brief The end offset of this partition.
+ /// The end offset of this partition.
///
/// All of the contained slices end at or before this offset.
uint64_t endOffset() const { return EndOffset; }
- /// \brief The size of the partition.
+ /// The size of the partition.
///
/// Note that this can never be zero.
uint64_t size() const {
return EndOffset - BeginOffset;
}
- /// \brief Test whether this partition contains no slices, and merely spans
+ /// Test whether this partition contains no slices, and merely spans
/// a region occupied by split slices.
bool empty() const { return SI == SJ; }
iterator end() const { return SJ; }
/// @}
- /// \brief Get the sequence of split slice tails.
+ /// Get the sequence of split slice tails.
///
/// These tails are of slices which start before this partition but are
/// split and overlap into the partition. We accumulate these while forming
ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};
-/// \brief An iterator over partitions of the alloca's slices.
+/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
Partition> {
friend class AllocaSlices;
- /// \brief Most of the state for walking the partitions is held in a class
+ /// Most of the state for walking the partitions is held in a class
/// with a nice interface for examining them.
Partition P;
- /// \brief We need to keep the end of the slices to know when to stop.
+ /// We need to keep the end of the slices to know when to stop.
AllocaSlices::iterator SE;
- /// \brief We also need to keep track of the maximum split end offset seen.
+ /// We also need to keep track of the maximum split end offset seen.
/// FIXME: Do we really?
- uint64_t MaxSplitSliceEndOffset;
+ uint64_t MaxSplitSliceEndOffset = 0;
- /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// Sets the partition to be empty at given iterator, and sets the
/// end iterator.
partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
- : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ : P(SI), SE(SE) {
// If not already at the end, advance our state to form the initial
// partition.
if (SI != SE)
advance();
}
- /// \brief Advance the iterator to the next partition.
+ /// Advance the iterator to the next partition.
///
/// Requires that the iterator not be at the end of the slices.
void advance() {
// Remove the uses which have ended in the prior partition. This
// cannot change the max split slice end because we just checked that
// the prior partition ended prior to that max.
- P.SplitTails.erase(
- std::remove_if(
- P.SplitTails.begin(), P.SplitTails.end(),
- [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
- P.SplitTails.end());
- assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
- [&](Slice *S) {
- return S->endOffset() == MaxSplitSliceEndOffset;
- }) &&
+ P.SplitTails.erase(llvm::remove_if(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <=
+ P.EndOffset;
+ }),
+ P.SplitTails.end());
+ assert(llvm::any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
"Could not find the current max split slice offset!");
- assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
- [&](Slice *S) {
- return S->endOffset() <= MaxSplitSliceEndOffset;
- }) &&
+ assert(llvm::all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
"Max split slice end offset is not actually the max!");
}
}
Partition &operator*() { return P; }
};
-/// \brief A forward range over the partitions of the alloca's slices.
+/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
return nullptr;
}
-/// \brief A helper that folds a PHI node or a select.
+/// A helper that folds a PHI node or a select.
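+///
+/// For example, `%p = phi i8* [ %a, %bb1 ], [ %a, %bb2 ]` folds to `%a`, and
+/// a select whose condition is constant folds to the selected operand.
+/// (Illustrative; foldSelectInst above handles the select cases.)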
static Value *foldPHINodeOrSelectInst(Instruction &I) {
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
// If PN merges together the same value, return that value.
return foldSelectInst(cast<SelectInst>(I));
}
-/// \brief Builder for the alloca slices.
+/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
friend class PtrUseVisitor<SliceBuilder>;
friend class InstVisitor<SliceBuilder>;
- typedef PtrUseVisitor<SliceBuilder> Base;
+
+ using Base = PtrUseVisitor<SliceBuilder>;
const uint64_t AllocSize;
AllocaSlices &AS;
SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
- /// \brief Set to de-duplicate dead instructions found in the use walk.
+ /// Set to de-duplicate dead instructions found in the use walk.
SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
public:
// Completely skip uses which have a zero size or start either before or
// past the end of the allocation.
if (Size == 0 || Offset.uge(AllocSize)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
- << " which has zero size or starts outside of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
+ << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
return markAsDead(I);
}
// them, and so have to record at least the information here.
assert(AllocSize >= BeginOffset); // Established above.
if (Size > AllocSize - BeginOffset) {
- DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
- << " to remain within the " << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
+ << Offset << " to remain within the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
EndOffset = AllocSize;
}
// langref in a very strict sense. If we ever want to enable
// SROAStrictInbounds, this code should be factored cleanly into
// PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
- // by writing out the code here where we have tho underlying allocation
+ // by writing out the code here where we have the underlying allocation
// size readily available.
APInt GEPOffset = Offset;
const DataLayout &DL = GEPI.getModule()->getDataLayout();
break;
// Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
unsigned ElementIdx = OpC->getZExtValue();
const StructLayout *SL = DL.getStructLayout(STy);
GEPOffset +=
uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
// If this memory access can be shown to *statically* extend outside the
- // bounds of of the allocation, it's behavior is undefined, so simply
+ // bounds of the allocation, its behavior is undefined, so simply
// ignore it. Note that this is more strict than the generic clamping
// behavior of insertUse. We also try to handle cases which might run the
// risk of overflow.
// FIXME: We should instead consider the pointer to have escaped if this
// function is being instrumented for addressing bugs or race conditions.
if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
- << " which extends past the end of the " << AllocSize
- << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
+ << Offset << " which extends past the end of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
return markAsDead(SI);
}
void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
- /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ /// Disable SROA entirely if there are unhandled users of the alloca.
void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
return;
}
- Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
- [](const Slice &S) {
- return S.isDead();
- }),
- Slices.end());
+ Slices.erase(
+ llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
+ Slices.end());
#ifndef NDEBUG
if (SROARandomShuffleSlices) {
- std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
+ std::mt19937 MT(static_cast<unsigned>(
+ std::chrono::system_clock::now().time_since_epoch().count()));
std::shuffle(Slices.begin(), Slices.end(), MT);
}
#endif
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
- std::sort(Slices.begin(), Slices.end());
+ llvm::sort(Slices.begin(), Slices.end());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (!HaveLoad)
return false;
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
// We can only transform this if it is safe to push the loads into the
// predecessor blocks. The only thing to watch out for is that we can't put
// a possibly trapping load in the predecessor if it is a critical edge.
// If this pointer is always safe to load, or if we can prove that there
// is already a load in the block, then we can move the load to the pred
// block.
- if (isSafeToLoadUnconditionally(InVal, MaxAlign, TI))
+ if (isSafeToLoadUnconditionally(InVal, MaxAlign, DL, TI))
continue;
return false;
}
static void speculatePHINodeLoads(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
IRBuilderTy PHIBuilder(&PN);
}
// Inject loads into all of the pred blocks.
+ DenseMap<BasicBlock *, Value *> InjectedLoads;
for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
BasicBlock *Pred = PN.getIncomingBlock(Idx);
- TerminatorInst *TI = Pred->getTerminator();
Value *InVal = PN.getIncomingValue(Idx);
+
+ // A PHI node is allowed to have multiple (duplicated) entries for the same
+ // basic block, as long as the value is the same. So if we already injected
+ // a load in the predecessor, then we should reuse the same load for all
+ // duplicated entries.
+ if (Value *V = InjectedLoads.lookup(Pred)) {
+ NewPN->addIncoming(V, Pred);
+ continue;
+ }
+
+ TerminatorInst *TI = Pred->getTerminator();
IRBuilderTy PredBuilder(TI);
LoadInst *Load = PredBuilder.CreateLoad(
if (AATags)
Load->setAAMetadata(AATags);
NewPN->addIncoming(Load, Pred);
+ InjectedLoads[Pred] = Load;
}
- DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
PN.eraseFromParent();
}
static bool isSafeSelectToSpeculate(SelectInst &SI) {
Value *TValue = SI.getTrueValue();
Value *FValue = SI.getFalseValue();
+ const DataLayout &DL = SI.getModule()->getDataLayout();
for (User *U : SI.users()) {
LoadInst *LI = dyn_cast<LoadInst>(U);
if (!LI || !LI->isSimple())
return false;
- // Both operands to the select need to be dereferencable, either
+ // Both operands to the select need to be dereferenceable, either
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
- if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), LI))
+ if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
return false;
- if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), LI))
+ if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), DL, LI))
return false;
}
}
static void speculateSelectInstLoads(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
IRBuilderTy IRB(&SI);
Value *TV = SI.getTrueValue();
Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
LI->getName() + ".sroa.speculated");
- DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
LI->replaceAllUsesWith(V);
LI->eraseFromParent();
}
SI.eraseFromParent();
}
-/// \brief Build a GEP out of a base pointer and indices.
+/// Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
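+///
+/// For example, indices {0, 2} over a struct-typed base produce
+/// `getelementptr inbounds %T, %T* %base, i32 0, i32 2`, named with the
+/// "sroa_idx" suffix used below. (Illustrative IR; %T is the pointee type.)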
NamePrefix + "sroa_idx");
}
-/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
-/// \brief Recursively compute indices for a natural GEP.
+/// Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
Indices, NamePrefix);
}
-/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
/// The goal is to produce a "natural" looking GEP that works with the existing
Indices, NamePrefix);
}
-/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
///
/// This tries very hard to compute a "natural" GEP which arrives at the offset
if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
Ptr = cast<Operator>(Ptr)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
break;
Ptr = GA->getAliasee();
} else {
return Ptr;
}
-/// \brief Compute the adjusted alignment for a load or store from an offset.
+/// Compute the adjusted alignment for a load or store from an offset.
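+///
+/// For example, an access that was 16-byte aligned but is rewritten at byte
+/// offset 4 can only be assumed 4-byte aligned: MinAlign(16, 4) == 4.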
static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
const DataLayout &DL) {
unsigned Alignment;
return MinAlign(Alignment, Offset);
}
-/// \brief Test whether we can convert a value from the old to the new type.
+/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
OldTy = OldTy->getScalarType();
NewTy = NewTy->getScalarType();
if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
- if (NewTy->isPointerTy() && OldTy->isPointerTy())
- return true;
- if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
- return true;
+ if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
+ return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
+ cast<PointerType>(OldTy)->getPointerAddressSpace();
+ }
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
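+ // Otherwise OldTy is a non-integral pointer, and such pointers may only be
+ // converted to pointers in the same address space (handled above).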
return false;
}
return true;
}
-/// \brief Generic routine to convert an SSA value to a value of a different
+/// Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
// See if we need inttoptr for this type pair. A cast involving both scalars
// and vectors requires an additional bitcast.
- if (OldTy->getScalarType()->isIntegerTy() &&
- NewTy->getScalarType()->isPointerTy()) {
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
// Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
if (OldTy->isVectorTy() && !NewTy->isVectorTy())
return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
// See if we need ptrtoint for this type pair. A cast involving both scalars
// and vectors requires an additional bitcast.
- if (OldTy->getScalarType()->isPointerTy() &&
- NewTy->getScalarType()->isIntegerTy()) {
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
// Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
if (OldTy->isVectorTy() && !NewTy->isVectorTy())
return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
return IRB.CreateBitCast(V, NewTy);
}
-/// \brief Test whether the given slice use can be promoted to a vector.
+/// Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
return true;
}
-/// \brief Test whether the given alloca partitioning and range of slices can be
+/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
// do that until all the backends are known to produce good code for all
// integer vector types.
if (!HaveCommonEltTy) {
- CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
- [](VectorType *VTy) {
- return !VTy->getElementType()->isIntegerTy();
- }),
- CandidateTys.end());
+ CandidateTys.erase(
+ llvm::remove_if(CandidateTys,
+ [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ }),
+ CandidateTys.end());
// If there were no integer vector types, give up.
if (CandidateTys.empty())
// Rank the remaining candidate vector types. This is easy because we know
// they're all integer vectors. We sort by ascending number of elements.
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
"Cannot have vector types of different sizes!");
assert(RHSTy->getElementType()->isIntegerTy() &&
"All non-integer types eliminated!");
return RHSTy->getNumElements() < LHSTy->getNumElements();
};
- std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ llvm::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
CandidateTys.erase(
std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
CandidateTys.end());
return nullptr;
}
-/// \brief Test whether a slice of an alloca is valid for integer widening.
+/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
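+///
+/// For example, an i64 alloca used only by whole-width i64 loads and stores
+/// (plus any fully-contained splittable memset/memcpy) can have its
+/// partition widened to a single promotable i64. (Illustrative; the precise
+/// rules are the checks below.)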
// We can't handle loads that extend past the allocated memory.
if (DL.getTypeStoreSize(LI->getType()) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerLoad.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
// We can't handle stores that extend past the allocated memory.
if (DL.getTypeStoreSize(ValueTy) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerStore.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
return true;
}
-/// \brief Test whether the given alloca partition's integer operations can be
+/// Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *Ty, uint64_t Offset,
const Twine &Name) {
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
IntegerType *IntTy = cast<IntegerType>(V->getType());
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot extract to a larger integer!");
if (Ty != IntTy) {
V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
- DEBUG(dbgs() << " trunced: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
}
return V;
}
IntegerType *Ty = cast<IntegerType>(V->getType());
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot insert a larger integer!");
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
if (Ty != IntTy) {
V = IRB.CreateZExt(V, IntTy, Name + ".ext");
- DEBUG(dbgs() << " extended: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
}
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element store outside of alloca store");
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
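+ // Mask off the destination bit range in Old before OR'ing in the new
+ // value; every bit outside the inserted range is preserved.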
APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
- DEBUG(dbgs() << " masked: " << *Old << "\n");
+ LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
V = IRB.CreateOr(Old, V, Name + ".insert");
- DEBUG(dbgs() << " inserted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
}
return V;
}
if (NumElements == 1) {
V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
Name + ".extract");
- DEBUG(dbgs() << " extract: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
return V;
}
Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".extract");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
// Single element to insert.
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
Name + ".insert");
- DEBUG(dbgs() << " insert: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
return V;
}
Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".expand");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + ".blend");
- DEBUG(dbgs() << " blend: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
return V;
}
-/// \brief Visitor to rewrite instructions using p particular slice of an alloca
+/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
class llvm::sroa::AllocaSliceRewriter
: public InstVisitor<AllocaSliceRewriter, bool> {
// Befriend the base class so it can delegate to private visit methods.
- friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
- typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
+ friend class InstVisitor<AllocaSliceRewriter, bool>;
+
+ using Base = InstVisitor<AllocaSliceRewriter, bool>;
const DataLayout &DL;
AllocaSlices &AS;
// The original offset of the slice currently being rewritten relative to
// the original alloca.
- uint64_t BeginOffset, EndOffset;
+ uint64_t BeginOffset = 0;
+ uint64_t EndOffset = 0;
+
// The new offsets of the slice currently being rewritten relative to the
// original alloca.
uint64_t NewBeginOffset, NewEndOffset;
uint64_t SliceSize;
- bool IsSplittable;
- bool IsSplit;
- Use *OldUse;
- Instruction *OldPtr;
+ bool IsSplittable = false;
+ bool IsSplit = false;
+ Use *OldUse = nullptr;
+ Instruction *OldPtr = nullptr;
// Track post-rewrite users which are PHI nodes and Selects.
- SmallPtrSetImpl<PHINode *> &PHIUsers;
- SmallPtrSetImpl<SelectInst *> &SelectUsers;
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
// Utility IR builder, whose name prefix is setup for each visited use, and
// the insertion point is set to point to the user.
uint64_t NewAllocaBeginOffset,
uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
VectorType *PromotableVecTy,
- SmallPtrSetImpl<PHINode *> &PHIUsers,
- SmallPtrSetImpl<SelectInst *> &SelectUsers)
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
: DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
NewAllocaBeginOffset(NewAllocaBeginOffset),
NewAllocaEndOffset(NewAllocaEndOffset),
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
- BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
- OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
IsSplittable = I->isSplittable();
IsSplit =
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
- DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
- DEBUG(AS.printSlice(dbgs(), I, ""));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
+ LLVM_DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
// Every instruction which can end up as a user must have a rewrite rule.
bool visitInstruction(Instruction &I) {
- DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
llvm_unreachable("No rewrite rule for this instruction!");
}
#endif
return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
#ifndef NDEBUG
Twine(OldName) + "."
#else
);
}
- /// \brief Compute suitable alignment to access this slice of the *new*
+ /// Compute suitable alignment to access this slice of the *new*
/// alloca.
///
/// You can optionally pass a type to this routine and if that type's ABI
}
bool visitLoadInst(LoadInst &LI) {
- DEBUG(dbgs() << " original: " << LI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+
+ unsigned AS = LI.getPointerAddressSpace();
+
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+ // on the new load. This is even true in some cases even when the loads
+ // are different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+ // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
V = NewLI;
// If this is an integer load past the end of the slice (which means the
"endian_shift");
}
} else {
- Type *LTy = TargetTy->getPointerTo();
+ Type *LTy = TargetTy->getPointerTo(AS);
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
V = NewLI;
IsPtrAdjusted = true;
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder =
- new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
- delete Placeholder;
+ Placeholder->deleteValue();
} else {
LI.replaceAllUsesWith(V);
}
Pass.DeadInsts.insert(&LI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
return !LI.isVolatile() && !IsPtrAdjusted;
}
- bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+ AAMDNodes AATags) {
if (V->getType() != VecTy) {
unsigned BeginIndex = getIndex(NewBeginOffset);
unsigned EndIndex = getIndex(NewEndOffset);
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
- bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
}
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
bool visitStoreInst(StoreInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
Value *OldOp = SI.getOperand(1);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+
Value *V = SI.getValueOperand();
// Strip all inbounds GEPs and pointer casts to try to dig out any root
}
if (VecTy)
- return rewriteVectorizedStoreInst(V, SI, OldOp);
+ return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(V, SI);
+ return rewriteIntegerStore(V, SI, AATags);
const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
StoreInst *NewSI;
NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
SI.isVolatile());
} else {
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
+ NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
if (SI.isVolatile())
- NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
}
- /// \brief Compute an integer value from splatting an i8 across the given
+ /// Compute an integer value from splatting an i8 across the given
/// number of bytes.
///
/// Note that this routine assumes an i8 is a byte. If that isn't true, don't
return V;
}
- /// \brief Compute a vector splat for a given element value.
+ /// Compute a vector splat for a given element value.
Value *getVectorSplat(Value *V, unsigned NumElements) {
V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
- DEBUG(dbgs() << " splat: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
return V;
}
bool visitMemSetInst(MemSetInst &II) {
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getRawDest() == OldPtr);
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
// If the memset has a variable size, it cannot be split, just adjust the
// pointer to the new alloca.
if (!isa<Constant>(II.getLength())) {
assert(!IsSplit);
assert(NewBeginOffset == BeginOffset);
II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+ II.setDestAlignment(getSliceAlign());
deleteIfTriviallyDead(OldPtr);
return false;
CallInst *New = IRB.CreateMemSet(
getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
getSliceAlign(), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
V = convertValue(DL, IRB, V, AllocaTy);
}
- Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
- II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return !II.isVolatile();
}
// Rewriting of memory transfer instructions can be a bit tricky. We break
// them into two categories: split intrinsics and unsplit intrinsics.
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
bool IsDest = &II.getRawDestUse() == OldUse;
assert((IsDest && II.getRawDest() == OldPtr) ||
// update both source and dest of a single call.
if (!IsSplittable) {
Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- if (IsDest)
+ if (IsDest) {
II.setDest(AdjustedPtr);
- else
+ II.setDestAlignment(SliceAlign);
+ } else {
II.setSource(AdjustedPtr);
-
- if (II.getAlignment() > SliceAlign) {
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(
- ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ II.setSourceAlignment(SliceAlign);
}
- DEBUG(dbgs() << " to: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << II << "\n");
deleteIfTriviallyDead(OldPtr);
return false;
}
// Compute the relative offset for the other pointer within the transfer.
unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
- unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
- OtherOffset.zextOrTrunc(64).getZExtValue());
+ unsigned OtherAlign =
+ IsDest ? II.getSourceAlignment() : II.getDestAlignment();
+ OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
if (EmitMemCpy) {
// Compute the other pointer, folding as much as possible to produce
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
- CallInst *New = IRB.CreateMemCpy(
- IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
- MinAlign(SliceAlign, OtherAlign), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ Value *DestPtr, *SrcPtr;
+ unsigned DestAlign, SrcAlign;
+ // Note: IsDest is true iff we're copying into the new alloca slice
+ if (IsDest) {
+ DestPtr = OurPtr;
+ DestAlign = SliceAlign;
+ SrcPtr = OtherPtr;
+ SrcAlign = OtherAlign;
+ } else {
+ DestPtr = OtherPtr;
+ DestAlign = OtherAlign;
+ SrcPtr = OurPtr;
+ SrcAlign = SliceAlign;
+ }
+ CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
+ Size, II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
- Src =
- IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ LoadInst *Load = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
+ "copyload");
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ Src = Load;
}
if (VecTy && !IsWholeAlloca && IsDest) {
StoreInst *Store = cast<StoreInst>(
IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return !II.isVolatile();
}
bool visitIntrinsicInst(IntrinsicInst &II) {
assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end);
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getArgOperand(1) == OldPtr);
// Record this instruction for deletion.
Pass.DeadInsts.insert(&II);
+ // Lifetime intrinsics are only promotable if they cover the whole alloca.
+ // Therefore, we drop lifetime intrinsics which don't cover the whole
+ // alloca.
+ // (In theory, intrinsics which partially cover an alloca could be
+ // promoted, but PromoteMemToReg doesn't handle that case.)
+ // FIXME: Check whether the alloca is promotable before dropping the
+ // lifetime intrinsics?
+ if (NewBeginOffset != NewAllocaBeginOffset ||
+ NewEndOffset != NewAllocaEndOffset)
+ return true;
+
ConstantInt *Size =
ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
NewEndOffset - NewBeginOffset);
New = IRB.CreateLifetimeEnd(Ptr, Size);
(void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+
return true;
}
+ void fixLoadStoreAlign(Instruction &Root) {
+ // This algorithm implements the same visitor loop as
+ // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
+ // or store found.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<Instruction *, 4> Uses;
+ Visited.insert(&Root);
+ Uses.push_back(&Root);
+ do {
+ Instruction *I = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ unsigned LoadAlign = LI->getAlignment();
+ if (!LoadAlign)
+ LoadAlign = DL.getABITypeAlignment(LI->getType());
+ LI->setAlignment(std::min(LoadAlign, getSliceAlign()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ unsigned StoreAlign = SI->getAlignment();
+ if (!StoreAlign) {
+ Value *Op = SI->getOperand(0);
+ StoreAlign = DL.getABITypeAlignment(Op->getType());
+ }
+ SI->setAlignment(std::min(StoreAlign, getSliceAlign()));
+ continue;
+ }
+
+ assert(isa<BitCastInst>(I) || isa<PHINode>(I) ||
+ isa<SelectInst>(I) || isa<GetElementPtrInst>(I));
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(cast<Instruction>(U));
+ } while (!Uses.empty());
+ }
+
bool visitPHINode(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
// Replace the operands which were using the old pointer.
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
- DEBUG(dbgs() << " to: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
deleteIfTriviallyDead(OldPtr);
+ // Fix the alignment of any loads or stores using this PHI node.
+ fixLoadStoreAlign(PN);
+
// PHIs can't be promoted on their own, but often can be speculated. We
// check the speculation outside of the rewriter so that we see the
// fully-rewritten alloca.
}
bool visitSelectInst(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
"Pointer isn't an operand!");
assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
if (SI.getOperand(2) == OldPtr)
SI.setOperand(2, NewPtr);
- DEBUG(dbgs() << " to: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
deleteIfTriviallyDead(OldPtr);
+ // Fix the alignment of any loads or stores using this select.
+ fixLoadStoreAlign(SI);
+
// Selects can't be promoted on their own, but often can be speculated. We
// check the speculation outside of the rewriter so that we see the
// fully-rewritten alloca.
};
namespace {
-/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+
+/// Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
/// with scalar loads and stores.
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
// Befriend the base class so it can delegate to private visit methods.
- friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+ friend class InstVisitor<AggLoadStoreRewriter, bool>;
/// Queue of pointer uses to analyze and potentially rewrite.
SmallVector<Use *, 8> Queue;
/// Rewrite loads and stores through a pointer and all pointers derived from
/// it.
bool rewrite(Instruction &I) {
- DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
enqueueUsers(I);
bool Changed = false;
while (!Queue.empty()) {
// Conservative default is to not rewrite anything.
bool visitInstruction(Instruction &I) { return false; }
- /// \brief Generic recursive split emission class.
+ /// Generic recursive split emission class.
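+ ///
+ /// For example, splitting a load of {i32, {float, float}} emits a scalar
+ /// load for the i32 and for each float, then reassembles the aggregate
+ /// with insertvalue instructions. (Illustrative; see LoadOpSplitter below.)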
template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
IRBuilderTy IRB;
+
/// The indices which to be used with insert- or extractvalue to select the
/// appropriate value within the aggregate.
SmallVector<unsigned, 4> Indices;
+
/// The indices to a GEP instruction which will move Ptr to the correct slot
/// within the aggregate.
SmallVector<Value *, 4> GEPIndices;
+
/// The base pointer of the original op, used as a base for GEPing the
/// split operations.
Value *Ptr;
: IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
public:
- /// \brief Generic recursive split emission routine.
+ /// Generic recursive split emission routine.
///
/// This method recursively splits an aggregate op (load or store) into
/// scalar or vector ops. It splits recursively until it hits a single value
};
struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
- LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+ AAMDNodes AATags;
+
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
/// recursive emission to actually load values.
// Load the single value and insert it using the indices.
Value *GEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+ LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load");
+ if (AATags)
+ Load->setAAMetadata(AATags);
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
- DEBUG(dbgs() << " to: " << *Load << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
}
};
return false;
// We have an aggregate being loaded, split it apart.
- DEBUG(dbgs() << " original: " << LI << "\n");
- LoadOpSplitter Splitter(&LI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+ LoadOpSplitter Splitter(&LI, *U, AATags);
Value *V = UndefValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
LI.replaceAllUsesWith(V);
}
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
- StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
+ AAMDNodes AATags;
/// Emit a leaf store of a single value. This is called at the leaves of the
/// recursive emission to actually produce stores.
void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
assert(Ty->isSingleValueType());
// Extract the single value and store it using the indices.
- Value *Store = IRB.CreateStore(
- IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
- IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"));
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ //
+ // The gep and extractvalue values are factored out of the CreateStore
+ // call to make the output independent of the argument evaluation order.
+ Value *ExtractValue =
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+ Value *InBoundsGEP =
+ IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+ StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
}
};
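  // The evaluation-order note above matters because C++ does not specify the
  // order in which function arguments are evaluated: in a call such as
  //   use(makeGEP(), makeExtract())   // hypothetical helpers
  // either helper may run first, so IRBuilder's auto-numbered value names
  // could come out differently under different host compilers. Naming each
  // sub-expression first pins down the emission order.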
return false;
// We have an aggregate being stored, split it apart.
- DEBUG(dbgs() << " original: " << SI << "\n");
- StoreOpSplitter Splitter(&SI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ StoreOpSplitter Splitter(&SI, *U, AATags);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
SI.eraseFromParent();
return true;
return false;
}
};
-}
-/// \brief Strip aggregate type wrapping.
+} // end anonymous namespace
+
+/// Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
return stripAggregateTypeWrapping(DL, InnerTy);
}
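// As an example of the stripping above (illustrative types): {{ [1 x i32] }}
// strips all the way down to i32, since each wrapper holds a single element
// at offset zero covering the full allocation size, whereas { i32, i32 } is
// returned unchanged because neither element spans the whole struct.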
-/// \brief Try to find a partition of the aggregate type passed in for a given
+/// Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
return nullptr;
if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
- // We can't partition pointers...
- if (SeqTy->isPointerTy())
- return nullptr;
-
Type *ElementTy = SeqTy->getElementType();
uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
uint64_t NumSkippedElements = Offset / ElementSize;
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
- if (NumSkippedElements >= ArrTy->getNumElements())
- return nullptr;
- } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
- if (NumSkippedElements >= VecTy->getNumElements())
- return nullptr;
- }
+ if (NumSkippedElements >= SeqTy->getNumElements())
+ return nullptr;
Offset -= NumSkippedElements * ElementSize;
// First check if we need to recurse.
return SubTy;
}
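// As an example (illustrative types): partitioning { i32, [2 x i32] } with
// Offset == 4 and Size == 8 skips the leading i32 and yields the [2 x i32]
// element, while Offset == 4 and Size == 4 recurses into the array and
// yields a single i32.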
-/// \brief Pre-split loads and stores to simplify rewriting.
+/// Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
///
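/// As an illustrative pattern (not a specific test case): when an i64 load
/// feeds an i64 store but other slices access the underlying alloca as two
/// i32 halves, both the load and the store can be pre-split at byte offset 4
/// into matching i32 load/store pairs, letting each half be promoted
/// independently.
///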
/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
- DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+ LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
// Track the loads and stores which are candidates for pre-splitting here, in
// the order they first appear during the partition scan. These give stable
// maybe it would make it more principled?
SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
- DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
for (auto &P : AS.partitions()) {
for (Slice &S : P) {
Instruction *I = cast<Instruction>(S.getUse()->getUser());
}
// Record the initial split.
- DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
auto &Offsets = SplitOffsetsMap[I];
assert(Offsets.Splits.empty() &&
"Should not have splits the first time we see an instruction!");
// match relative to their starting offset. We have to verify this prior to
// any rewriting.
Stores.erase(
- std::remove_if(Stores.begin(), Stores.end(),
- [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
- // Lookup the load we are storing in our map of split
- // offsets.
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- // If it was completely unsplittable, then we're done,
- // and this store can't be pre-split.
- if (UnsplittableLoads.count(LI))
- return true;
-
- auto LoadOffsetsI = SplitOffsetsMap.find(LI);
- if (LoadOffsetsI == SplitOffsetsMap.end())
- return false; // Unrelated loads are definitely safe.
- auto &LoadOffsets = LoadOffsetsI->second;
-
- // Now lookup the store's offsets.
- auto &StoreOffsets = SplitOffsetsMap[SI];
-
- // If the relative offsets of each split in the load and
- // store match exactly, then we can split them and we
- // don't need to remove them here.
- if (LoadOffsets.Splits == StoreOffsets.Splits)
- return false;
-
- DEBUG(dbgs()
- << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
-
- // We've found a store and load that we need to split
- // with mismatched relative splits. Just give up on them
- // and remove both instructions from our list of
- // candidates.
- UnsplittableLoads.insert(LI);
- return true;
- }),
+ llvm::remove_if(Stores,
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ LLVM_DEBUG(
+ dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
Stores.end());
// Now we have to go *back* through all the stores, because a later store may
// have caused an earlier store's load to become unsplittable and if it is
// unsplittable for the later store, then we can't rely on it being split in
// the earlier store either.
- Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
- [&UnsplittableLoads](StoreInst *SI) {
- auto *LI =
- cast<LoadInst>(SI->getValueOperand());
- return UnsplittableLoads.count(LI);
- }),
+ Stores.erase(llvm::remove_if(Stores,
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI =
+ cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
Stores.end());
// Once we've established all the loads that can't be split for some reason,
// filter any that made it into our list out.
- Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
- [&UnsplittableLoads](LoadInst *LI) {
- return UnsplittableLoads.count(LI);
- }),
+ Loads.erase(llvm::remove_if(Loads,
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
Loads.end());
-
// If no loads or stores are left, there is no pre-splitting to be done for
// this alloca.
if (Loads.empty() && Stores.empty())
Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
IRB.SetInsertPoint(LI);
- DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
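+      // Note: getAdjustedPtr below computes the GEP offset in the index
+      // width of address space AS; some targets define that to be narrower
+      // than the full pointer width, hence getIndexSizeInBits rather than
+      // getPointerSizeInBits.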
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
+ PLoad->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
// Append this load onto the list of split loads so we can find it later
// to rewrite the stores.
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PLoad
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PLoad << "\n");
// See if we've handled all the splits.
if (Idx >= Size)
StoreInst *SI = cast<StoreInst>(LU);
if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
DeferredStores = true;
- DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
+ << "\n");
continue;
}
Value *StoreBasePtr = SI->getPointerOperand();
IRB.SetInsertPoint(SI);
- DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
LoadInst *PLoad = SplitLoads[Idx];
auto *PartPtrTy =
PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+ auto AS = SI->getPointerAddressSpace();
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
- (void)PStore;
- DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
+ LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
}
// We want to immediately iterate on any allocas impacted by splitting
Value *LoadBasePtr = LI->getPointerOperand();
Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
- DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
// Check whether we have an already split load.
auto SplitLoadsMapI = SplitLoadsMap.find(LI);
assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
"Too few split loads for the number of splits in the store!");
} else {
- DEBUG(dbgs() << " of load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
}
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
// Either lookup a split load or create one.
LoadInst *PLoad;
PLoad = (*SplitLoads)[Idx];
} else {
IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, LoadBasePtr->getName() + "."),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
}
// And store this partition.
IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
// Now build a new slice for the alloca.
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PStore->getOperandUse(PStore->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PStore
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PStore << "\n");
if (!SplitLoads) {
- DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
}
// See if we've finished all the splits.
}
  // Remove the killed slices that have been pre-split.
- AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
- return S.isDead();
- }), AS.end());
+ AS.erase(llvm::remove_if(AS, [](const Slice &S) { return S.isDead(); }),
+ AS.end());
// Insert our new slices. This will sort and merge them into the sorted
// sequence.
AS.insert(NewSlices);
- DEBUG(dbgs() << " Pre-split slices:\n");
+ LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
#ifndef NDEBUG
for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
- DEBUG(AS.print(dbgs(), I, " "));
+ LLVM_DEBUG(AS.print(dbgs(), I, " "));
#endif
  // Finally, don't try to promote any allocas that now require re-splitting.
// They have already been added to the worklist above.
PromotableAllocas.erase(
- std::remove_if(
- PromotableAllocas.begin(), PromotableAllocas.end(),
+ llvm::remove_if(
+ PromotableAllocas,
[&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
PromotableAllocas.end());
return true;
}
-/// \brief Rewrite an alloca partition's users.
+/// Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
// exact same type as the original, and with the same access offsets. In that
// case, re-use the existing alloca, but still run through the rewriter to
// perform phi and select speculation.
+ // P.beginOffset() can be non-zero even with the same type in a case with
+ // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType()) {
- assert(P.beginOffset() == 0 &&
- "Non-zero begin offset but same alloca type");
+ if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
if (Alignment <= DL.getABITypeAlignment(SliceTy))
Alignment = 0;
NewAI = new AllocaInst(
- SliceTy, nullptr, Alignment,
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ // Copy the old AI debug location over to the new one.
+ NewAI->setDebugLoc(AI.getDebugLoc());
++NumNewAllocas;
}
- DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
// fact scheduled for promotion.
unsigned PPWOldSize = PostPromotionWorklist.size();
unsigned NumUses = 0;
- SmallPtrSet<PHINode *, 8> PHIUsers;
- SmallPtrSet<SelectInst *, 8> SelectUsers;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
P.endOffset(), IsIntegerPromotable, VecTy,
}
NumAllocaPartitionUses += NumUses;
- MaxUsesPerAllocaPartition =
- std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
// Now that we've processed all the slices in the new partition, check if any
// PHIs or Selects would block promotion.
- for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
- E = PHIUsers.end();
- I != E; ++I)
- if (!isSafePHIToSpeculate(**I)) {
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
break;
}
- for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
- E = SelectUsers.end();
- I != E; ++I)
- if (!isSafeSelectToSpeculate(**I)) {
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
Worklist.insert(NewAI);
}
} else {
- // If we can't promote the alloca, iterate on it to check for new
- // refinements exposed by splitting the current alloca. Don't iterate on an
- // alloca which didn't actually change and didn't get promoted.
- if (NewAI != &AI)
- Worklist.insert(NewAI);
-
// Drop any post-promotion work items if promotion didn't happen.
while (PostPromotionWorklist.size() > PPWOldSize)
PostPromotionWorklist.pop_back();
+
+    // We couldn't promote the alloca and we didn't create a new partition;
+    // nothing happened.
+ if (NewAI == &AI)
+ return nullptr;
+
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
}
return NewAI;
}
-/// \brief Walks the slices of an alloca and form partitions based on them,
+/// Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (AS.begin() == AS.end())
// First try to pre-split loads and stores.
Changed |= presplitLoadsAndStores(AI, AS);
- // Now that we have identified any pre-splitting opportunities, mark any
- // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
- // to split these during pre-splitting, we want to force them to be
- // rewritten into a partition.
+  // Now that we have identified any pre-splitting opportunities, mark loads
+  // and stores unsplittable except in the following case: a slice stays
+  // splittable if every other slice is either disjoint from it or fully
+  // contained within it (whole-alloca loads and stores are the common
+  // instance). If we fail to split these during pre-splitting, we want to
+  // force them to be rewritten into a partition.
bool IsSorted = true;
- for (Slice &S : AS) {
- if (!S.isSplittable())
- continue;
- // FIXME: We currently leave whole-alloca splittable loads and stores. This
- // used to be the only splittable loads and stores and we need to be
- // confident that the above handling of splittable loads and stores is
- // completely sufficient before we forcibly disable the remaining handling.
- if (S.beginOffset() == 0 &&
- S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
- continue;
- if (isa<LoadInst>(S.getUse()->getUser()) ||
- isa<StoreInst>(S.getUse()->getUser())) {
- S.makeUnsplittable();
- IsSorted = false;
+
+ uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
+ const uint64_t MaxBitVectorSize = 1024;
+ if (AllocaSize <= MaxBitVectorSize) {
+    // If a byte offset lies strictly inside any load or store, a slice
+    // starting or ending at that offset is not splittable.
+ SmallBitVector SplittableOffset(AllocaSize + 1, true);
+ for (Slice &S : AS)
+ for (unsigned O = S.beginOffset() + 1;
+ O < S.endOffset() && O < AllocaSize; O++)
+ SplittableOffset.reset(O);
+
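+    // For example (assumed layout): an 8-byte alloca accessed as two
+    // disjoint halves [0,4) and [4,8) resets only the interior bytes 1-3
+    // and 5-7, leaving offsets 0, 4, and 8 splittable, so both halves (and
+    // any whole-alloca access) keep their splittable flag below.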
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+ (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+  } else {
+    // For a large alloca we only allow whole-alloca splittable loads and
+    // stores, to avoid building an excessively large SplittableOffset bit
+    // vector.
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+          isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
}
}
+
if (!IsSorted)
- std::sort(AS.begin(), AS.end());
+ llvm::sort(AS.begin(), AS.end());
- /// \brief Describes the allocas introduced by rewritePartition
- /// in order to migrate the debug info.
- struct Piece {
+ /// Describes the allocas introduced by rewritePartition in order to migrate
+ /// the debug info.
+ struct Fragment {
AllocaInst *Alloca;
uint64_t Offset;
uint64_t Size;
- Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+ Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
: Alloca(AI), Offset(O), Size(S) {}
};
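  // For example (illustrative), an i64 alloca backing a 64-bit variable and
  // rewritten as two i32 partitions produces Fragments (Offset=0, Size=32)
  // and (Offset=32, Size=32), which become DW_OP_LLVM_fragment expressions
  // below.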
- SmallVector<Piece, 4> Pieces;
+ SmallVector<Fragment, 4> Fragments;
// Rewrite each partition.
for (auto &P : AS.partitions()) {
uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
// Don't include any padding.
uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
- Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+ Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
}
}
++NumPartitions;
}
NumAllocaPartitions += NumPartitions;
- MaxPartitionsPerAlloca =
- std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
// Migrate debug information from the old alloca to the new alloca(s)
// and the individual partitions.
- if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
- auto *Var = DbgDecl->getVariable();
- auto *Expr = DbgDecl->getExpression();
+ TinyPtrVector<DbgInfoIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+ if (!DbgDeclares.empty()) {
+ auto *Var = DbgDeclares.front()->getVariable();
+ auto *Expr = DbgDeclares.front()->getExpression();
+ auto VarSize = Var->getSizeInBits();
DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
- for (auto Piece : Pieces) {
- // Create a piece expression describing the new partition or reuse AI's
+ for (auto Fragment : Fragments) {
+ // Create a fragment expression describing the new partition or reuse AI's
// expression if there is only one partition.
- auto *PieceExpr = Expr;
- if (Piece.Size < AllocaSize || Expr->isBitPiece()) {
+ auto *FragmentExpr = Expr;
+ if (Fragment.Size < AllocaSize || Expr->isFragment()) {
// If this alloca is already a scalar replacement of a larger aggregate,
- // Piece.Offset describes the offset inside the scalar.
- uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
- uint64_t Start = Offset + Piece.Offset;
- uint64_t Size = Piece.Size;
- if (Expr->isBitPiece()) {
- uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
+ // Fragment.Offset describes the offset inside the scalar.
+ auto ExprFragment = Expr->getFragmentInfo();
+ uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
+ uint64_t Start = Offset + Fragment.Offset;
+ uint64_t Size = Fragment.Size;
+ if (ExprFragment) {
+ uint64_t AbsEnd =
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
if (Start >= AbsEnd)
// No need to describe a SROAed padding.
continue;
Size = std::min(Size, AbsEnd - Start);
}
- PieceExpr = DIB.createBitPieceExpression(Start, Size);
- } else {
- assert(Pieces.size() == 1 &&
- "partition is as large as original alloca");
+ // The new, smaller fragment is stenciled out from the old fragment.
+ if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+ assert(Start >= OrigFragment->OffsetInBits &&
+ "new fragment is outside of original fragment");
+ Start -= OrigFragment->OffsetInBits;
+ }
+
+ // The alloca may be larger than the variable.
+ if (VarSize) {
+ if (Size > *VarSize)
+ Size = *VarSize;
+ if (Size == 0 || Start + Size > *VarSize)
+ continue;
+ }
+
+ // Avoid creating a fragment expression that covers the entire variable.
+ if (!VarSize || *VarSize != Size) {
+ if (auto E =
+ DIExpression::createFragmentExpression(Expr, Start, Size))
+ FragmentExpr = *E;
+ else
+ continue;
+ }
}
- // Remove any existing dbg.declare intrinsic describing the same alloca.
- if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
- OldDDI->eraseFromParent();
+ // Remove any existing intrinsics describing the same alloca.
+ for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca))
+ OldDII->eraseFromParent();
- DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
- &AI);
+ DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
+ DbgDeclares.front()->getDebugLoc(), &AI);
}
}
return Changed;
}
-/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
Value *OldV = U;
// Replace the use with an undef value.
}
}
-/// \brief Analyze an alloca for SROA.
+/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
- DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
++NumAllocasAnalyzed;
// Special case dead allocas, as they're trivial.
// Build the slices using a recursive instruction-visiting builder.
AllocaSlices AS(DL, AI);
- DEBUG(AS.print(dbgs()));
+ LLVM_DEBUG(AS.print(dbgs()));
if (AS.isEscaped())
return Changed;
Changed |= splitAlloca(AI, AS);
- DEBUG(dbgs() << " Speculating PHIs\n");
+ LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
- DEBUG(dbgs() << " Speculating Selects\n");
+ LLVM_DEBUG(dbgs() << " Speculating Selects\n");
while (!SpeculatableSelects.empty())
speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
return Changed;
}
-/// \brief Delete the dead instructions accumulated in this run.
+/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(
+bool SROA::deleteDeadInstructions(
SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+ bool Changed = false;
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
- DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ // If the instruction is an alloca, find the possible dbg.declare connected
+ // to it, and remove it too. We must do this before calling RAUW or we will
+ // not be able to find it.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ DeletedAllocas.insert(AI);
+ for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(AI))
+ OldDII->eraseFromParent();
+ }
I->replaceAllUsesWith(UndefValue::get(I->getType()));
DeadInsts.insert(U);
}
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
- DeletedAllocas.insert(AI);
- if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
- DbgDecl->eraseFromParent();
- }
-
++NumDeleted;
I->eraseFromParent();
+ Changed = true;
}
+ return Changed;
}
-/// \brief Promote the allocas, using the best available technique.
+/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
NumPromoted += PromotableAllocas.size();
- DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+ LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
}
PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
AssumptionCache &RunAC) {
- DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
DT = &RunDT;
AC = &RunAC;
do {
while (!Worklist.empty()) {
Changed |= runOnAlloca(*Worklist.pop_back_val());
- deleteDeadInstructions(DeletedAllocas);
+ Changed |= deleteDeadInstructions(DeletedAllocas);
// Remove the deleted allocas from various lists so that we don't try to
// continue processing them.
auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
Worklist.remove_if(IsInSet);
PostPromotionWorklist.remove_if(IsInSet);
- PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
- PromotableAllocas.end(),
- IsInSet),
+ PromotableAllocas.erase(llvm::remove_if(PromotableAllocas, IsInSet),
PromotableAllocas.end());
DeletedAllocas.clear();
}
PostPromotionWorklist.clear();
} while (!Worklist.empty());
- // FIXME: Even when promoting allocas we should preserve some abstract set of
- // CFG-specific analyses.
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ if (!Changed)
+ return PreservedAnalyses::all();
+
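+  // SROA never alters the CFG (the legacy pass advertises this guarantee
+  // via setPreservesCFG), so CFG-only analyses remain valid; GlobalsAA is
+  // preserved because no global memory behavior changes.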
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
-PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
AM.getResult<AssumptionAnalysis>(F));
}
SROA Impl;
public:
+ static char ID;
+
SROALegacyPass() : FunctionPass(ID) {
initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
}
+
bool runOnFunction(Function &F) override {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
auto PA = Impl.runImpl(
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
return !PA.areAllPreserved();
}
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
- const char *getPassName() const override { return "SROA"; }
- static char ID;
+ StringRef getPassName() const override { return "SROA"; }
};
char SROALegacyPass::ID = 0;