From 48b84df15d5b726ed5ea38cd78aafcd62e5d7fa2 Mon Sep 17 00:00:00 2001 From: Mohammad Shahid Date: Fri, 3 Mar 2017 10:02:47 +0000 Subject: [PATCH] [SLP] Fix a bug caused by the absence of the in-order uses of scalars, which the vectorizeTree() API needs in order to compute the proper mask for the shufflevector IR. The fix is to compute the mask for out-of-order memory accesses while building the vectorizable tree rather than during the actual vectorization of the tree. It also recomputes the proper lane for external uses of vectorizable scalars based on the shuffle mask. Reviewers: mkuper Differential Revision: https://reviews.llvm.org/D30159 Change-Id: Ide8773ce0ad3562f3cf4d1a0ad0f487e2f60ce5d git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296863 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/LoopAccessAnalysis.h | 7 +- lib/Analysis/LoopAccessAnalysis.cpp | 32 +++-- lib/Transforms/Vectorize/SLPVectorizer.cpp | 151 +++++++++++---------- .../SLPVectorizer/X86/jumbled-load-bug.ll | 43 ++++++ test/Transforms/SLPVectorizer/X86/jumbled-same.ll | 2 +- 5 files changed, 152 insertions(+), 83 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h index 0ad52c0e62b..fe86608921d 100644 --- a/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/include/llvm/Analysis/LoopAccessAnalysis.h @@ -660,12 +660,15 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, /// \brief Try to sort an array of loads / stores. /// /// An array of loads / stores can only be sorted if all pointer operands -/// refer to the same object, and the differences between these pointers +/// refer to the same object, and the differences between these pointers /// are known to be constant. If that is the case, this returns true, and the /// sorted array is returned in \p Sorted. Otherwise, this returns false, and /// \p Sorted is invalid. +// If \p Mask is not null, it also returns in \p Mask the shuffle mask for the +// actual memory access order. bool sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL, - ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted); + ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted, + SmallVectorImpl<unsigned> *Mask = nullptr); /// \brief Returns true if the memory operations \p A and \p B are consecutive. /// This is a simple API that does not depend on the analysis pass. diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index a3ed412b738..0d588b901a5 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -1040,7 +1040,8 @@ static unsigned getAddressSpaceOperand(Value *I) { bool llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<Value *> &Sorted) { + SmallVectorImpl<Value *> &Sorted, + SmallVectorImpl<unsigned> *Mask) { SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs; OffValPairs.reserve(VL.size()); Sorted.reserve(VL.size()); @@ -1050,7 +1051,6 @@ bool llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL, Value *Ptr0 = getPointerOperand(VL[0]); const SCEV *Scev0 = SE.getSCEV(Ptr0); Value *Obj0 = GetUnderlyingObject(Ptr0, DL); - for (auto *Val : VL) { // The only kind of access we care about here is load.
if (!isa<LoadInst>(Val)) @@ -1077,14 +1077,30 @@ bool llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL, OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val); } - std::sort(OffValPairs.begin(), OffValPairs.end(), - [](const std::pair<int64_t, Value *> &Left, - const std::pair<int64_t, Value *> &Right) { - return Left.first < Right.first; + SmallVector<unsigned, 4> UseOrder(VL.size()); + for (unsigned i = 0; i < VL.size(); i++) { + UseOrder[i] = i; + } + + // Sort the memory accesses and keep the order of their uses in UseOrder. + std::sort(UseOrder.begin(), UseOrder.end(), + [&OffValPairs](unsigned Left, unsigned Right) { + return OffValPairs[Left].first < OffValPairs[Right].first; }); - for (auto &it : OffValPairs) - Sorted.push_back(it.second); + for (unsigned i = 0; i < VL.size(); i++) + Sorted.emplace_back(OffValPairs[UseOrder[i]].second); + + // Sort UseOrder to compute the Mask. + if (Mask) { + Mask->reserve(VL.size()); + for (unsigned i = 0; i < VL.size(); i++) + Mask->emplace_back(i); + std::sort(Mask->begin(), Mask->end(), + [&UseOrder](unsigned Left, unsigned Right) { + return UseOrder[Left] < UseOrder[Right]; + }); + } return true; } diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index edaab89ae46..f056679ae89 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -423,10 +423,8 @@ private: /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const; - /// Vectorize a single entry in the tree. VL icontains all isomorphic scalars - /// in order of its usage in a user program, for example ADD1, ADD2 and so on - /// or LOAD1 , LOAD2 etc. - Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E); + /// Vectorize a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef<Value *> VL); @@ -466,8 +464,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(nullptr), - NeedToGather(0), NeedToShuffle(0) {} + TreeEntry() + : Scalars(), VectorizedValue(nullptr), NeedToGather(0), ShuffleMask() {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -495,19 +493,23 @@ private: /// Do we need to gather this sequence ? bool NeedToGather; - /// Do we need to shuffle the load ? - bool NeedToShuffle; + /// Records the optional shuffle mask for the jumbled memory accesses in this entry. + SmallVector<unsigned, 4> ShuffleMask; + }; /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, - bool NeedToShuffle) { + ArrayRef<unsigned> ShuffleMask = None) { VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; - Last->NeedToShuffle = NeedToShuffle; + if (!ShuffleMask.empty()) + Last->ShuffleMask.insert(Last->ShuffleMask.begin(), ShuffleMask.begin(), + ShuffleMask.end()); + if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); @@ -1030,21 +1032,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } // Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } if (StoreInst *SI = dyn_cast(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } unsigned Opcode = getSameOpcode(VL); @@ -1061,7 +1063,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } @@ -1073,7 +1075,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1086,7 +1088,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1099,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1109,7 +1111,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1123,7 +1125,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } @@ -1132,7 +1134,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } @@ -1147,7 +1149,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1164,12 +1166,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1191,7 +1193,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } else { BS.cancelScheduling(VL); } - newTreeEntry(VL, Reuse, false); + newTreeEntry(VL, Reuse); return; } case Instruction::Load: { @@ -1207,7 +1209,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1218,7 +1220,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1238,7 +1240,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1255,7 +1257,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (VL.size() > 2 && !ReverseConsecutive) { bool ShuffledLoads = true; SmallVector Sorted; - if (sortMemAccesses(VL, *DL, *SE, Sorted)) { + SmallVector Mask; + if (sortMemAccesses(VL, *DL, *SE, Sorted, &Mask)) { auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end()); for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { @@ -1264,14 +1267,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } } if (ShuffledLoads) { - newTreeEntry(NewVL, true, true); + newTreeEntry(NewVL, true, makeArrayRef(Mask.begin(), Mask.end())); return; } } } BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1298,12 +1301,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Type *Ty = cast(Val)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1326,13 +1329,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1364,7 +1367,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1393,7 +1396,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (cast(Val)->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1406,7 +1409,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1418,12 +1421,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1440,12 +1443,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1463,7 +1466,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1477,7 +1480,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1488,7 +1491,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1501,14 +1504,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); - 
newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1525,11 +1528,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1553,7 +1556,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } default: BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1792,7 +1795,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0); - if (E->NeedToShuffle) { + if (!E->ShuffleMask.empty()) { VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0); } @@ -2358,8 +2361,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { if (ScalarToTreeEntry.count(VL[0])) { int Idx = ScalarToTreeEntry[VL[0]]; TreeEntry *E = &VectorizableTree[Idx]; - if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE))) - return vectorizeTree(VL, E); + if (E->isSame(VL) || + (!E->ShuffleMask.empty() && E->isFoundJumbled(VL, *DL, *SE))) + return vectorizeTree(E); } Type *ScalarTy = VL[0]->getType(); @@ -2370,10 +2374,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { return Gather(VL, VecTy); } -Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue && !E->NeedToShuffle) { + if (E->VectorizedValue && E->ShuffleMask.empty()) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -2611,27 +2615,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { // As program order of scalar loads are jumbled, the vectorized 'load' // must be followed by a 'shuffle' with the required jumbled mask. 
- if (!VL.empty() && (E->NeedToShuffle)) { - assert(VL.size() == E->Scalars.size() && - "Equal number of scalars expected"); + if (!E->ShuffleMask.empty()) { SmallVector Mask; - for (Value *Val : VL) { - if (ScalarToTreeEntry.count(Val)) { - int Idx = ScalarToTreeEntry[Val]; - TreeEntry *E = &VectorizableTree[Idx]; - for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { - if (E->Scalars[Lane] == Val) { - Mask.push_back(Builder.getInt32(Lane)); - break; - } - } - } + for (unsigned Lane = 0, LE = E->ShuffleMask.size(); Lane != LE; + ++Lane) { + Mask.push_back(Builder.getInt32(E->ShuffleMask[Lane])); } - // Generate shuffle for jumbled memory access Value *Undef = UndefValue::get(VecTy); Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, ConstantVector::get(Mask)); + E->VectorizedValue = Shuf; + ++NumVectorInstructions; return Shuf; } @@ -2816,7 +2811,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We @@ -2861,8 +2856,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); - - Value *Lane = Builder.getInt32(ExternalUse.Lane); + unsigned i = 0; + Value *Lane; + // In case the uses of the vectorizable scalars are not in order, the + // scalars would have been shuffled. Recompute the proper Lane of ExternalUse. + if (!E->ShuffleMask.empty()) { + SmallVector Val(E->ShuffleMask.size()); + for (; i < E->ShuffleMask.size(); i++) { + if (E->ShuffleMask[i] == (unsigned)ExternalUse.Lane) + break; + } + Lane = Builder.getInt32(i); + } else { + Lane = Builder.getInt32(ExternalUse.Lane); + } // If User == nullptr, the Scalar is used as extra arg. Generate // ExtractElement instruction and update the record for this scalar in // ExternallyUsedValues.
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll new file mode 100644 index 00000000000..38a42b38eba --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -slp-vectorizer | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <4 x i32> @zot() #0 { +; CHECK-LABEL: @zot( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i8> undef, i8 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1 +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x i8> [[I0]], i8 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x i8> [[I1]], i8 [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x i8> [[I2]], i8 [[TMP6]], i32 3 +; CHECK-NEXT: [[RET:%.*]] = zext <4 x i8> [[I3]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[RET]] +; +bb: + %p0 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0 + %p1 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1 + %p2 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2 + %p3 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3 + %v3 = load i8, i8* %p3, align 1 + %v2 = load i8, i8* %p2, align 1 + %v0 = load i8, i8* %p0, align 1 + %v1 = load i8, i8* %p1, align 1 + %i0 = insertelement <4 x i8> undef, i8 %v1, i32 0 + %i1 = insertelement <4 x i8> %i0, i8 %v0, i32 1 + %i2 = insertelement <4 x i8> %i1, i8 %v2, i32 2 + %i3 = insertelement <4 x i8> %i2, i8 %v3, i32 3 + %ret = zext <4 x i8> %i3 to <4 x i32> + ret <4 x i32> %ret +} diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-same.ll b/test/Transforms/SLPVectorizer/X86/jumbled-same.ll index 623ab1669c1..bb8eceea239 100644 --- a/test/Transforms/SLPVectorizer/X86/jumbled-same.ll +++ b/test/Transforms/SLPVectorizer/X86/jumbled-same.ll @@ -13,7 +13,7 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32
()* @fn1 to i32), i32 2 -- 2.11.0
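
For illustration only, and not part of the patch above: the ordering logic that the patched sortMemAccesses() and the jumbled-load path of vectorizeTree() rely on can be modeled in a few lines of standalone C++. The sketch below is a simplified stand-in, not LLVM code; it uses plain std::vector and made-up byte offsets, and the names Offsets, UseOrder and Mask merely mirror the patch. It sorts the accesses by pointer offset and then inverts that permutation, which is the constant mask fed to the shufflevector, and the same mapping the external-use code walks when it recomputes a scalar's lane.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Byte offsets of each scalar load in the order the program uses them
  // (use order). Hypothetical example values for a jumbled bundle.
  std::vector<int64_t> Offsets = {1, 0, 2, 3};

  // UseOrder[i] = index (in use order) of the i-th access in memory order,
  // i.e. an argsort of the offsets.
  std::vector<unsigned> UseOrder(Offsets.size());
  std::iota(UseOrder.begin(), UseOrder.end(), 0u);
  std::sort(UseOrder.begin(), UseOrder.end(),
            [&](unsigned L, unsigned R) { return Offsets[L] < Offsets[R]; });

  // Mask is the inverse permutation of UseOrder: Mask[i] names the lane of
  // the memory-order (wide load) vector that feeds lane i of the use-order
  // vector. Finding i such that Mask[i] == Lane recovers the new lane of an
  // externally used scalar, as the patched lane recomputation does.
  std::vector<unsigned> Mask(UseOrder.size());
  for (unsigned i = 0; i < UseOrder.size(); ++i)
    Mask[UseOrder[i]] = i;

  // For Offsets = {1, 0, 2, 3}: UseOrder = {1, 0, 2, 3} and Mask = {1, 0, 2, 3}.
  for (unsigned M : Mask)
    std::cout << M << ' ';
  std::cout << '\n';
  return 0;
}

Computing Mask once, while the vectorizable tree is built, is what lets the later shufflevector emission and the external-use lane fix-up in the patch work from the same stored permutation instead of re-deriving it from the scalar list.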