From bf8772ed2cc89a495e2692919331d7a03e76d791 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Oct 2012 08:26:33 +0000
Subject: [PATCH] Vectorize: teach cavVectorizeMemory to distinguish between
 A[i]+=x and A[B[i]]+=x. If the pointer is consecutive then it is safe to read
 and write. If the pointer is non-loop-consecutive then it is unsafe to
 vectorize it because we may hit an ordering issue.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166371 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 211 +++++++++++++++++++----------
 test/Transforms/LoopVectorize/increment.ll |  66 +++++++++
 test/Transforms/LoopVectorize/read-only.ll |  32 +++++
 3 files changed, 235 insertions(+), 74 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/increment.ll
 create mode 100644 test/Transforms/LoopVectorize/read-only.ll

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 968d4719f59..c11c66f1aeb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -76,7 +76,7 @@ public:
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *Legal) {
     ///Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop();
+    createEmptyLoop(Legal);
     /// Widen each instruction in the old loop to a new one in the new loop.
     /// Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
@@ -86,7 +86,7 @@ public:
 
 private:
   /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop();
+  void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
   /// Insert the new loop to the loop hierarchy and pass manager.
@@ -107,10 +107,6 @@ private:
   /// for each element in the vector. Starting from zero.
   Value *getConsecutiveVector(Value* Val);
 
-  /// Check that the GEP operands are all uniform except for the last index
-  /// which has to be the induction variable.
-  bool isConsecutiveGep(GetElementPtrInst *Gep);
-
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
   /// When we widen (vectorize) values we place them in the map. If the values
@@ -196,6 +192,10 @@ public:
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
 
+  /// Check that the GEP operands are all uniform except for the last index
+  /// which has to be the induction variable.
+  bool isConsecutiveGep(Value *Ptr);
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -221,6 +221,8 @@ private:
   /// Returns true if the instruction I can be a reduction variable of type
   /// 'Kind'.
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns True, if 'Phi' is an induction variable.
+  bool isInductionVariable(PHINode *Phi);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -338,8 +340,8 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-
-bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
+bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (!Gep)
     return false;
 
@@ -348,7 +350,7 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 
   // Check that all of the gep indices are uniform except for the last.
   for (unsigned i = 0; i < NumOperands - 1; ++i)
-    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
+    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return false;
 
   // We can emit wide load/stores only of the last index is the induction
@@ -460,7 +462,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
-void SingleBlockLoopVectorizer::createEmptyLoop() {
+void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
@@ -510,7 +512,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
                                  "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = Orig->getHeader();
-  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+  OldInduction = Legal->getInduction();
   assert(OldInduction && "We must have a single phi node.");
   Type *IdxTy = OldInduction->getType();
 
@@ -637,7 +639,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // Special handling for the induction var.
         if (OldInduction == Inst)
           continue;
-        // This is phase I of vectorizing PHIs.
+        // This is phase one of vectorizing PHIs.
         // This has to be a reduction variable.
         assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
         Type *VecTy = VectorType::get(Inst->getType(), VF);
@@ -704,7 +706,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         unsigned Alignment = SI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
         // This store does not use GEPs.
-        if (!isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
@@ -728,7 +730,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
         // We don't have a gep. Scalarize the load.
-        if (!isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
@@ -930,6 +932,16 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
         return false;
       }
+
+      if (isInductionVariable(Phi)) {
+        if (Induction) {
+          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+          return false;
+        }
+        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+        Induction = Phi;
+        continue;
+      }
       if (AddReductionVar(Phi, IntegerAdd)) {
         DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
         continue;
@@ -938,28 +950,6 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
         continue;
       }
-      if (Induction) {
-        DEBUG(dbgs() << "LV: Found too many PHIs.\n");
-        return false;
-      }
-      // Found the induction variable.
-      Induction = Phi;
-
-      // Check that the PHI is consecutive and starts at zero.
-      const SCEV *PhiScev = SE->getSCEV(Phi);
-      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
-      if (!AR) {
-        DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
-        return false;
-      }
-
-      const SCEV *Step = AR->getStepRecurrence(*SE);
-      const SCEV *Start = AR->getStart();
-
-      if (!Step->isOne() || !Start->isZero()) {
-        DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
-        return false;
-      }
     }// end of PHI handling
 
     // We still don't handle functions.
@@ -1004,16 +994,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
-  // Holds the read and write pointers that we find.
-  typedef SmallVector<Value*, 10> ValueVector;
-  ValueVector Reads;
-  ValueVector Writes;
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
 
+  // Scan the BB and collect legal loads and stores.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
-    // If this is a load, record its pointer. If it is not a load, abort.
-    // Notice that we don't handle function calls that read or write.
+    // If this is a load, save it. If this instruction can read from memory
+    // but is not a load, then we quit. Notice that we don't handle function
+    // calls that read or write.
     if (I->mayReadFromMemory()) {
       LoadInst *Ld = dyn_cast<LoadInst>(I);
       if (!Ld) return false;
@@ -1021,13 +1014,11 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found a non-simple load.\n");
         return false;
       }
-
-      Value* Ptr = Ld->getPointerOperand();
-      GetUnderlyingObjects(Ptr, Reads, DL);
+      Loads.push_back(Ld);
+      continue;
     }
 
-    // Record store pointers. Abort on all other instructions that write to
-    // memory.
+    // Save store instructions. Abort if other instructions write to memory.
     if (I->mayWriteToMemory()) {
       StoreInst *St = dyn_cast<StoreInst>(I);
       if (!St) return false;
@@ -1035,45 +1026,99 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
         return false;
       }
-
-      Value* Ptr = St->getPointerOperand();
-      GetUnderlyingObjects(Ptr, Writes, DL);
+      Stores.push_back(St);
     }
   } // next instr.
 
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
 
-  // Check that the underlying objects of the reads and writes are either
-  // disjoint memory locations, or that they are no-alias arguments.
-  ValueVector::iterator r, re, w, we;
-  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (!isIdentifiedSafeObject(*r)) {
-      DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
-      return false;
-    }
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+        DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+        return true;
   }
 
-  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (!isIdentifiedSafeObject(*w)) {
-      DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n");
-      return false;
-    }
+  // Holds the read and read-write *pointers* that we find.
+  ValueVector Reads;
+  ValueVector ReadWrites;
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = dyn_cast<StoreInst>(*I);
+    assert(ST && "Bad StoreInst");
+    Value* Ptr = ST->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to
+    // the read-write list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr))
+      ReadWrites.push_back(Ptr);
   }
 
-  // Check that there are no multiple write locations to the same pointer.
-  SmallPtrSet<Value*, 8> WritePointerSet;
-  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (!WritePointerSet.insert(*w)) {
-      DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n");
-      return false;
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = dyn_cast<LoadInst>(*I);
+    assert(LD && "Bad LoadInst");
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x;  Because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+      Reads.push_back(Ptr);
+  }
+
+  // Now that the pointers are in two lists (Reads and ReadWrites), we
+  // can check that there are no conflicts between each of the writes and
+  // between the writes to the reads.
+  ValueSet WriteObjects;
+  ValueVector TempObjects;
+
+  // Check that the read-writes do not conflict with other read-write
+  // pointers.
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (!WriteObjects.insert(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **it <<"\n");
+        return false;
+      }
     }
+    TempObjects.clear();
   }
 
-  // Check that the reads and the writes are disjoint.
-  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (WritePointerSet.count(*r)) {
-      DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n");
-      return false;
+  /// Check that the reads don't conflict with the read-writes.
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (WriteObjects.count(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
+              << **it <<"\n");
+        return false;
+      }
     }
+    TempObjects.clear();
   }
 
   // All is okay.
@@ -1198,6 +1243,24 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     }
 }
 
+bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  // Check that the PHI is consecutive and starts at zero.
+  const SCEV *PhiScev = SE->getSCEV(Phi);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR) {
+    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+    return false;
+  }
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+  const SCEV *Start = AR->getStart();
+
+  if (!Step->isOne() || !Start->isZero()) {
+    DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+    return false;
+  }
+  return true;
+}
+
 } // namespace
 
 char LoopVectorize::ID = 0;
diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll
new file mode 100644
index 00000000000..e818d685626
--- /dev/null
+++ b/test/Transforms/LoopVectorize/increment.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s  -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+;  for (i=0; i<n; i++){
+;    a[i] += i;
+;  }
+;CHECK: @inc
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = trunc i64 %indvars.iv to i32
+  %5 = add nsw i32 %3, %4
+  store i32 %5, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; Can't vectorize this loop because the access to A[X] is non linear.
+;
+;  for (i = 0; i < n; ++i) {
+;    A[B[i]]++;
+;
+;CHECK: @histogram
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
+  %1 = load i32* %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll
new file mode 100644
index 00000000000..4095ea68ef7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/read-only.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_only_func
+;CHECK: load <4 x i32>
+;CHECK: ret i32
+define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = add nsw i64 %indvars.iv, 13
+  %5 = getelementptr inbounds i32* %B, i64 %4
+  %6 = load i32* %5, align 4
+  %7 = shl i32 %6, 1
+  %8 = add i32 %3, %sum.02
+  %9 = add i32 %8, %7
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
-- 
2.11.0