Vectorizer: Add support for loops with an unknown count. For example:

author Nadav Rotem <nrotem@apple.com>

Thu, 18 Oct 2012 05:29:12 +0000 (05:29 +0000)

committer Nadav Rotem <nrotem@apple.com>

Thu, 18 Oct 2012 05:29:12 +0000 (05:29 +0000)
author Nadav Rotem <nrotem@apple.com>
Thu, 18 Oct 2012 05:29:12 +0000 (05:29 +0000)
committer Nadav Rotem <nrotem@apple.com>
Thu, 18 Oct 2012 05:29:12 +0000 (05:29 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index f84e392..80fdad3 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -28,6 +28,8 @@
  #include "llvm/Analysis/LoopPass.h"
  #include "llvm/Value.h"
  #include "llvm/Function.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Verifier.h"
  #include "llvm/Module.h"
  #include "llvm/Type.h"
  #include "llvm/ADT/SmallVector.h"
@@ -65,8 +67,8 @@ public:
  
    /// Ctor.
    SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
-                            unsigned VecWidth):
-  Orig(OrigLoop), SE(Se), LI(Li), VF(VecWidth),
+                            LPPassManager *Lpm, unsigned VecWidth):
+  Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
     Builder(0), Induction(0), OldInduction(0) { }
  
    ~SingleBlockLoopVectorizer() {
@@ -76,20 +78,20 @@ public:
    // Perform the actual loop widening (vectorization).
    void vectorize() {
      ///Create a new empty loop. Unlink the old loop and connect the new one.
-    copyEmptyLoop();
+    createEmptyLoop();
      /// Widen each instruction in the old loop to a new one in the new loop.
      vectorizeLoop();
-    // Delete the old loop.
-    deleteOldLoop();
+    // register the new loop.
+    cleanup();
   }
  
  private:
    /// Create an empty loop, based on the loop ranges of the old loop.
-  void copyEmptyLoop();
+  void createEmptyLoop();
    /// Copy and widen the instructions from the old loop.
    void vectorizeLoop();
-  /// Delete the old loop.
-  void deleteOldLoop();
+  /// Insert the new loop into the loop hierarchy and pass manager.
+  void cleanup();
  
    /// This instruction is un-vectorizable. Implement it as a sequence
    /// of scalars.
@@ -123,6 +125,8 @@ private:
    ScalarEvolution *SE;
    // Loop Info.
    LoopInfo *LI;
+  // Loop Pass Manager.
+  LPPassManager *LPM;
    // The vectorization factor to use.
    unsigned VF;
  
@@ -132,9 +136,9 @@ private:
    // --- Vectorization state ---
  
    /// The new Induction variable which was added to the new block.
-  Instruction *Induction;
+  PHINode *Induction;
    /// The induction variable of the old basic block.
-  Instruction *OldInduction;
+  PHINode *OldInduction;
    // Maps scalars to widened vectors.
    DenseMap<Value*, Value*> WidenMap;
  };
@@ -184,6 +188,7 @@ struct LoopVectorize : public LoopPass {
    ScalarEvolution *SE;
    DataLayout *DL;
    LoopInfo *LI;
+  DominatorTree *DT;
  
    virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
      // Only vectorize innermost loops.
@@ -194,6 +199,7 @@ struct LoopVectorize : public LoopPass {
      SE = &getAnalysis<ScalarEvolution>();
      DL = getAnalysisIfAvailable<DataLayout>();
      LI = &getAnalysis<LoopInfo>();
+    DT = &getAnalysis<DominatorTree>();
  
      DEBUG(dbgs() << "LV: Checking a loop in \"" <<
            L->getHeader()->getParent()->getName() << "\"\n");
@@ -203,8 +209,7 @@ struct LoopVectorize : public LoopPass {
      unsigned MaxVF = LVL.getLoopMaxVF();
  
      // Check that we can vectorize using the chosen vectorization width.
-    if ((MaxVF < DefaultVectorizationFactor) ||
-        (MaxVF % DefaultVectorizationFactor)) {
+    if (MaxVF < DefaultVectorizationFactor) {
        DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
        return false;
      }
@@ -212,11 +217,10 @@ struct LoopVectorize : public LoopPass {
      DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
  
      // If we decided that is is *legal* to vectorizer the loop. Do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DefaultVectorizationFactor);
+    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
      LB.vectorize();
  
-    // The loop is now vectorized. Remove it from LMP.
-    LPM.deleteLoopFromQueue(L);
+    DEBUG(verifyFunction(*L->getHeader()->getParent()));
      return true;
    }
  
@@ -226,6 +230,7 @@ struct LoopVectorize : public LoopPass {
      AU.addRequired<AliasAnalysis>();
      AU.addRequired<LoopInfo>();
      AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTree>();
    }
  
  };
@@ -327,7 +332,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
      Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
  
      // If the src is an instruction that appeared earlier in the basic block
-    // then it should already be vectorized. 
+    // then it should already be vectorized.
      if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
        assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
        // The parameter is a vector value from earlier.
@@ -378,28 +383,71 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
      WidenMap[Instr] = VecResults;
  }
  
-void SingleBlockLoopVectorizer::copyEmptyLoop() {
-  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
-  BasicBlock *PH = Orig->getLoopPreheader();
+void SingleBlockLoopVectorizer::createEmptyLoop() {
+  /*
+   In this function we generate a new loop. The new loop will contain
+   the vectorized instructions while the old loop will continue to run the
+   scalar remainder.
+
+   [  ] <-- vector loop bypass.
+  /  |
+ /   v
+|   [ ]     <-- vector pre header.
+|    |
+|    v
+|   [  ] \
+|   [  ]_|   <-- vector loop.
+|    |
+ \   v
+   >[ ]   <--- middle-block.
+  /  |
+ /   v
+|   [ ]     <--- new preheader.
+|    |
+|    v
+|   [ ] \
+|   [ ]_|   <-- old scalar loop to handle remainder.
+ \   |
+  \  v
+   >[ ]     <-- exit block.
+   ...
+   */
+
+  // This is the original scalar-loop preheader.
+  BasicBlock *BypassBlock = Orig->getLoopPreheader();
    BasicBlock *ExitBlock = Orig->getExitBlock();
-  assert(ExitBlock && "Invalid loop exit");
+  assert(ExitBlock && "Must have an exit block");
+
+  BasicBlock *ScalarBody = Orig->getHeader();
+  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+  assert(ScalarBody && BypassBlock && "Invalid loop structure");
+
+  BasicBlock *VectorPH =
+      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
+                                                 "vector.body");
  
-  // Create a new single-basic block loop.
-  BasicBlock *BB = BasicBlock::Create(PH->getContext(), "vectorizedloop",
-                                      PH->getParent(), ExitBlock);
+  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
+                                                  "middle.block");
+
+
+  BasicBlock *ScalarPH =
+          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                       "scalar.preheader");
  
    // Find the induction variable.
    BasicBlock *OldBasicBlock = Orig->getHeader();
-  PHINode *OldInd = dyn_cast<PHINode>(OldBasicBlock->begin());
-  assert(OldInd && "We must have a single phi node.");
-  Type *IdxTy = OldInd->getType();
+  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+  assert(OldInduction && "We must have a single phi node.");
+  Type *IdxTy = OldInduction->getType();
  
    // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
    // inside the loop.
-  Builder = new IRBuilder<>(BB);
+  Builder = new IRBuilder<>(VecBody);
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
  
    // Generate the induction variable.
-  PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index");
+  Induction = Builder->CreatePHI(IdxTy, 2, "index");
    Constant *Zero = ConstantInt::get(IdxTy, 0);
    Constant *Step = ConstantInt::get(IdxTy, VF);
  
@@ -407,32 +455,78 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() {
    const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
    assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
  
-  // Get the trip count from the count by adding 1.
+  // Get the total trip count from the count by adding 1.
    ExitCount = SE->getAddExpr(ExitCount,
                               SE->getConstant(ExitCount->getType(), 1));
  
    // Expand the trip count and place the new instructions in the preheader.
    // Notice that the pre-header does not change, only the loop body.
    SCEVExpander Exp(*SE, "induction");
-  Instruction *Loc = Orig->getLoopPreheader()->getTerminator();
-  if (ExitCount->getType() != Phi->getType())
-    ExitCount = SE->getSignExtendExpr(ExitCount, Phi->getType());
-  Value *Count = Exp.expandCodeFor(ExitCount, Phi->getType(), Loc);
-  
+  Instruction *Loc = BypassBlock->getTerminator();
+
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  // We are using Zext because it should be less expensive.
+  if (ExitCount->getType() != Induction->getType())
+    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+  // Now we need to generate the expression for N - (N % VF), which is
+  // the part that the vectorized body will execute.
+  Constant *CIVF = ConstantInt::get(IdxTy, VF);
+  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
+  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+
+  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               "cmp.zero", Loc);
+  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
+  // Remove the old terminator.
+  Loc->eraseFromParent();
+
+  // Add a check in the middle block to see if we have completed
+  // all of the iterations in the first vector loop.
+  // If (N - N%VF) == N, then we *don't* need to run the remainder.
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                                CountRoundDown, "cmp.n",
+                                MiddleBlock->getTerminator());
+
+  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
+  // Remove the old terminator.
+  MiddleBlock->getTerminator()->eraseFromParent();
+
    // Create i+1 and fill the PHINode.
-  Value *Next = Builder->CreateAdd(Phi, Step, "index.next");
-  Phi->addIncoming(Zero, PH);
-  Phi->addIncoming(Next, BB);
+  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(NextIdx, VecBody);
    // Create the compare.
-  Value *ICmp = Builder->CreateICmpEQ(Next, Count);
-  Builder->CreateCondBr(ICmp, ExitBlock, BB);
-  // Fix preheader.
-  PH->getTerminator()->setSuccessor(0, BB);
-  Builder->SetInsertPoint(BB->getFirstInsertionPt());
-
-  // Save the induction variables.
-  Induction = Phi;
-  OldInduction = OldInd;
+  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+
+  // Now we have two terminators. Remove the old one from the block.
+  VecBody->getTerminator()->eraseFromParent();
+
+  // Fix the scalar body iteration count.
+  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
+
+  // Get ready to start creating new instructions into the vectorized body.
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Register the new loop.
+  Loop* Lp = new Loop();
+  LPM->insertLoop(Lp, Orig->getParentLoop());
+
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
+  Loop *ParentLoop = Orig->getParentLoop();
+  if (ParentLoop) {
+    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  }
  }
  
  void SingleBlockLoopVectorizer::vectorizeLoop() {
@@ -575,16 +669,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
    }// end of for_each instr.
  }
  
-void SingleBlockLoopVectorizer::deleteOldLoop() {
+void SingleBlockLoopVectorizer::cleanup() {
    // The original basic block.
-  BasicBlock *BB = Orig->getHeader();
    SE->forgetLoop(Orig);
-
-  LI->removeBlock(BB);
-  Orig->addBasicBlockToLoop(Induction->getParent(), LI->getBase());
-
-  // Remove the old loop block.
-  DeleteDeadBlock(BB);
  }
  
  unsigned LoopVectorizationLegality::getLoopMaxVF() {
@@ -605,26 +692,25 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
    BasicBlock *BB = TheLoop->getHeader();
    DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
  
-  // Find the max vectorization factor.
-  unsigned MaxVF = SE->getSmallConstantTripMultiple(TheLoop, BB);
-
-
-  // Perform an early check. Do not scan the block if we did not find a loop.
-  if (MaxVF < 2) {
-    DEBUG(dbgs() << "LV: Can't find a vectorizable loop structure\n");
-    return 1;
-  }
-
    // Go over each instruction and look at memory deps.
    if (!canVectorizeBlock(*BB)) {
      DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
      return 1;
    }
  
-  DEBUG(dbgs() << "LV: We can vectorize this loop! VF="<<MaxVF<<"\n");
-  
-  // Okay! We can vectorize. Return the max trip multiple.
-  return MaxVF;
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+    return 1;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+
+  // Okay! We can vectorize. At this point we don't have any other mem analysis
+  // which may limit our maximum vectorization factor, so just return the
+  // maximum SIMD size.
+  return DefaultVectorizationFactor;
  }
  
  bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
@@ -725,6 +811,11 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
      }
    } // next instr.
  
+  if (NumPhis != 1) {
+      DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
+      return false;
+  }
+
    // Check that the underlying objects of the reads and writes are either
    // disjoint memory locations, or that they are no-alias arguments.
    ValueVector::iterator r, re, w, we;
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll

index 68eab9b..4e9e6f9 100644 (file)
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -45,9 +45,8 @@ define void @example1() nounwind uwtable ssp {
    ret void
  }
  
-; We can't vectorize this loop because it has non constant loop bounds.
  ;CHECK: @example2
-;CHECK-NOT: <4 x i32>
+;CHECK: store <4 x i32>
  ;CHECK: ret void
  define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
    %1 = icmp sgt i32 %n, 0
@@ -114,9 +113,8 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
    ret void
  }
  
-; We can't vectorize this loop because it has non constant loop bounds.
  ;CHECK: @example4
-;CHECK-NOT: <4 x i32>
+;CHECK: load <4 x i32>
  ;CHECK: ret void
  define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
    %1 = add nsw i32 %n, -1
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll

index 76835b7..04c5c84 100644 (file)
--- a/test/Transforms/LoopVectorize/non-const-n.ll
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
  
  ;CHECK: @example1
  ;CHECK: shl i32
-;CHECK: sext i32
+;CHECK: zext i32
  ;CHECK: load <4 x i32>
  ;CHECK: add <4 x i32>
  ;CHECK: store <4 x i32>
author	Nadav Rotem <nrotem@apple.com>
	Thu, 18 Oct 2012 05:29:12 +0000 (05:29 +0000)
committer	Nadav Rotem <nrotem@apple.com>
	Thu, 18 Oct 2012 05:29:12 +0000 (05:29 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/gcc-examples.ll		patch \| blob \| history
test/Transforms/LoopVectorize/non-const-n.ll		patch \| blob \| history