From bc38653ff6295c9c3f97f999a65e6d16df57b8e9 Mon Sep 17 00:00:00 2001
From: Matthew Simpson <mssimpso@codeaurora.org>
Date: Thu, 14 Jul 2016 14:36:06 +0000
Subject: [PATCH] [LV] Avoid unnecessary IV scalar-to-vector-to-scalar
 conversions

This patch prevents increases in the number of instructions, pre-instcombine,
due to induction variable scalarization. An increase in instructions can lead
to an increase in the compile-time required to simplify the induction
variables. We now maintain a new map for scalarized induction variables to
prevent us from converting between the scalar and vector forms.

This patch should resolve compile-time regressions seen after r274627.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275419 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 114 +++++++++++++--------
 test/Transforms/LoopVectorize/induction.ll         |  49 +++++++++
 test/Transforms/LoopVectorize/reverse_induction.ll |  24 -----
 3 files changed, 118 insertions(+), 69 deletions(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index eebcc47ead0..13589418706 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -403,12 +403,12 @@ protected:
   /// to each vector element of Val. The sequence starts at StartIndex.
   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
 
-  /// Compute a step vector like the above function, but scalarize the
-  /// arithmetic instead. The results of the computation are inserted into a
-  /// new vector with VF elements. \p Val is the initial value, \p Step is the
-  /// size of the step, and \p StartIdx indicates the index of the increment
-  /// from which to start computing the steps.
-  Value *getScalarizedStepVector(Value *Val, int StartIdx, Value *Step);
+  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+  /// variable on which to base the steps, \p Step is the size of the step, and
+  /// \p EntryVal is the value from the original loop that maps to the steps.
+  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
+  /// can be a truncate instruction).
+  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
 
   /// Create a vector induction phi node based on an existing scalar one. This
   /// currently only works for integer induction variables with a constant
@@ -417,11 +417,11 @@ protected:
   void createVectorIntInductionPHI(const InductionDescriptor &II,
                                    VectorParts &Entry, IntegerType *TruncType);
 
-  /// Widen an integer induction variable \p IV. If \p TruncType is provided,
-  /// the induction variable will first be truncated to the specified type. The
+  /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
+  /// induction variable will first be truncated to the corresponding type. The
   /// widened values are placed in \p Entry.
   void widenIntInduction(PHINode *IV, VectorParts &Entry,
-                         IntegerType *TruncType = nullptr);
+                         TruncInst *Trunc = nullptr);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -572,6 +572,17 @@ protected:
   PHINode *OldInduction;
   /// Maps scalars to widened vectors.
   ValueMap WidenMap;
+
+  /// A map of induction variables from the original loop to their
+  /// corresponding VF * UF scalarized values in the vectorized loop. The
+  /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap
+  /// maps original loop values to their vector versions in the new loop,
+  /// ScalarIVMap maps induction variables from the original loop that are not
+  /// vectorized to their scalar equivalents in the vector loop. Maintaining a
+  /// separate map for scalarized induction variables allows us to avoid
+  /// unnecessary scalar-to-vector-to-scalar conversions.
+  DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap;
+
   /// Store instructions that should be predicated, as a pair
   ///   <StoreInst, Predicate>
   SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
@@ -1889,7 +1900,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
 }
 
 void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
-                                            IntegerType *TruncType) {
+                                            TruncInst *Trunc) {
 
   auto II = Legal->getInductionVars()->find(IV);
   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -1897,6 +1908,9 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
   auto ID = II->second;
   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
 
+  // If a truncate instruction was provided, get the smaller type.
+  auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr;
+
   // The step of the induction.
   Value *Step = nullptr;
 
@@ -1939,20 +1953,21 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
     }
   }
 
-  // If an induction variable is only used for counting loop iterations or
-  // calculating addresses, it shouldn't be widened. Scalarize the step vector
-  // to give InstCombine a better chance of simplifying it.
-  if (VF > 1 && ValuesNotWidened->count(IV)) {
-    for (unsigned Part = 0; Part < UF; ++Part)
-      Entry[Part] = getScalarizedStepVector(ScalarIV, VF * Part, Step);
-    return;
-  }
-
-  // Finally, splat the scalar induction variable, and build the necessary step
-  // vectors.
+  // Splat the scalar induction variable, and build the necessary step vectors.
   Value *Broadcasted = getBroadcastInstrs(ScalarIV);
   for (unsigned Part = 0; Part < UF; ++Part)
     Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+
+  // If an induction variable is only used for counting loop iterations or
+  // calculating addresses, it doesn't need to be widened. Create scalar steps
+  // that can be used by instructions we will later scalarize. Note that the
+  // addition of the scalar steps will not increase the number of instructions
+  // in the loop in the common case prior to InstCombine. We will be trading
+  // one vector extract for each scalar step.
+  if (VF > 1 && ValuesNotWidened->count(IV)) {
+    auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV;
+    buildScalarSteps(ScalarIV, Step, EntryVal);
+  }
 }
 
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
@@ -1983,27 +1998,25 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
   return Builder.CreateAdd(Val, Step, "induction");
 }
 
-Value *InnerLoopVectorizer::getScalarizedStepVector(Value *Val, int StartIdx,
-                                                    Value *Step) {
+void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
+                                           Value *EntryVal) {
 
-  // We can't create a vector with less than two elements.
+  // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF > 1 && "VF should be greater than one");
 
   // Get the value type and ensure it and the step have the same integer type.
-  Type *ValTy = Val->getType()->getScalarType();
-  assert(ValTy->isIntegerTy() && ValTy == Step->getType() &&
+  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
+  assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
          "Val and Step should have the same integer type");
 
-  // Compute the scalarized step vector. We perform scalar arithmetic and then
-  // insert the results into the step vector.
-  Value *StepVector = UndefValue::get(ToVectorTy(ValTy, VF));
-  for (unsigned I = 0; I < VF; ++I) {
-    auto *Mul = Builder.CreateMul(ConstantInt::get(ValTy, StartIdx + I), Step);
-    auto *Add = Builder.CreateAdd(Val, Mul);
-    StepVector = Builder.CreateInsertElement(StepVector, Add, I);
-  }
-
-  return StepVector;
+  // Compute the scalar steps and save the results in ScalarIVMap.
+  for (unsigned Part = 0; Part < UF; ++Part)
+    for (unsigned I = 0; I < VF; ++I) {
+      auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I);
+      auto *Mul = Builder.CreateMul(StartIdx, Step);
+      auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+      ScalarIVMap[EntryVal].push_back(Add);
+    }
 }
 
 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
@@ -2459,8 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
                  "Must be last index or loop invariant");
 
           VectorParts &GEPParts = getVectorValue(GepOperand);
-          Value *Index = GEPParts[0];
-          Index = Builder.CreateExtractElement(Index, Zero);
+
+          // If GepOperand is an induction variable, and there's a scalarized
+          // version of it available, use it. Otherwise, we will need to create
+          // an extractelement instruction.
+          Value *Index = ScalarIVMap.count(GepOperand)
+                             ? ScalarIVMap[GepOperand][0]
+                             : Builder.CreateExtractElement(GEPParts[0], Zero);
+
           Gep2->setOperand(i, Index);
           Gep2->setName("gep.indvar.idx");
         }
@@ -2663,11 +2682,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
         Cloned->setName(Instr->getName() + ".cloned");
       // Replace the operands of the cloned instructions with extracted scalars.
       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
-        Value *Op = Params[op][Part];
-        // Param is a vector. Need to extract the right lane.
-        if (Op->getType()->isVectorTy())
-          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
-        Cloned->setOperand(op, Op);
+
+        // If the operand is an induction variable, and there's a scalarized
+        // version of it available, use it. Otherwise, we will need to create
+        // an extractelement instruction if vectorizing.
+        auto *NewOp = Params[op][Part];
+        auto *ScalarOp = Instr->getOperand(op);
+        if (ScalarIVMap.count(ScalarOp))
+          NewOp = ScalarIVMap[ScalarOp][VF * Part + Width];
+        else if (NewOp->getType()->isVectorTy())
+          NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width));
+        Cloned->setOperand(op, NewOp);
       }
       addNewMetadata(Cloned, Instr);
 
@@ -4160,8 +4185,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       auto ID = Legal->getInductionVars()->lookup(OldInduction);
       if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
           ID.getConstIntStepValue()) {
-        auto *TruncType = cast<IntegerType>(CI->getType());
-        widenIntInduction(OldInduction, Entry, TruncType);
+        widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI));
         addMetadata(Entry, &I);
         break;
       }
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index beee3978abb..c1f0bd95dbd 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -73,6 +74,26 @@ loopexit:
 ; for (int i = 0; i < n; ++i)
 ;   sum += a[i];
 ;
+; CHECK-LABEL: @scalarize_induction_variable_01(
+; CHECK: vector.body:
+; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %[[i0:.+]] = add i64 %index, 0
+; CHECK:   %[[i1:.+]] = add i64 %index, 1
+; CHECK:   getelementptr inbounds i64, i64* %a, i64 %[[i0]]
+; CHECK:   getelementptr inbounds i64, i64* %a, i64 %[[i1]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC:   %[[i0:.+]] = add i64 %index, 0
+; UNROLL-NO-IC:   %[[i1:.+]] = add i64 %index, 1
+; UNROLL-NO-IC:   %[[i2:.+]] = add i64 %index, 2
+; UNROLL-NO-IC:   %[[i3:.+]] = add i64 %index, 3
+; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i0]]
+; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i1]]
+; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i2]]
+; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i3]]
+;
 ; IND-LABEL: @scalarize_induction_variable_01(
 ; IND:     vector.body:
 ; IND:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
@@ -112,6 +133,34 @@ for.end:
 ; for (int i ; 0; i < n; i += 8)
 ;   s += (a[i] + b[i] + 1.0f);
 ;
+; CHECK-LABEL: @scalarize_induction_variable_02(
+; CHECK: vector.body:
+; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %offset.idx = shl i64 %index, 3
+; CHECK:   %[[i0:.+]] = add i64 %offset.idx, 0
+; CHECK:   %[[i1:.+]] = add i64 %offset.idx, 8
+; CHECK:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; CHECK:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; CHECK:   getelementptr inbounds float, float* %b, i64 %[[i0]]
+; CHECK:   getelementptr inbounds float, float* %b, i64 %[[i1]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC:   %offset.idx = shl i64 %index, 3
+; UNROLL-NO-IC:   %[[i0:.+]] = add i64 %offset.idx, 0
+; UNROLL-NO-IC:   %[[i1:.+]] = add i64 %offset.idx, 8
+; UNROLL-NO-IC:   %[[i2:.+]] = add i64 %offset.idx, 16
+; UNROLL-NO-IC:   %[[i3:.+]] = add i64 %offset.idx, 24
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i3]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i0]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i1]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i2]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i3]]
+;
 ; IND-LABEL: @scalarize_induction_variable_02(
 ; IND: vector.body:
 ; IND:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll
index 7eb35100c75..24ffb6167de 100644
--- a/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -8,21 +8,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i64 %startval, %index
 ; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0
 ; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1
 ; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2
 ; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
-; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3
 ; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0
 ; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1
 ; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2
 ; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
-; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3
 
 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
 entry:
@@ -48,21 +40,13 @@ loopend:
 ; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i128 %startval, %index
 ; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
-; CHECK: %[[v0:.+]] = insertelement <4 x i128> undef, i128 %[[a0]], i64 0
 ; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
-; CHECK: %[[v1:.+]] = insertelement <4 x i128> %[[v0]], i128 %[[a1]], i64 1
 ; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
-; CHECK: %[[v2:.+]] = insertelement <4 x i128> %[[v1]], i128 %[[a2]], i64 2
 ; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
-; CHECK: %[[v3:.+]] = insertelement <4 x i128> %[[v2]], i128 %[[a3]], i64 3
 ; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
-; CHECK: %[[v4:.+]] = insertelement <4 x i128> undef, i128 %[[a4]], i64 0
 ; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
-; CHECK: %[[v5:.+]] = insertelement <4 x i128> %[[v4]], i128 %[[a5]], i64 1
 ; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
-; CHECK: %[[v6:.+]] = insertelement <4 x i128> %[[v5]], i128 %[[a6]], i64 2
 ; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7
-; CHECK: %[[v7:.+]] = insertelement <4 x i128> %[[v6]], i128 %[[a7]], i64 3
 
 define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
 entry:
@@ -88,21 +72,13 @@ loopend:
 ; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i16 %startval, {{.*}}
 ; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
-; CHECK: %[[v0:.+]] = insertelement <4 x i16> undef, i16 %[[a0]], i64 0
 ; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
-; CHECK: %[[v1:.+]] = insertelement <4 x i16> %[[v0]], i16 %[[a1]], i64 1
 ; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
-; CHECK: %[[v2:.+]] = insertelement <4 x i16> %[[v1]], i16 %[[a2]], i64 2
 ; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
-; CHECK: %[[v3:.+]] = insertelement <4 x i16> %[[v2]], i16 %[[a3]], i64 3
 ; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
-; CHECK: %[[v4:.+]] = insertelement <4 x i16> undef, i16 %[[a4]], i64 0
 ; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
-; CHECK: %[[v5:.+]] = insertelement <4 x i16> %[[v4]], i16 %[[a5]], i64 1
 ; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
-; CHECK: %[[v6:.+]] = insertelement <4 x i16> %[[v5]], i16 %[[a6]], i64 2
 ; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7
-; CHECK: %[[v7:.+]] = insertelement <4 x i16> %[[v6]], i16 %[[a7]], i64 3
 
 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
 entry:
-- 
2.11.0