From bc38653ff6295c9c3f97f999a65e6d16df57b8e9 Mon Sep 17 00:00:00 2001 From: Matthew Simpson Date: Thu, 14 Jul 2016 14:36:06 +0000 Subject: [PATCH] [LV] Avoid unnecessary IV scalar-to-vector-to-scalar conversions This patch prevents increases in the number of instructions, pre-instcombine, due to induction variable scalarization. An increase in instructions can lead to an increase in the compile-time required to simplify the induction variables. We now maintain a new map for scalarized induction variables to prevent us from converting between the scalar and vector forms. This patch should resolve compile-time regressions seen after r274627. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275419 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 114 +++++++++++++-------- test/Transforms/LoopVectorize/induction.ll | 49 +++++++++ test/Transforms/LoopVectorize/reverse_induction.ll | 24 ----- 3 files changed, 118 insertions(+), 69 deletions(-) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index eebcc47ead0..13589418706 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -403,12 +403,12 @@ protected: /// to each vector element of Val. The sequence starts at StartIndex. virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); - /// Compute a step vector like the above function, but scalarize the - /// arithmetic instead. The results of the computation are inserted into a - /// new vector with VF elements. \p Val is the initial value, \p Step is the - /// size of the step, and \p StartIdx indicates the index of the increment - /// from which to start computing the steps. - Value *getScalarizedStepVector(Value *Val, int StartIdx, Value *Step); + /// Compute scalar induction steps. \p ScalarIV is the scalar induction + /// variable on which to base the steps, \p Step is the size of the step, and + /// \p EntryVal is the value from the original loop that maps to the steps. + /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it + /// can be a truncate instruction). + void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal); /// Create a vector induction phi node based on an existing scalar one. This /// currently only works for integer induction variables with a constant @@ -417,11 +417,11 @@ protected: void createVectorIntInductionPHI(const InductionDescriptor &II, VectorParts &Entry, IntegerType *TruncType); - /// Widen an integer induction variable \p IV. If \p TruncType is provided, - /// the induction variable will first be truncated to the specified type. The + /// Widen an integer induction variable \p IV. If \p Trunc is provided, the + /// induction variable will first be truncated to the corresponding type. The /// widened values are placed in \p Entry. void widenIntInduction(PHINode *IV, VectorParts &Entry, - IntegerType *TruncType = nullptr); + TruncInst *Trunc = nullptr); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -572,6 +572,17 @@ protected: PHINode *OldInduction; /// Maps scalars to widened vectors. ValueMap WidenMap; + + /// A map of induction variables from the original loop to their + /// corresponding VF * UF scalarized values in the vectorized loop. The + /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap + /// maps original loop values to their vector versions in the new loop, + /// ScalarIVMap maps induction variables from the original loop that are not + /// vectorized to their scalar equivalents in the vector loop. Maintaining a + /// separate map for scalarized induction variables allows us to avoid + /// unnecessary scalar-to-vector-to-scalar conversions. + DenseMap> ScalarIVMap; + /// Store instructions that should be predicated, as a pair /// SmallVector, 4> PredicatedStores; @@ -1889,7 +1900,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( } void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, - IntegerType *TruncType) { + TruncInst *Trunc) { auto II = Legal->getInductionVars()->find(IV); assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); @@ -1897,6 +1908,9 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + // If a truncate instruction was provided, get the smaller type. + auto *TruncType = Trunc ? cast(Trunc->getType()) : nullptr; + // The step of the induction. Value *Step = nullptr; @@ -1939,20 +1953,21 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, } } - // If an induction variable is only used for counting loop iterations or - // calculating addresses, it shouldn't be widened. Scalarize the step vector - // to give InstCombine a better chance of simplifying it. - if (VF > 1 && ValuesNotWidened->count(IV)) { - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getScalarizedStepVector(ScalarIV, VF * Part, Step); - return; - } - - // Finally, splat the scalar induction variable, and build the necessary step - // vectors. + // Splat the scalar induction variable, and build the necessary step vectors. Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + + // If an induction variable is only used for counting loop iterations or + // calculating addresses, it doesn't need to be widened. Create scalar steps + // that can be used by instructions we will later scalarize. Note that the + // addition of the scalar steps will not increase the number of instructions + // in the loop in the common case prior to InstCombine. We will be trading + // one vector extract for each scalar step. + if (VF > 1 && ValuesNotWidened->count(IV)) { + auto *EntryVal = Trunc ? cast(Trunc) : IV; + buildScalarSteps(ScalarIV, Step, EntryVal); + } } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, @@ -1983,27 +1998,25 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, return Builder.CreateAdd(Val, Step, "induction"); } -Value *InnerLoopVectorizer::getScalarizedStepVector(Value *Val, int StartIdx, - Value *Step) { +void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, + Value *EntryVal) { - // We can't create a vector with less than two elements. + // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. - Type *ValTy = Val->getType()->getScalarType(); - assert(ValTy->isIntegerTy() && ValTy == Step->getType() && + Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); + assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && "Val and Step should have the same integer type"); - // Compute the scalarized step vector. We perform scalar arithmetic and then - // insert the results into the step vector. - Value *StepVector = UndefValue::get(ToVectorTy(ValTy, VF)); - for (unsigned I = 0; I < VF; ++I) { - auto *Mul = Builder.CreateMul(ConstantInt::get(ValTy, StartIdx + I), Step); - auto *Add = Builder.CreateAdd(Val, Mul); - StepVector = Builder.CreateInsertElement(StepVector, Add, I); - } - - return StepVector; + // Compute the scalar steps and save the results in ScalarIVMap. + for (unsigned Part = 0; Part < UF; ++Part) + for (unsigned I = 0; I < VF; ++I) { + auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I); + auto *Mul = Builder.CreateMul(StartIdx, Step); + auto *Add = Builder.CreateAdd(ScalarIV, Mul); + ScalarIVMap[EntryVal].push_back(Add); + } } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { @@ -2459,8 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); - Value *Index = GEPParts[0]; - Index = Builder.CreateExtractElement(Index, Zero); + + // If GepOperand is an induction variable, and there's a scalarized + // version of it available, use it. Otherwise, we will need to create + // an extractelement instruction. + Value *Index = ScalarIVMap.count(GepOperand) + ? ScalarIVMap[GepOperand][0] + : Builder.CreateExtractElement(GEPParts[0], Zero); + Gep2->setOperand(i, Index); Gep2->setName("gep.indvar.idx"); } @@ -2663,11 +2682,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, Cloned->setName(Instr->getName() + ".cloned"); // Replace the operands of the cloned instructions with extracted scalars. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - Value *Op = Params[op][Part]; - // Param is a vector. Need to extract the right lane. - if (Op->getType()->isVectorTy()) - Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); - Cloned->setOperand(op, Op); + + // If the operand is an induction variable, and there's a scalarized + // version of it available, use it. Otherwise, we will need to create + // an extractelement instruction if vectorizing. + auto *NewOp = Params[op][Part]; + auto *ScalarOp = Instr->getOperand(op); + if (ScalarIVMap.count(ScalarOp)) + NewOp = ScalarIVMap[ScalarOp][VF * Part + Width]; + else if (NewOp->getType()->isVectorTy()) + NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width)); + Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -4160,8 +4185,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { auto ID = Legal->getInductionVars()->lookup(OldInduction); if (isa(CI) && CI->getOperand(0) == OldInduction && ID.getConstIntStepValue()) { - auto *TruncType = cast(CI->getType()); - widenIntInduction(OldInduction, Entry, TruncType); + widenIntInduction(OldInduction, Entry, cast(CI)); addMetadata(Entry, &I); break; } diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index beee3978abb..c1f0bd95dbd 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -73,6 +74,26 @@ loopexit: ; for (int i = 0; i < n; ++i) ; sum += a[i]; ; +; CHECK-LABEL: @scalarize_induction_variable_01( +; CHECK: vector.body: +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %[[i0:.+]] = add i64 %index, 0 +; CHECK: %[[i1:.+]] = add i64 %index, 1 +; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i0]] +; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i1]] +; +; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01( +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; UNROLL-NO-IC: %[[i0:.+]] = add i64 %index, 0 +; UNROLL-NO-IC: %[[i1:.+]] = add i64 %index, 1 +; UNROLL-NO-IC: %[[i2:.+]] = add i64 %index, 2 +; UNROLL-NO-IC: %[[i3:.+]] = add i64 %index, 3 +; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i0]] +; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i1]] +; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i2]] +; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i3]] +; ; IND-LABEL: @scalarize_induction_variable_01( ; IND: vector.body: ; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] @@ -112,6 +133,34 @@ for.end: ; for (int i ; 0; i < n; i += 8) ; s += (a[i] + b[i] + 1.0f); ; +; CHECK-LABEL: @scalarize_induction_variable_02( +; CHECK: vector.body: +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = shl i64 %index, 3 +; CHECK: %[[i0:.+]] = add i64 %offset.idx, 0 +; CHECK: %[[i1:.+]] = add i64 %offset.idx, 8 +; CHECK: getelementptr inbounds float, float* %a, i64 %[[i0]] +; CHECK: getelementptr inbounds float, float* %a, i64 %[[i1]] +; CHECK: getelementptr inbounds float, float* %b, i64 %[[i0]] +; CHECK: getelementptr inbounds float, float* %b, i64 %[[i1]] +; +; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02( +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; UNROLL-NO-IC: %offset.idx = shl i64 %index, 3 +; UNROLL-NO-IC: %[[i0:.+]] = add i64 %offset.idx, 0 +; UNROLL-NO-IC: %[[i1:.+]] = add i64 %offset.idx, 8 +; UNROLL-NO-IC: %[[i2:.+]] = add i64 %offset.idx, 16 +; UNROLL-NO-IC: %[[i3:.+]] = add i64 %offset.idx, 24 +; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i0]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i1]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i2]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i3]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i0]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i1]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i2]] +; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i3]] +; ; IND-LABEL: @scalarize_induction_variable_02( ; IND: vector.body: ; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll index 7eb35100c75..24ffb6167de 100644 --- a/test/Transforms/LoopVectorize/reverse_induction.ll +++ b/test/Transforms/LoopVectorize/reverse_induction.ll @@ -8,21 +8,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %offset.idx = sub i64 %startval, %index ; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 -; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0 ; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1 -; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1 ; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2 -; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2 ; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3 -; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3 ; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 -; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0 ; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5 -; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1 ; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6 -; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2 ; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7 -; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) { entry: @@ -48,21 +40,13 @@ loopend: ; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %offset.idx = sub i128 %startval, %index ; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0 -; CHECK: %[[v0:.+]] = insertelement <4 x i128> undef, i128 %[[a0]], i64 0 ; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1 -; CHECK: %[[v1:.+]] = insertelement <4 x i128> %[[v0]], i128 %[[a1]], i64 1 ; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2 -; CHECK: %[[v2:.+]] = insertelement <4 x i128> %[[v1]], i128 %[[a2]], i64 2 ; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3 -; CHECK: %[[v3:.+]] = insertelement <4 x i128> %[[v2]], i128 %[[a3]], i64 3 ; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4 -; CHECK: %[[v4:.+]] = insertelement <4 x i128> undef, i128 %[[a4]], i64 0 ; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5 -; CHECK: %[[v5:.+]] = insertelement <4 x i128> %[[v4]], i128 %[[a5]], i64 1 ; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6 -; CHECK: %[[v6:.+]] = insertelement <4 x i128> %[[v5]], i128 %[[a6]], i64 2 ; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7 -; CHECK: %[[v7:.+]] = insertelement <4 x i128> %[[v6]], i128 %[[a7]], i64 3 define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) { entry: @@ -88,21 +72,13 @@ loopend: ; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %offset.idx = sub i16 %startval, {{.*}} ; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0 -; CHECK: %[[v0:.+]] = insertelement <4 x i16> undef, i16 %[[a0]], i64 0 ; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1 -; CHECK: %[[v1:.+]] = insertelement <4 x i16> %[[v0]], i16 %[[a1]], i64 1 ; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2 -; CHECK: %[[v2:.+]] = insertelement <4 x i16> %[[v1]], i16 %[[a2]], i64 2 ; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3 -; CHECK: %[[v3:.+]] = insertelement <4 x i16> %[[v2]], i16 %[[a3]], i64 3 ; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4 -; CHECK: %[[v4:.+]] = insertelement <4 x i16> undef, i16 %[[a4]], i64 0 ; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5 -; CHECK: %[[v5:.+]] = insertelement <4 x i16> %[[v4]], i16 %[[a5]], i64 1 ; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6 -; CHECK: %[[v6:.+]] = insertelement <4 x i16> %[[v5]], i16 %[[a6]], i64 2 ; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7 -; CHECK: %[[v7:.+]] = insertelement <4 x i16> %[[v6]], i16 %[[a7]], i64 3 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) { entry: -- 2.11.0