[LV] Move isLegalMasked* functions from Legality to CostModel

author Renato Golin <renato.golin@linaro.org>

Mon, 26 Feb 2018 11:06:36 +0000 (11:06 +0000)

committer Renato Golin <renato.golin@linaro.org>

Mon, 26 Feb 2018 11:06:36 +0000 (11:06 +0000)
author Renato Golin <renato.golin@linaro.org>
Mon, 26 Feb 2018 11:06:36 +0000 (11:06 +0000)
committer Renato Golin <renato.golin@linaro.org>
Mon, 26 Feb 2018 11:06:36 +0000 (11:06 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 7f45c7a..963dad5 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1648,58 +1648,12 @@ public:
  
    bool hasStride(Value *V) { return LAI->hasStride(V); }
  
-  /// Returns true if the target machine supports masked store operation
-  /// for the given \p DataType and kind of access to \p Ptr.
-  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
-    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
-  }
-
-  /// Returns true if the target machine supports masked load operation
-  /// for the given \p DataType and kind of access to \p Ptr.
-  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
-    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
-  }
-
-  /// Returns true if the target machine supports masked scatter operation
-  /// for the given \p DataType.
-  bool isLegalMaskedScatter(Type *DataType) {
-    return TTI->isLegalMaskedScatter(DataType);
-  }
-
-  /// Returns true if the target machine supports masked gather operation
-  /// for the given \p DataType.
-  bool isLegalMaskedGather(Type *DataType) {
-    return TTI->isLegalMaskedGather(DataType);
-  }
-
-  /// Returns true if the target machine can represent \p V as a masked gather
-  /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
-    auto *LI = dyn_cast<LoadInst>(V);
-    auto *SI = dyn_cast<StoreInst>(V);
-    if (!LI && !SI)
-      return false;
-    auto *Ptr = getPointerOperand(V);
-    auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
-    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
-  }
-
    /// Returns true if vector representation of the instruction \p I
    /// requires mask.
    bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
  
    unsigned getNumStores() const { return LAI->getNumStores(); }
    unsigned getNumLoads() const { return LAI->getNumLoads(); }
-  unsigned getNumPredStores() const { return NumPredStores; }
-
-  /// Returns true if \p I is an instruction that will be scalarized with
-  /// predication. Such instructions include conditional stores and
-  /// instructions that may divide by zero.
-  bool isScalarWithPredication(Instruction *I);
-
-  /// Returns true if \p I is a memory instruction with consecutive memory
-  /// access that can be widened.
-  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
  
    // Returns true if the NoNaN attribute is set on the function.
    bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
@@ -1753,8 +1707,6 @@ private:
      return LAI ? &LAI->getSymbolicStrides() : nullptr;
    }
  
-  unsigned NumPredStores = 0;
-
    /// The loop that we evaluate.
    Loop *TheLoop;
  
@@ -2060,7 +2012,53 @@ public:
      collectLoopScalars(VF);
    }
  
+  /// Returns true if the target machine supports masked store operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+  }
+
+  /// Returns true if the target machine supports masked load operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+  }
+
+  /// Returns true if the target machine supports masked scatter operation
+  /// for the given \p DataType.
+  bool isLegalMaskedScatter(Type *DataType) {
+    return TTI.isLegalMaskedScatter(DataType);
+  }
+
+  /// Returns true if the target machine supports masked gather operation
+  /// for the given \p DataType.
+  bool isLegalMaskedGather(Type *DataType) {
+    return TTI.isLegalMaskedGather(DataType);
+  }
+
+  /// Returns true if the target machine can represent \p V as a masked gather
+  /// or scatter operation.
+  bool isLegalGatherOrScatter(Value *V) {
+    bool LI = isa<LoadInst>(V);
+    bool SI = isa<StoreInst>(V);
+    if (!LI && !SI)
+      return false;
+    auto *Ty = getMemInstValueType(V);
+    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+  }
+
+  /// Returns true if \p I is an instruction that will be scalarized with
+  /// predication. Such instructions include conditional stores and
+  /// instructions that may divide by zero.
+  bool isScalarWithPredication(Instruction *I);
+
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
  private:
+  unsigned NumPredStores = 0;
+
    /// \return An upper bound for the vectorization factor, larger than zero.
    /// One is returned if vectorization should best be avoided due to cost.
    unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
@@ -2112,6 +2110,10 @@ private:
    /// as a vector operation.
    bool isConsecutiveLoadOrStore(Instruction *I);
  
+  /// Returns true if an artificially high cost for emulated masked memrefs
+  /// should be used.
+  bool useEmulatedMaskMemRefHack(Instruction *I);
+
    /// Create an analysis remark that explains why vectorization failed
    ///
    /// \p RemarkName is the identifier for the remark.  \return the remark object
@@ -5421,14 +5423,22 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
    Scalars[VF].insert(Worklist.begin(), Worklist.end());
  }
  
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
-  if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+  if (!Legal->blockNeedsPredication(I->getParent()))
      return false;
    switch(I->getOpcode()) {
    default:
      break;
-  case Instruction::Store:
-    return !isMaskRequired(I);
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (!Legal->isMaskRequired(I))
+      return false;
+    auto *Ptr = getPointerOperand(I);
+    auto *Ty = getMemInstValueType(I);
+    return isa<LoadInst>(I) ?
+        !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
+      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+  }
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::SRem:
@@ -5438,8 +5448,8 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
    return false;
  }
  
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
-                                                              unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+                                                               unsigned VF) {
    // Get and ensure we have a valid memory instruction.
    LoadInst *LI = dyn_cast<LoadInst>(I);
    StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -5448,7 +5458,7 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
    auto *Ptr = getPointerOperand(I);
  
    // In order to be widened, the pointer should be consecutive, first of all.
-  if (!isConsecutivePtr(Ptr))
+  if (!Legal->isConsecutivePtr(Ptr))
      return false;
  
    // If the instruction is a store located in a predicated block, it will be
@@ -5703,39 +5713,26 @@ bool LoopVectorizationLegality::blockCanBePredicated(
        if (!LI)
          return false;
        if (!SafePtrs.count(LI->getPointerOperand())) {
-        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
-            isLegalMaskedGather(LI->getType())) {
-          MaskedOp.insert(LI);
-          continue;
-        }
          // !llvm.mem.parallel_loop_access implies if-conversion safety.
-        if (IsAnnotatedParallel)
-          continue;
-        return false;
+        // Otherwise, record that the load needs (real or emulated) masking
+        // and let the cost model decide.
+        if (!IsAnnotatedParallel)
+          MaskedOp.insert(LI);
+        continue;
        }
      }
  
      if (I.mayWriteToMemory()) {
        auto *SI = dyn_cast<StoreInst>(&I);
-      // We only support predication of stores in basic blocks with one
-      // predecessor.
        if (!SI)
          return false;
-
-      // Build a masked store if it is legal for the target.
-      if (isLegalMaskedStore(SI->getValueOperand()->getType(),
-                             SI->getPointerOperand()) ||
-          isLegalMaskedScatter(SI->getValueOperand()->getType())) {
-        MaskedOp.insert(SI);
-        continue;
-      }
-
-      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
-      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
-      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
-          !isSinglePredecessor)
-        return false;
+      // Predicated store requires some form of masking:
+      // 1) masked store HW instruction,
+      // 2) emulation via load-blend-store (only if safe and legal to do so,
+      //    beware of race conditions), or
+      // 3) element-by-element predicate check and scalar store.
+      MaskedOp.insert(SI);
+      continue;
      }
      if (I.mayThrow())
        return false;
@@ -6050,13 +6047,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
  }
  
  Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
-  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
-    ORE->emit(createMissedAnalysis("ConditionalStore")
-              << "store that is conditionally executed prevents vectorization");
-    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
-    return None;
-  }
-
    if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
      // TODO: It may by useful to do since it's still likely to be dynamically
      // uniform if the target can skip.
@@ -6183,9 +6173,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
  VectorizationFactor
  LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
    float Cost = expectedCost(1).first;
-#ifndef NDEBUG
    const float ScalarCost = Cost;
-#endif /* NDEBUG */
    unsigned Width = 1;
    DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
  
@@ -6216,6 +6204,14 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
      }
    }
  
+  if (!EnableCondStoresVectorization && NumPredStores) {
+    ORE->emit(createMissedAnalysis("ConditionalStore")
+              << "store that is conditionally executed prevents vectorization");
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    Width = 1;
+    Cost = ScalarCost;
+  }
+
    DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
          << "LV: Vectorization seems to be not beneficial, "
          << "but was forced by a user.\n");
@@ -6267,7 +6263,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
        //        optimization to non-pointer types.
        //
        if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
-          !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+          !Legal->isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
          continue;
  
        MinWidth = std::min(MinWidth,
@@ -6592,6 +6588,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
    return RUs;
  }
  
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+  // TODO: The cost model for emulated masked loads/stores is completely
+  // broken. This hack makes the cost model use an artificially high
+  // value that practically disables vectorization with such operations,
+  // except where the previously deployed legality hack allowed using
+  // very low cost values. This avoids regressions that would come simply
+  // from moving the "masked load/store" check from legality to cost model.
+  // Masked load/gather emulation was previously never allowed.
+  // A limited number of masked store/scatter emulations was allowed.
+  assert(isScalarWithPredication(I) &&
+         "Expecting a scalar emulated instruction");
+  return isa<LoadInst>(I) ||
+         (isa<StoreInst>(I) &&
+          NumPredStores > NumberOfStoresToPredicate);
+}
+
  void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
    // If we aren't vectorizing the loop, or if we've already collected the
    // instructions to scalarize, there's nothing to do. Collection may already
@@ -6612,11 +6624,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
      if (!Legal->blockNeedsPredication(BB))
        continue;
      for (Instruction &I : *BB)
-      if (Legal->isScalarWithPredication(&I)) {
+      if (isScalarWithPredication(&I)) {
          ScalarCostsTy ScalarCosts;
-        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+        // Do not apply discount logic if hacked cost is needed
+        // for emulated masked memrefs.
+        if (!useEmulatedMaskMemRefHack(&I) &&
+            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
            ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
-
          // Remember that BB will remain after vectorization.
          PredicatedBBsAfterVectorization.insert(BB);
        }
@@ -6651,7 +6665,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
  
      // If the instruction is scalar with predication, it will be analyzed
      // separately. We ignore it within the context of PredInst.
-    if (Legal->isScalarWithPredication(I))
+    if (isScalarWithPredication(I))
        return false;
  
      // If any of the instruction's operands are uniform after vectorization,
@@ -6705,7 +6719,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
  
      // Compute the scalarization overhead of needed insertelement instructions
      // and phi nodes.
-    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
        ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                   true, false);
        ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6848,9 +6862,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
    // If we have a predicated store, it may not be executed for each vector
    // lane. Scale the cost by the probability of executing the predicated
    // block.
-  if (Legal->isScalarWithPredication(I))
+  if (isScalarWithPredication(I)) {
      Cost /= getReciprocalPredBlockProb();
  
+    if (useEmulatedMaskMemRefHack(I))
+      // Artificially setting to a high enough value to practically disable
+      // vectorization with such operations.
+      Cost = 3000000;
+  }
+
    return Cost;
  }
  
@@ -6975,6 +6995,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
    if (VF == 1)
      return;
+  NumPredStores = 0;
    for (BasicBlock *BB : TheLoop->blocks()) {
      // For each instruction in the old loop.
      for (Instruction &I : *BB) {
@@ -6982,6 +7003,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
        if (!Ptr)
          continue;
  
+      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+        NumPredStores++;
        if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
          // Scalar load + broadcast
          unsigned Cost = getUniformMemOpCost(&I, VF);
@@ -6990,7 +7013,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
        }
  
        // We assume that widening is the best solution when possible.
-      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+      if (memoryInstructionCanBeWidened(&I, VF)) {
          unsigned Cost = getConsecutiveMemOpCost(&I, VF);
          int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
          assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -7017,7 +7040,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
        }
  
        unsigned GatherScatterCost =
-          Legal->isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I)
                ? getGatherScatterCost(&I, VF) * NumAccesses
                : std::numeric_limits<unsigned>::max();
  
@@ -7178,7 +7201,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
      // vector lane. Get the scalarization cost and scale this amount by the
      // probability of executing the predicated block. If the instruction is not
      // predicated, we fall through to the next case.
-    if (VF > 1 && Legal->isScalarWithPredication(I)) {
+    if (VF > 1 && isScalarWithPredication(I)) {
        unsigned Cost = 0;
  
        // These instructions have a non-void type, so account for the phi nodes
@@ -7799,7 +7822,7 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
  
  bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                            VFRange &Range) {
-  if (Legal->isScalarWithPredication(I))
+  if (CM.isScalarWithPredication(I))
      return false;
  
    auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -7906,7 +7929,7 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
        [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
        Range);
  
-  bool IsPredicated = Legal->isScalarWithPredication(I);
+  bool IsPredicated = CM.isScalarWithPredication(I);
    auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
  
    // Find if I uses a predicated instruction. If so, it will use its scalar
diff --git a/test/Transforms/LoopVectorize/conditional-assignment.ll b/test/Transforms/LoopVectorize/conditional-assignment.ll

index 25d6111..7d1071b 100644 (file)
--- a/test/Transforms/LoopVectorize/conditional-assignment.ll
+++ b/test/Transforms/LoopVectorize/conditional-assignment.ll
@@ -1,7 +1,7 @@
  ; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
  ; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
  
-; CHECK: remark: source.c:2:8: loop not vectorized: store that is conditionally executed prevents vectorization
+; CHECK: remark: source.c:2:8: the cost-model indicates that vectorization is not beneficial
  
  target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
  
diff --git a/test/Transforms/LoopVectorize/hoist-loads.ll b/test/Transforms/LoopVectorize/hoist-loads.ll

index a20b0f6..db4774d 100644 (file)
--- a/test/Transforms/LoopVectorize/hoist-loads.ll
+++ b/test/Transforms/LoopVectorize/hoist-loads.ll
@@ -37,8 +37,9 @@ for.end:
  }
  
  ; However, we can't hoist loads whose address we have not seen unconditionally
-; accessed.
+; accessed. One wide load is fine, but not the second.
  ; CHECK-LABEL: @dont_hoist_cond_load(
+; CHECK: load <2 x float>
  ; CHECK-NOT: load <2 x float>
  
  define void @dont_hoist_cond_load() {
author	Renato Golin <renato.golin@linaro.org>
	Mon, 26 Feb 2018 11:06:36 +0000 (11:06 +0000)
committer	Renato Golin <renato.golin@linaro.org>
	Mon, 26 Feb 2018 11:06:36 +0000 (11:06 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch | blob | history
test/Transforms/LoopVectorize/conditional-assignment.ll		patch | blob | history
test/Transforms/LoopVectorize/hoist-loads.ll		patch | blob | history