[LV] Suppress vectorization in some nontemporal cases

author Warren Ristow <warren.ristow@sony.com>

Mon, 17 Jun 2019 17:20:08 +0000 (17:20 +0000)

committer Warren Ristow <warren.ristow@sony.com>

Mon, 17 Jun 2019 17:20:08 +0000 (17:20 +0000)
author Warren Ristow <warren.ristow@sony.com>
Mon, 17 Jun 2019 17:20:08 +0000 (17:20 +0000)
committer Warren Ristow <warren.ristow@sony.com>
Mon, 17 Jun 2019 17:20:08 +0000 (17:20 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index 52d4cb7..f53b17d 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -531,6 +531,11 @@ public:
    /// Return true if the target supports masked store.
    bool isLegalMaskedLoad(Type *DataType) const;
  
+  /// Return true if the target supports nontemporal store.
+  bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
+  /// Return true if the target supports nontemporal load.
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
+
    /// Return true if the target supports masked scatter.
    bool isLegalMaskedScatter(Type *DataType) const;
    /// Return true if the target supports masked gather.
@@ -1118,6 +1123,8 @@ public:
    virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
    virtual bool isLegalMaskedStore(Type *DataType) = 0;
    virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+  virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0;
+  virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0;
    virtual bool isLegalMaskedScatter(Type *DataType) = 0;
    virtual bool isLegalMaskedGather(Type *DataType) = 0;
    virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
@@ -1373,6 +1380,12 @@ public:
    bool isLegalMaskedLoad(Type *DataType) override {
      return Impl.isLegalMaskedLoad(DataType);
    }
+  bool isLegalNTStore(Type *DataType, unsigned Alignment) override {
+    return Impl.isLegalNTStore(DataType, Alignment);
+  }
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment) override {
+    return Impl.isLegalNTLoad(DataType, Alignment);
+  }
    bool isLegalMaskedScatter(Type *DataType) override {
      return Impl.isLegalMaskedScatter(DataType);
    }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h

index 2527495..f8b36ec 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -229,6 +229,20 @@ public:
  
    bool isLegalMaskedLoad(Type *DataType) { return false; }
  
+  bool isLegalNTStore(Type *DataType, unsigned Alignment) {
+    // By default, assume nontemporal memory stores are available for stores
+    // that are aligned and have a size that is a power of 2.
+    unsigned DataSize = DL.getTypeStoreSize(DataType);
+    return Alignment >= DataSize && isPowerOf2_32(DataSize);
+  }
+
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment) {
+    // By default, assume nontemporal memory loads are available for loads that
+    // are aligned and have a size that is a power of 2.
+    unsigned DataSize = DL.getTypeStoreSize(DataType);
+    return Alignment >= DataSize && isPowerOf2_32(DataSize);
+  }
+
    bool isLegalMaskedScatter(Type *DataType) { return false; }
  
    bool isLegalMaskedGather(Type *DataType) { return false; }
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

index 4089bfa..b144006 100644 (file)
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -205,12 +205,13 @@ class LoopVectorizationLegality {
  public:
    LoopVectorizationLegality(
        Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
-      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
-      std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
-      OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
-      LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
-      : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
-        ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+      TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AliasAnalysis *AA,
+      Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
+      LoopInfo *LI, OptimizationRemarkEmitter *ORE,
+      LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
+      AssumptionCache *AC)
+      : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
+        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
  
    /// ReductionList contains the reduction descriptors for all
    /// of the reductions that were found in the loop.
@@ -402,6 +403,9 @@ private:
    /// unrolling.
    PredicatedScalarEvolution &PSE;
  
+  /// Target Transform Info.
+  TargetTransformInfo *TTI;
+
    /// Target Library Info.
    TargetLibraryInfo *TLI;
  
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index 763b684..a55c1be 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -183,6 +183,16 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
    return TTIImpl->isLegalMaskedLoad(DataType);
  }
  
+bool TargetTransformInfo::isLegalNTStore(Type *DataType,
+                                         unsigned Alignment) const {
+  return TTIImpl->isLegalNTStore(DataType, Alignment);
+}
+
+bool TargetTransformInfo::isLegalNTLoad(Type *DataType,
+                                        unsigned Alignment) const {
+  return TTIImpl->isLegalNTLoad(DataType, Alignment);
+}
+
  bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
    return TTIImpl->isLegalMaskedGather(DataType);
  }
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index 2b9a61d..08e46ed 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3143,6 +3143,41 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
    return isLegalMaskedLoad(DataType);
  }
  
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+  unsigned DataSize = DL.getTypeStoreSize(DataType);
+  // The only supported nontemporal loads are for aligned vectors of 16 or 32
+  // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
+  // (the equivalent stores only require AVX).
+  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+    return DataSize == 16 ?  ST->hasSSE1() : ST->hasAVX2();
+
+  return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+  unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+  // SSE4A supports nontemporal stores of float and double at arbitrary
+  // alignment.
+  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+    return true;
+
+  // Besides the SSE4A subtarget exception above, only aligned stores are
+  // available nontemporaly on any other subtarget.  And only stores with a size
+  // of 4..32 bytes (powers of 2, only) are permitted.
+  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+      !isPowerOf2_32(DataSize))
+    return false;
+
+  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+  // loads require AVX2).
+  if (DataSize == 32)
+    return ST->hasAVX();
+  else if (DataSize == 16)
+    return ST->hasSSE1();
+  return true;
+}
+
  bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
    if (!isa<VectorType>(DataTy))
      return false;
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h

index 351a4f2..f43155e 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -186,6 +186,8 @@ public:
    bool canMacroFuseCmp();
    bool isLegalMaskedLoad(Type *DataType);
    bool isLegalMaskedStore(Type *DataType);
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+  bool isLegalNTStore(Type *DataType, unsigned Alignment);
    bool isLegalMaskedGather(Type *DataType);
    bool isLegalMaskedScatter(Type *DataType);
    bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

index e5713c4..6ef8dc2 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -767,6 +767,38 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
            return false;
          }
  
+        // For nontemporal stores, check that a nontemporal vector version is
+        // supported on the target.
+        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+          // Arbitrarily try a vector of 2 elements.
+          Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+          assert(VecTy && "did not find vectorized version of stored type");
+          unsigned Alignment = getLoadStoreAlignment(ST);
+          if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+            reportVectorizationFailure(
+                "nontemporal store instruction cannot be vectorized",
+                "nontemporal store instruction cannot be vectorized",
+                "CantVectorizeNontemporalStore", ST);
+            return false;
+          }
+        }
+
+      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+          // For nontemporal loads, check that a nontemporal vector version is
+          // supported on the target (arbitrarily try a vector of 2 elements).
+          Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+          assert(VecTy && "did not find vectorized version of load type");
+          unsigned Alignment = getLoadStoreAlignment(LD);
+          if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+            reportVectorizationFailure(
+                "nontemporal load instruction cannot be vectorized",
+                "nontemporal load instruction cannot be vectorized",
+                "CantVectorizeNontemporalLoad", LD);
+            return false;
+          }
+        }
+
          // FP instructions can allow unsafe algebra, thus vectorizable by
          // non-IEEE-754 compliant SIMD units.
          // This applies to floating-point math operations and calls, not memory
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 01104f6..7b3b9dd 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7275,7 +7275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  
    // Check if it is legal to vectorize the loop.
    LoopVectorizationRequirements Requirements(*ORE);
-  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                  &Requirements, &Hints, DB, AC);
    if (!LVL.canVectorize(EnableVPlanNativePath)) {
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
diff --git a/test/Transforms/LoopVectorize/X86/nontemporal.ll b/test/Transforms/LoopVectorize/X86/nontemporal.ll

new file mode 100644 (file)

index 0000000..c83ca29
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/nontemporal.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; The three test-cases below are all based on modified versions of a simple copy-loop:
+;
+; void foo(unsigned *src, unsigned *dst, unsigned nElts) {
+;   for (unsigned i = 0; i < nElts; ++i) {
+;     unsigned tmp = src[i];
+;     dst[i] = tmp;
+;   }
+; }
+;
+; In the first version, there are no nontemporal stores or loads, and so vectorization
+; is safely done.
+;
+; In the second version, the store into dst[i] has the nontemporal hint.  The alignment
+; on X86_64 for 'unsigned' is 4, so the vector store generally will not be aligned to the
+; vector size (of 16 here).  Unaligned nontemporal vector stores are not supported on X86_64,
+; and so the vectorization is suppressed (because when vectorizing it, the nontemoral hint
+; would not be honored in the final code-gen).
+;
+; The third version is analogous to the second, except rather than the store, it is the
+; load from 'src[i]' that has the nontemporal hint.  Vectorization is suppressed in this
+; case because (like stores) unaligned nontemoral vector loads are not supported on X86_64.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64"
+
+; CHECK-LABEL: @vectorTest(
+define void @vectorTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+  %cmp8 = icmp eq i32 %nElts, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %nElts to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; Check that we vectorized the load, and that there is no nontemporal hint.
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+; Check that we vectorized the store, and that there is no nontemporal hint.
+; CHECK: store <4 x i32> %wide.load, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTStoreTest(
+; Check that the vectorized type of the store does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTStoreTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+  %cmp8 = icmp eq i32 %nElts, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %nElts to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+; Check that the store is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx2, align 4, !nontemporal !4
+  store i32 %0, i32* %arrayidx2, align 4, !nontemporal !0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTLoadTest(
+; Check that the vectorized type of the load does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTLoadTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+  %cmp8 = icmp eq i32 %nElts, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %nElts to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+; Check that the load is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: load i32, i32* %arrayidx, align 4, !nontemporal !4
+  %0 = load i32, i32* %arrayidx, align 4, !nontemporal !0
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+!0 = !{i32 1}
diff --git a/test/Transforms/LoopVectorize/nontemporal.ll b/test/Transforms/LoopVectorize/nontemporal.ll

index b5719e1..5df8b83 100644 (file)
--- a/test/Transforms/LoopVectorize/nontemporal.ll
+++ b/test/Transforms/LoopVectorize/nontemporal.ll
@@ -14,19 +14,19 @@ for.body.preheader:                               ; preds = %entry
  for.body:                                         ; preds = %for.body.preheader, %for.body
    %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  
-; Check that we don't lose !nontemporal hint when vectorizing loads.
-; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose !nontemporal hint when attempting vectorizing of loads.
+; CHECK: load {{.*}} align 4, !nontemporal !0
    %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
    %0 = load float, float* %arrayidx, align 4, !nontemporal !0
  
  ; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it.
-; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
+; CHECK: load {{.*}} align 4{{$}}
    %arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv
    %1 = load float, float* %arrayidx2, align 4
    %add = fadd float %0, %1
  
-; Check that we don't lose !nontemporal hint when vectorizing stores.
-; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose !nontemporal hint when attempting vectorizing of stores.
+; CHECK: store {{.*}} align 4, !nontemporal !0
    %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
    store float %add, float* %arrayidx4, align 4, !nontemporal !0
author	Warren Ristow <warren.ristow@sony.com>
	Mon, 17 Jun 2019 17:20:08 +0000 (17:20 +0000)
committer	Warren Ristow <warren.ristow@sony.com>
	Mon, 17 Jun 2019 17:20:08 +0000 (17:20 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h		patch \| blob \| history
lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
lib/Target/X86/X86TargetTransformInfo.h		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorizationLegality.cpp		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/X86/nontemporal.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/LoopVectorize/nontemporal.ll		patch \| blob \| history