From: Florian Hahn <flo@fhahn.com>
Date: Wed, 10 Apr 2019 08:17:28 +0000 (+0000)
Subject: [VPLAN] Minor improvement to testing and debug messages.
X-Git-Tag: android-x86-9.0-r1~4869
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=9928b761ce48e542888060ff81baacb636762293;p=android-x86%2Fexternal-llvm.git

[VPLAN] Minor improvement to testing and debug messages.

1. Use computed VF for stress testing.
2. If the computed VF does not produce vector code (VF smaller than 2), force VF to be 4.
3. Test vectorization of i64 data on AArch64 to make sure the computed VF is not always 4 (the new test expects VF = 2; on X86 this was already tested with AVX).

Patch by Francesco Petrogalli <francesco.petrogalli@arm.com>

Differential Revision: https://reviews.llvm.org/D59952

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358056 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7f16d96e71c..5612e956794 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6117,17 +6117,20 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
     // If the user doesn't provide a vectorization factor, determine a
     // reasonable one.
     if (!UserVF) {
-      // We set VF to 4 for stress testing.
-      if (VPlanBuildStressTest)
+      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+      // Make sure we have a VF > 1 for stress testing.
+      if (VPlanBuildStressTest && VF < 2) {
+        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+                          << "overriding computed VF.\n");
         VF = 4;
-      else
-        VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+      }
     }
-
     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
-    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user VF " : "computed VF ")
-                      << VF << " to build VPlans.\n");
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
+                      << " to build VPlans.\n");
     buildVPlans(VF, VF);
 
     // For VPlan build stress testing, we bail out after VPlan construction.
diff --git a/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
index d3c250b97c8..aa8478b0b6a 100644
--- a/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
+++ b/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
@@ -16,6 +16,7 @@
 ; }
 ;
 
+; CHECK-LABEL: @foo_i32(
 ; CHECK-LABEL: vector.ph:
 ; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
 ; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
@@ -48,8 +49,11 @@
 @arr2 = external global [8 x i32], align 16
 @arr = external global [8 x [8 x i32]], align 16
 
+@arrX = external global [8 x i64], align 16
+@arrY = external global [8 x [8 x i64]], align 16
+
 ; Function Attrs: norecurse nounwind uwtable
-define void @foo(i32 %n) {
+define void @foo_i32(i32 %n) {
 entry:
   br label %for.body
 
@@ -79,5 +83,62 @@ for.end10:                                        ; preds = %for.inc8
   ret void
 }
 
+; CHECK-LABEL: @foo_i64(
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <2 x i64> undef, i64 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <2 x i64> %[[SplatVal]], <2 x i64> undef, <2 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, <2 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[VecInd]], <2 x i64*> %[[AAddr]], i32 4, <2 x i1> <i1 true, i1 true>)
+; CHECK: %[[StoreVal:.*]] = add nsw <2 x i64> %[[VecInd]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[StoreVal]], <2 x i64*> %[[AAddr2]], i32 4, <2 x i1> <i1 true, i1 true>
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]], <i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <2 x i64> %[[InnerPhiNext]], <i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <2 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 2
+; CHECK: %[[VecIndNext]] = add <2 x i64> %[[VecInd]], <i64 2, i64 2>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+; Function Attrs: norecurse nounwind uwtable
+define void @foo_i64(i64 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, i64 %indvars.iv21
+  store i64 %indvars.iv21, i64* %arrayidx, align 4
+  %add = add nsw i64 %indvars.iv21, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i64 %add, i64* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
 !1 = distinct !{!1, !2}
 !2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/test/Transforms/LoopVectorize/explicit_outer_detection.ll
index c46d7b09735..33527b307da 100644
--- a/test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ b/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -73,7 +73,7 @@ for.end15:                                        ; preds = %outer.inc, %entry
 ; CHECK-LABEL: case2
 ; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
 ; CHECK: LV: We can vectorize this outer loop!
-; CHECK: LV: Using computed VF 1 to build VPlans.
+; CHECK: LV: Using VF 1 to build VPlans.
 
 define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
 entry:
diff --git a/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll b/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
new file mode 100644
index 00000000000..d8d42caa3d9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s  -S -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1  | FileCheck %s
+
+; This test checks that, when stress testing VPlan without a user-provided VF,
+; a computed VF of 1 (no vector code) is overridden with VF = 4.
+
+; CHECK: LV: VPlan computed VF 1.
+; CHECK: LV: VPlan stress testing: overriding computed VF.
+; CHECK: LV: Using VF 4 to build VPlans.
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}