From 25961b469a9debe69b915bcb4fa49d35d2ee9544 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 24 Jun 2013 02:52:43 +0000 Subject: [PATCH] SLP Vectorizer: Add support for vectorizing parts of the tree. Until now we detected the vectorizable tree and evaluated the cost of the entire tree. With this patch we can decide to trim out branches of the tree that are not profitable to vectorize. Also, increase the max depth from 6 to 12. In the worst possible case where all of the code is made of a diamond-shaped graph this can bring the cost to 2**10, but diamonds are not very common. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184681 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 30 +++++++++++++--- test/Transforms/SLPVectorizer/X86/long_chains.ll | 46 ++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/long_chains.ll diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5bc3d852e79..d0e3fe2d0b6 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -53,7 +53,7 @@ namespace { static const unsigned MinVecRegSize = 128; -static const unsigned RecursionMaxDepth = 6; +static const unsigned RecursionMaxDepth = 12; /// RAII pattern to save the insertion point of the IR builder. 
class BuilderLocGuard { @@ -605,16 +605,17 @@ int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { if (ScalarTy->isVectorTy()) return FuncSLP::MAX_COST; - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - if (allConstant(VL)) return 0; + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + if (isSplat(VL)) return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + int GatherCost = getGatherCost(VecTy); if (Depth == RecursionMaxDepth || needToGatherAny(VL)) - return getGatherCost(VecTy); + return GatherCost; BasicBlock *BB = getSameBlock(VL); unsigned Opcode = getSameOpcode(VL); @@ -677,6 +678,12 @@ int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); Cost += (VecCost - ScalarCost); + + if (Cost > GatherCost) { + MustGather.insert(VL.begin(), VL.end()); + return GatherCost; + } + return Cost; } case Instruction::FCmp: @@ -739,6 +746,12 @@ int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); } TotalCost += (VecCost - ScalarCost); + + if (TotalCost > GatherCost) { + MustGather.insert(VL.begin(), VL.end()); + return GatherCost; + } + return TotalCost; } case Instruction::Load: { @@ -751,7 +764,14 @@ int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - return VecLdCost - ScalarLdCost; + int TotalCost = VecLdCost - ScalarLdCost; + + if (TotalCost > GatherCost) { + MustGather.insert(VL.begin(), VL.end()); + return GatherCost; + } + + return TotalCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. 
diff --git a/test/Transforms/SLPVectorizer/X86/long_chains.ll b/test/Transforms/SLPVectorizer/X86/long_chains.ll new file mode 100644 index 00000000000..0a2ace3f21f --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK: test +; CHECK: sitofp i8 +; CHECK-NEXT: sitofp i8 +; CHECK-NEXT: insertelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: fmul <2 x double> +; CHECK: ret +define i32 @test(double* nocapture %A, i8* nocapture %B) { +entry: + %0 = load i8* %B, align 1 + %arrayidx1 = getelementptr inbounds i8* %B, i64 1 + %1 = load i8* %arrayidx1, align 1 + %add = add i8 %0, 3 + %add4 = add i8 %1, 3 + %conv6 = sitofp i8 %add to double + %conv7 = sitofp i8 %add4 to double ; <--- This is inefficient. The chain stops here. 
+ %mul = fmul double %conv6, %conv6 + %add8 = fadd double %mul, 1.000000e+00 + %mul9 = fmul double %conv7, %conv7 + %add10 = fadd double %mul9, 1.000000e+00 + %mul11 = fmul double %add8, %add8 + %add12 = fadd double %mul11, 1.000000e+00 + %mul13 = fmul double %add10, %add10 + %add14 = fadd double %mul13, 1.000000e+00 + %mul15 = fmul double %add12, %add12 + %add16 = fadd double %mul15, 1.000000e+00 + %mul17 = fmul double %add14, %add14 + %add18 = fadd double %mul17, 1.000000e+00 + %mul19 = fmul double %add16, %add16 + %add20 = fadd double %mul19, 1.000000e+00 + %mul21 = fmul double %add18, %add18 + %add22 = fadd double %mul21, 1.000000e+00 + %mul23 = fmul double %add20, %add20 + %add24 = fadd double %mul23, 1.000000e+00 + %mul25 = fmul double %add22, %add22 + %add26 = fadd double %mul25, 1.000000e+00 + store double %add24, double* %A, align 8 + %arrayidx28 = getelementptr inbounds double* %A, i64 1 + store double %add26, double* %arrayidx28, align 8 + ret i32 undef +} -- 2.11.0