[X86][CM] update add\sub costs of vectors of 64 in X86\SLM arch

author Mohammed Agabaria <mohammed.agabaria@intel.com>

Sun, 2 Jul 2017 12:16:15 +0000 (12:16 +0000)

committer Mohammed Agabaria <mohammed.agabaria@intel.com>

Sun, 2 Jul 2017 12:16:15 +0000 (12:16 +0000)
author Mohammed Agabaria <mohammed.agabaria@intel.com>
Sun, 2 Jul 2017 12:16:15 +0000 (12:16 +0000)
committer Mohammed Agabaria <mohammed.agabaria@intel.com>
Sun, 2 Jul 2017 12:16:15 +0000 (12:16 +0000)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index 5ba8534..c9924f2 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -142,10 +142,15 @@ int X86TTIImpl::getArithmeticInstrCost(
      { ISD::FDIV, MVT::v2f64, 69 }, // divpd
      { ISD::FADD, MVT::v2f64, 2  }, // addpd
      { ISD::FSUB, MVT::v2f64, 2  }, // subpd
-    // v2i64/v4i64 mul is custom lowered as a series of long
-    // multiplies(3), shifts(3) and adds(2).
-    // slm muldq version throughput is 2
-    { ISD::MUL,  MVT::v2i64, 11 },
+    // v2i64/v4i64 mul is custom lowered as a series of long:
+    // multiplies(3), shifts(3) and adds(2)
+    // slm muldq version throughput is 2 and addq throughput 4 
+    // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
+    //       3X4 (addq throughput) = 17 
+    { ISD::MUL,  MVT::v2i64, 17 },
+    // slm addq\subq throughput is 4
+    { ISD::ADD,  MVT::v2i64, 4  },
+    { ISD::SUB,  MVT::v2i64, 4  },
    };
  
    if (ST->isSLM()) {
diff --git a/test/Analysis/CostModel/X86/slm-arith-costs.ll b/test/Analysis/CostModel/X86/slm-arith-costs.ll

index 3673a5d..a767aa3 100644 (file)
--- a/test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ b/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -3,6 +3,20 @@
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
  
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM:  cost of 4 {{.*}} add <2 x i64>
+  %res = add <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM:  cost of 4 {{.*}} sub <2 x i64>
+  %res = sub <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
  ; 8bit mul
  define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b)  {
  entry:
@@ -13,7 +27,7 @@ entry:
  
  define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b)  {
  entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i8>
    %res = mul nsw <2 x i8> %a, %b
    ret <2 x i8> %res
  }
@@ -97,7 +111,7 @@ entry:
  
  define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b)  {
  entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i16>
    %res = mul nsw <2 x i16> %a, %b
    ret <2 x i16> %res
  }
@@ -181,7 +195,7 @@ entry:
  
  define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b)  {
  entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i32>
    %res = mul nsw <2 x i32> %a, %b
    ret <2 x i32> %res
  }
@@ -217,28 +231,28 @@ entry:
  
  define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b)  {
  entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i64>
    %res = mul nsw <2 x i64> %a, %b
    ret <2 x i64> %res
  }
  
  define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b)  {
  entry:
-; SLM:  cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM:  cost of 34 {{.*}} mul nsw <4 x i64>
    %res = mul nsw <4 x i64> %a, %b
    ret <4 x i64> %res
  }
  
  define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b)  {
  entry:
-; SLM:  cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM:  cost of 68 {{.*}} mul nsw <8 x i64>
    %res = mul nsw <8 x i64> %a, %b
    ret <8 x i64> %res
  }
  
  define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b)  {
  entry:
-; SLM:  cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM:  cost of 136 {{.*}} mul nsw <16 x i64>
    %res = mul nsw <16 x i64> %a, %b
    ret <16 x i64> %res
  }
diff --git a/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll

new file mode 100644 (file)

index 0000000..8be9f1d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; This test should not be vectorized in X86\SLM arch
+; Vectorizing the 64bit multiply in this case is wrong since
+; it can be done with a lower bit mode (notice that the sources is 16bit)
+; Also addq\subq (quad word) has a high cost on SLM arch.
+; this test has a bad performance (regression of -70%) if vectorized on SLM arch
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1. 
+  %cmp17 = icmp sgt i32 %LastIndex, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv5 = sext i16 %Scale to i64
+  %sh_prom = and i64 %conv5, 4294967295
+  %0 = sext i16 %lag to i64
+  %wide.trip.count = zext i32 %LastIndex to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %conv8 = trunc i64 %add7 to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+  ret i32 %Accumulator.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i64
+  %2 = add nsw i64 %indvars.iv, %0
+  %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+  %3 = load i16, i16* %arrayidx3, align 2 
+  %conv4 = sext i16 %3 to i64
+  %mul = mul nsw i64 %conv4, %conv
+  %shr = ashr i64 %mul, %sh_prom
+  %add7 = add i64 %shr, %Accumulator.018 
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
author	Mohammed Agabaria <mohammed.agabaria@intel.com>
	Sun, 2 Jul 2017 12:16:15 +0000 (12:16 +0000)
committer	Mohammed Agabaria <mohammed.agabaria@intel.com>
	Sun, 2 Jul 2017 12:16:15 +0000 (12:16 +0000)
lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
test/Analysis/CostModel/X86/slm-arith-costs.ll		patch \| blob \| history
test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll	[new file with mode: 0644]	patch \| blob