[CostModel][X86] Match 256-bit vector shift 'splat' costs for AVX2 and above

author Simon Pilgrim <llvm-dev@redking.me.uk>

Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index 29cd8ed..829b47b 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -263,7 +263,7 @@ int X86TTIImpl::getArithmeticInstrCost(
      if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  
-  static const CostTblEntry AVX2CostTable[] = {
+  static const CostTblEntry AVX2ShiftCostTable[] = {
      // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
      // customize them to detect the cases where shift amount is a scalar one.
      { ISD::SHL,     MVT::v4i32,    1 },
@@ -287,11 +287,11 @@ int X86TTIImpl::getArithmeticInstrCost(
        // is lowered into a vector multiply (vpmullw).
        return LT.first;
  
-    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
    }
  
-  static const CostTblEntry XOPCostTable[] = {
+  static const CostTblEntry XOPShiftCostTable[] = {
      // 128bit shifts take 1cy, but right shifts require negation beforehand.
      { ISD::SHL,     MVT::v16i8,    1 },
      { ISD::SRL,     MVT::v16i8,    2 },
@@ -322,48 +322,7 @@ int X86TTIImpl::getArithmeticInstrCost(
  
    // Look for XOP lowering tricks.
    if (ST->hasXOP())
-    if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
-      return LT.first * Entry->Cost;
-
-  static const CostTblEntry AVX2CustomCostTable[] = {
-    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
-    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
-
-    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
-    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
-
-    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
-    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
-    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
-    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.
-
-    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
-    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
-    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
-    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
-    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
-    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
-    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
-    { ISD::ADD,  MVT::v4i64,      1 }, // paddq
-
-    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
-    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
-    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add
-
-    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
-  };
-
-  // Look for AVX2 lowering tricks for custom cases.
-  if (ST->hasAVX2())
-    if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
-                                            LT.second))
+    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  
    static const CostTblEntry
@@ -415,6 +374,46 @@ int X86TTIImpl::getArithmeticInstrCost(
        ISD = ISD::MUL;
    }
  
+  static const CostTblEntry AVX2CostTable[] = {
+    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
+    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
+
+    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
+    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
+
+    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
+    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
+    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
+    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.
+
+    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
+    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
+    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
+    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
+    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
+    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
+    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
+    { ISD::ADD,  MVT::v4i64,      1 }, // paddq
+
+    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
+    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
+    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add
+
+    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
+  };
+
+  // Look for AVX2 lowering tricks for custom cases.
+  if (ST->hasAVX2())
+    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+
    static const CostTblEntry AVX1CostTable[] = {
      // We don't have to scalarize unsupported ops. We can issue two half-sized
      // operations and we only need to extract the upper YMM half.
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll

index a2a3d04..ab1eb73 100644 (file)
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -578,8 +578,8 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 10 for instruction:   %shift
-; AVX512: Found an estimated cost of 10 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
    ret <16 x i16> %shift
@@ -590,8 +590,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 20 for instruction:   %shift
-; AVX512F: Found an estimated cost of 20 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 1 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -615,8 +615,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
  ; SSE2: Found an estimated cost of 8 for instruction:   %shift
  ; SSE41: Found an estimated cost of 8 for instruction:   %shift
  ; AVX: Found an estimated cost of 8 for instruction:   %shift
-; AVX2: Found an estimated cost of 24 for instruction:   %shift
-; AVX512: Found an estimated cost of 24 for instruction:   %shift
+; AVX2: Found an estimated cost of 8 for instruction:   %shift
+; AVX512: Found an estimated cost of 8 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
    ret <32 x i8> %shift
@@ -627,8 +627,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
  ; SSE2: Found an estimated cost of 16 for instruction:   %shift
  ; SSE41: Found an estimated cost of 16 for instruction:   %shift
  ; AVX: Found an estimated cost of 16 for instruction:   %shift
-; AVX2: Found an estimated cost of 48 for instruction:   %shift
-; AVX512F: Found an estimated cost of 48 for instruction:   %shift
+; AVX2: Found an estimated cost of 16 for instruction:   %shift
+; AVX512F: Found an estimated cost of 16 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll

index c4eaef0..fea7271 100644 (file)
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -589,8 +589,8 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 10 for instruction:   %shift
-; AVX512: Found an estimated cost of 10 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
    ret <16 x i16> %shift
@@ -601,8 +601,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 20 for instruction:   %shift
-; AVX512F: Found an estimated cost of 20 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 1 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -626,8 +626,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 11 for instruction:   %shift
-; AVX512: Found an estimated cost of 11 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
    ret <32 x i8> %shift
@@ -638,8 +638,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 22 for instruction:   %shift
-; AVX512F: Found an estimated cost of 22 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll

index 5bf4321..7090ae4 100644 (file)
--- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -631,8 +631,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 11 for instruction:   %shift
-; AVX512: Found an estimated cost of 11 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 2 for instruction:   %shift
    %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
    ret <32 x i8> %shift
@@ -643,8 +643,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 22 for instruction:   %shift
-; AVX512F: Found an estimated cost of 22 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
test/Analysis/CostModel/X86/vshift-ashr-cost.ll		patch \| blob \| history
test/Analysis/CostModel/X86/vshift-lshr-cost.ll		patch \| blob \| history
test/Analysis/CostModel/X86/vshift-shl-cost.ll		patch \| blob \| history