[AMDGPU] Tune inlining parameters for AMDGPU target

author Daniil Fukalov <daniil.fukalov@amd.com>

Wed, 17 Jul 2019 16:51:29 +0000 (16:51 +0000)

committer Daniil Fukalov <daniil.fukalov@amd.com>

Wed, 17 Jul 2019 16:51:29 +0000 (16:51 +0000)
author Daniil Fukalov <daniil.fukalov@amd.com>
Wed, 17 Jul 2019 16:51:29 +0000 (16:51 +0000)
committer Daniil Fukalov <daniil.fukalov@amd.com>
Wed, 17 Jul 2019 16:51:29 +0000 (16:51 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index af1a12d..7574b81 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -263,6 +263,18 @@ public:
    /// individual classes of instructions would be better.
    unsigned getInliningThresholdMultiplier() const;
  
+  /// \returns Vector bonus in percent.
+  ///
+  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
+  /// and apply this bonus based on the percentage of vector instructions. A
+  /// bonus is applied if the vector instructions exceed 50% and half that amount
+  /// is applied if it exceeds 10%. Note that these bonuses are somewhat
+  /// arbitrary and evolved over time by accident as much as because they are
+  /// principled bonuses.
+  /// FIXME: It would be nice to base the bonus values on something more
+  /// scientific. A target may have no bonus on vector instructions.
+  int getInlinerVectorBonusPercent() const;
+
    /// Estimate the cost of an intrinsic when lowered.
    ///
    /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
@@ -1128,6 +1140,7 @@ public:
    virtual int getCallCost(const Function *F,
                            ArrayRef<const Value *> Arguments, const User *U) = 0;
    virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual int getInlinerVectorBonusPercent() = 0;
    virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Type *> ParamTys, const User *U) = 0;
    virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -1351,6 +1364,9 @@ public:
    unsigned getInliningThresholdMultiplier() override {
      return Impl.getInliningThresholdMultiplier();
    }
+  int getInlinerVectorBonusPercent() override {
+    return Impl.getInlinerVectorBonusPercent();
+  }
    int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                         ArrayRef<Type *> ParamTys, const User *U = nullptr) override {
      return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h

index a9383e7..b99e1eb 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -140,6 +140,8 @@ public:
  
    unsigned getInliningThresholdMultiplier() { return 1; }
  
+  int getInlinerVectorBonusPercent() { return 150; }
+
    unsigned getMemcpyCost(const Instruction *I) {
      return TTI::TCC_Expensive;
    }
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h

index c2d050d..70bf670 100644 (file)
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -427,6 +427,8 @@ public:
  
    unsigned getInliningThresholdMultiplier() { return 1; }
  
+  int getInlinerVectorBonusPercent() { return 150; }
+
    void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                 TTI::UnrollingPreferences &UP) {
      // This unrolling functionality is target independent, but to provide some
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp

index 3cb56f8..0dec146 100644 (file)
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -880,15 +880,6 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
    // basic block at the given callsite context. This is speculatively applied
    // and withdrawn if more than one basic block is seen.
    //
-  // Vector bonuses: We want to more aggressively inline vector-dense kernels
-  // and apply this bonus based on the percentage of vector instructions. A
-  // bonus is applied if the vector instructions exceed 50% and half that amount
-  // is applied if it exceeds 10%. Note that these bonuses are some what
-  // arbitrary and evolved over time by accident as much as because they are
-  // principled bonuses.
-  // FIXME: It would be nice to base the bonus values on something more
-  // scientific.
-  //
    // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
    // of the last call to a static function as inlining such functions is
    // guaranteed to reduce code size.
@@ -896,7 +887,7 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
    // These bonus percentages may be set to 0 based on properties of the caller
    // and the callsite.
    int SingleBBBonusPercent = 50;
-  int VectorBonusPercent = 150;
+  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
  
    // Lambda to set all the above bonus and bonus percentages to 0.
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index 50c5ae9..eb04c34 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -176,6 +176,10 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
    return TTIImpl->getInliningThresholdMultiplier();
  }
  
+int TargetTransformInfo::getInlinerVectorBonusPercent() const {
+  return TTIImpl->getInlinerVectorBonusPercent();
+}
+
  int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
                                      ArrayRef<const Value *> Operands) const {
    return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp

index ec0dd6d..f4df20b 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -39,7 +39,7 @@ using namespace llvm;
  #define DEBUG_TYPE "inline"
  
  static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
                cl::desc("Cost of alloca argument"));
  
  // If the amount of scratch memory to eliminate exceeds our ability to allocate
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

index 72882c8..6f1bf5a 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -191,7 +191,9 @@ public:
    bool areInlineCompatible(const Function *Caller,
                             const Function *Callee) const;
  
-  unsigned getInliningThresholdMultiplier() { return 9; }
+  unsigned getInliningThresholdMultiplier() { return 7; }
+
+  int getInlinerVectorBonusPercent() { return 0; }
  
    int getArithmeticReductionCost(unsigned Opcode,
                                   Type *Ty,
diff --git a/test/CodeGen/AMDGPU/amdgpu-inline.ll b/test/CodeGen/AMDGPU/amdgpu-inline.ll

index 75c16d0..c2f1836 100644 (file)
--- a/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -28,15 +28,8 @@ if.end:                                           ; preds = %if.then, %entry
  define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
  entry:
    %tmp1 = load float, float addrspace(5)* %p1, align 4
-  %cmp = fcmp ogt float %tmp1, 1.000000e+00
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
    %div = fdiv float 2.000000e+00, %tmp1
    store float %div, float addrspace(5)* %p2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
    ret void
  }
  
diff --git a/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll b/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll

new file mode 100644 (file)

index 0000000..cf28d4f
--- /dev/null
+++ b/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
+
+define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
+entry:
+  %div.1 = udiv <16 x i32> %x, %y
+  %div.2 = udiv <16 x i32> %div.1, %y
+  %div.3 = udiv <16 x i32> %div.2, %y
+  %div.4 = udiv <16 x i32> %div.3, %y
+  %div.5 = udiv <16 x i32> %div.4, %y
+  %div.6 = udiv <16 x i32> %div.5, %y
+  %div.7 = udiv <16 x i32> %div.6, %y
+  %div.8 = udiv <16 x i32> %div.7, %y
+  %div.9 = udiv <16 x i32> %div.8, %y
+  %div.10 = udiv <16 x i32> %div.9, %y
+  %div.11 = udiv <16 x i32> %div.10, %y
+  %div.12 = udiv <16 x i32> %div.11, %y
+  ret <16 x i32> %div.12
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @caller_vecbonus
+; CHECK-NOT: udiv
+; CHECK: tail call <16 x i32> @div_vecbonus
+; CHECK: ret void
+define amdgpu_kernel void @caller_vecbonus(<16 x i32> addrspace(1)* nocapture %x, <16 x i32> addrspace(1)* nocapture readonly %y) {
+entry:
+  %tmp = load <16 x i32>, <16 x i32> addrspace(1)* %x
+  %tmp1 = load <16 x i32>, <16 x i32> addrspace(1)* %y
+  %div.i = tail call <16 x i32> @div_vecbonus(<16 x i32> %tmp, <16 x i32> %tmp1)
+  store <16 x i32> %div.i, <16 x i32> addrspace(1)* %x
+  ret void
+}
author	Daniil Fukalov <daniil.fukalov@amd.com>
	Wed, 17 Jul 2019 16:51:29 +0000 (16:51 +0000)
committer	Daniil Fukalov <daniil.fukalov@amd.com>
	Wed, 17 Jul 2019 16:51:29 +0000 (16:51 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
include/llvm/CodeGen/BasicTTIImpl.h		patch \| blob \| history
lib/Analysis/InlineCost.cpp		patch \| blob \| history
lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUInline.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h		patch \| blob \| history
test/CodeGen/AMDGPU/amdgpu-inline.ll		patch \| blob \| history
test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll	[new file with mode: 0644]	patch \| blob