On modern Intel processors, the hardware SQRT instruction is in many cases
faster than an RSQRT estimate followed by Newton-Raphson refinement. This patch
introduces a simple heuristic to choose between the hardware SQRT instruction
and the Newton-Raphson software estimation sequence.
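For reference, the NR sequence being compared against computes roughly the
following (a hand-written C sketch of the idea, not the compiler's exact
expansion); the refinement step doubles the ~12-bit accuracy of RSQRTSS to
roughly the 24 bits of a float:

  #include <immintrin.h>

  /* Sketch: approximate sqrt(x) as x * (1/sqrt(x)), where 1/sqrt(x) is an
     RSQRTSS estimate improved by one Newton-Raphson step.  Note that x == 0
     yields 0 * Inf == NaN here, which is one reason this rewrite is only
     performed under unsafe-fp-math. */
  static float sqrt_via_rsqrt_nr(float x) {
    __m128 X   = _mm_set_ss(x);
    __m128 Est = _mm_rsqrt_ss(X);          /* ~12-bit estimate of 1/sqrt(x) */
    /* One NR step: Est = Est * (1.5 - 0.5 * x * Est * Est). */
    __m128 E2  = _mm_mul_ss(Est, Est);
    Est = _mm_mul_ss(Est, _mm_sub_ss(_mm_set_ss(1.5f),
              _mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), X), E2)));
    /* sqrt(x) == x * (1/sqrt(x)). */
    return _mm_cvtss_f32(_mm_mul_ss(X, Est));
  }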
The patch treats scalars and vectors differently. The heuristic is that for
scalars the compiler should optimize for latency, while for vectors it should
optimize for throughput. This is based on the assumption that throughput-bound
code is likely to be vectorized, as the sketch below illustrates.
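As a hypothetical illustration of that assumption: a dependent chain of scalar
square roots is latency-bound, while a loop over independent elements (the
vectorizable case) is throughput-bound:

  #include <math.h>

  /* Latency-bound: each sqrtf depends on the previous result, so the
     chain's cost is dominated by per-operation latency. */
  float dep_chain(float x, int n) {
    for (int i = 0; i < n; ++i)
      x = sqrtf(x + 1.0f);
    return x;
  }

  /* Throughput-bound: iterations are independent, so this loop is likely
     to be vectorized and its cost is dominated by sqrt throughput. */
  void independent(float *a, int n) {
    for (int i = 0; i < n; ++i)
      a[i] = sqrtf(a[i]);
  }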
Concretely, the patch disables scalar NR for big cores and disables NR
completely for Skylake. First, scalar SQRT has shorter latency than the NR code
on big cores. Second, vector SQRT has been greatly improved in Skylake and has
better throughput than the NR code.
Differential Revision: https://reviews.llvm.org/D21379
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277725 91177308-0d34-0410-b5e6-96231b3b80d8
return true;
}
- /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x)
- bool isFsqrtCheap() const {
- return FsqrtIsCheap;
+ /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
+ virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
+ // Default behavior is to replace SQRT(X) with X*RSQRT(X).
+ return false;
}
/// Returns true if target has indicated at least one type should be bypassed.
/// control.
void setJumpIsExpensive(bool isExpensive = true);
- /// Tells the code generator that fsqrt is cheap, and should not be replaced
- /// with an alternative sequence of instructions.
- void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
-
/// Tells the code generator that this target supports floating point
/// exceptions and cares about preserving floating point exception behavior.
void setHasFloatingPointExceptions(bool FPExceptions = true) {
/// combined with "shift" to BitExtract instructions.
bool HasExtractBitsInsn;
- // Don't expand fsqrt with an approximation based on the inverse sqrt.
- bool FsqrtIsCheap;
-
/// Tells the code generator to bypass slow divide or remainder
/// instructions. For example, BypassSlowDivWidths[32,8] tells the code
/// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
- if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+ if (!DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (TLI.isFsqrtCheap(N0, DAG))
return SDValue();
// TODO: FSQRT nodes should have flags that propagate to the created nodes.
// For now, create a Flags object for use with all unsafe math transforms.
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
- return buildSqrtEstimate(N->getOperand(0), &Flags);
+ return buildSqrtEstimate(N0, &Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
SelectIsExpensive = false;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
- FsqrtIsCheap = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
MaskAndBranchFoldingIsLegal = false;
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
const char* getTargetNodeName(unsigned Opcode) const override;
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+ return true;
+ }
SDValue getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
def FeatureFastPartialYMMWrite
: SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
"true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput-bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar, that probably means it has some kind of
+// dependency and we should care more about reducing latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
]>;
// FIXME: define SKL model
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMWrite = false;
+ HasFastScalarFSQRT = false;
+ HasFastVectorFSQRT = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
/// of a YMM register without clearing the upper part.
bool HasFastPartialYMMWrite;
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC
+
+declare float @llvm.sqrt.f32(float) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
+
+define float @foo_x1(float %f) #0 {
+; SCALAR-EST-LABEL: foo_x1:
+; SCALAR-EST: # BB#0:
+; SCALAR-EST-NEXT: rsqrtss %xmm0
+; SCALAR-EST: retq
+;
+; SCALAR-ACC-LABEL: foo_x1:
+; SCALAR-ACC: # BB#0:
+; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}}
+; SCALAR-ACC-NEXT: retq
+ %call = tail call float @llvm.sqrt.f32(float %f) #1
+ ret float %call
+}
+
+define <4 x float> @foo_x4(<4 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x4:
+; VECTOR-EST: # BB#0:
+; VECTOR-EST-NEXT: rsqrtps %xmm0
+; VECTOR-EST: retq
+;
+; VECTOR-ACC-LABEL: foo_x4:
+; VECTOR-ACC: # BB#0:
+; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}}
+; VECTOR-ACC-NEXT: retq
+ %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
+ ret <4 x float> %call
+}
+
+define <8 x float> @foo_x8(<8 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x8:
+; VECTOR-EST: # BB#0:
+; VECTOR-EST-NEXT: rsqrtps
+; VECTOR-EST: retq
+;
+; VECTOR-ACC-LABEL: foo_x8:
+; VECTOR-ACC: # BB#0:
+; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}}
+; VECTOR-ACC-NOT: rsqrt
+; VECTOR-ACC: retq
+ %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1
+ ret <8 x float> %call
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }