On modern Intel processors, the hardware SQRT instruction is in many cases
faster than an RSQRT estimate followed by Newton-Raphson refinement. This patch
introduces a simple heuristic to choose between the hardware SQRT instruction
and the Newton-Raphson software estimation sequence.
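For reference, the NR sequence being compared against computes roughly the
following (a hand-written C sketch of the idea, not the compiler's exact
expansion); the refinement step doubles the ~12-bit accuracy of RSQRTSS to
roughly the 24 bits of a float:

  #include <immintrin.h>

  /* Sketch: approximate sqrt(x) as x * (1/sqrt(x)), where 1/sqrt(x) is an
     RSQRTSS estimate improved by one Newton-Raphson step.  Note that x == 0
     yields 0 * Inf == NaN here, which is one reason this rewrite is only
     performed under unsafe-fp-math. */
  static float sqrt_via_rsqrt_nr(float x) {
    __m128 X   = _mm_set_ss(x);
    __m128 Est = _mm_rsqrt_ss(X);          /* ~12-bit estimate of 1/sqrt(x) */
    /* One NR step: Est = Est * (1.5 - 0.5 * x * Est * Est). */
    __m128 E2  = _mm_mul_ss(Est, Est);
    Est = _mm_mul_ss(Est, _mm_sub_ss(_mm_set_ss(1.5f),
              _mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), X), E2)));
    /* sqrt(x) == x * (1/sqrt(x)). */
    return _mm_cvtss_f32(_mm_mul_ss(X, Est));
  }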
The patch treats scalars and vectors differently. The heuristic is that for
scalars the compiler should optimize for latency, while for vectors it should
optimize for throughput. This is based on the assumption that throughput-bound
code is likely to be vectorized, as the sketch below illustrates.
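As a hypothetical illustration of that assumption: a dependent chain of scalar
square roots is latency-bound, while a loop over independent elements (the
vectorizable case) is throughput-bound:

  #include <math.h>

  /* Latency-bound: each sqrtf depends on the previous result, so the
     chain's cost is dominated by per-operation latency. */
  float dep_chain(float x, int n) {
    for (int i = 0; i < n; ++i)
      x = sqrtf(x + 1.0f);
    return x;
  }

  /* Throughput-bound: iterations are independent, so this loop is likely
     to be vectorized and its cost is dominated by sqrt throughput. */
  void independent(float *a, int n) {
    for (int i = 0; i < n; ++i)
      a[i] = sqrtf(a[i]);
  }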
Concretely, the patch disables scalar NR for big cores and disables NR
completely for Skylake. First, scalar SQRT has shorter latency than the NR code
on big cores. Second, vector SQRT has been greatly improved in Skylake and has
better throughput than the NR code.
Differential Revision: https://reviews.llvm.org/D21379
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277725 91177308-0d34-0410-b5e6-96231b3b80d8
return true;
}
- /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x)
- bool isFsqrtCheap() const {
- return FsqrtIsCheap;
+ /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
+ virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
+ // Default behavior is to replace SQRT(X) with X*RSQRT(X).
+ return false;
}
/// Returns true if target has indicated at least one type should be bypassed.
/// control.
void setJumpIsExpensive(bool isExpensive = true);
- /// Tells the code generator that fsqrt is cheap, and should not be replaced
- /// with an alternative sequence of instructions.
- void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
-
/// Tells the code generator that this target supports floating point
/// exceptions and cares about preserving floating point exception behavior.
void setHasFloatingPointExceptions(bool FPExceptions = true) {
/// combined with "shift" to BitExtract instructions.
bool HasExtractBitsInsn;
- // Don't expand fsqrt with an approximation based on the inverse sqrt.
- bool FsqrtIsCheap;
-
/// Tells the code generator to bypass slow divide or remainder
/// instructions. For example, BypassSlowDivWidths[32,8] tells the code
/// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
- if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+ if (!DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (TLI.isFsqrtCheap(N0, DAG))
return SDValue();
// TODO: FSQRT nodes should have flags that propagate to the created nodes.
// For now, create a Flags object for use with all unsafe math transforms.
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
- return buildSqrtEstimate(N->getOperand(0), &Flags);
+ return buildSqrtEstimate(N0, &Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
SelectIsExpensive = false;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
- FsqrtIsCheap = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
MaskAndBranchFoldingIsLegal = false;
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
const char* getTargetNodeName(unsigned Opcode) const override;
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+ return true;
+ }
SDValue getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
def FeatureFastPartialYMMWrite
: SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
"true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput-bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar, that probably means it has some kind of
+// dependency and we should care more about reducing latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
]>;
// FIXME: define SKL model
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMWrite = false;
+ HasFastScalarFSQRT = false;
+ HasFastVectorFSQRT = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
/// of a YMM register without clearing the upper part.
bool HasFastPartialYMMWrite;
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC
+
+declare float @llvm.sqrt.f32(float) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
+
+define float @foo_x1(float %f) #0 {
+; SCALAR-EST-LABEL: foo_x1:
+; SCALAR-EST: # BB#0:
+; SCALAR-EST-NEXT: rsqrtss %xmm0
+; SCALAR-EST: retq
+;
+; SCALAR-ACC-LABEL: foo_x1:
+; SCALAR-ACC: # BB#0:
+; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}}
+; SCALAR-ACC-NEXT: retq
+ %call = tail call float @llvm.sqrt.f32(float %f) #1
+ ret float %call
+}
+
+define <4 x float> @foo_x4(<4 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x4:
+; VECTOR-EST: # BB#0:
+; VECTOR-EST-NEXT: rsqrtps %xmm0
+; VECTOR-EST: retq
+;
+; VECTOR-ACC-LABEL: foo_x4:
+; VECTOR-ACC: # BB#0:
+; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}}
+; VECTOR-ACC-NEXT: retq
+ %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
+ ret <4 x float> %call
+}
+
+define <8 x float> @foo_x8(<8 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x8:
+; VECTOR-EST: # BB#0:
+; VECTOR-EST-NEXT: rsqrtps
+; VECTOR-EST: retq
+;
+; VECTOR-ACC-LABEL: foo_x8:
+; VECTOR-ACC: # BB#0:
+; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}}
+; VECTOR-ACC-NOT: rsqrt
+; VECTOR-ACC: retq
+ %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1
+ ret <8 x float> %call
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }