revert r284495: [Target] remove TargetRecip class

[android-x86/external-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index b1f2610..c1b6a22 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -53,6 +53,7 @@
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRecip.h"
  #include "X86IntrinsicsInfo.h"
  #include <bitset>
  #include <numeric>
@@ -84,6 +85,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  
+  // By default (and when -ffast-math is on), enable estimate codegen with 1
+  // refinement step for floats (not doubles) except scalar division. Scalar
+  // division estimates are disabled because they break too much real-world
+  // code. These defaults are intended to match GCC behavior.
+  ReciprocalEstimates.set("sqrtf", true, 1);
+  ReciprocalEstimates.set("divf", false, 1);
+  ReciprocalEstimates.set("vec-sqrtf", true, 1);
+  ReciprocalEstimates.set("vec-divf", true, 1);
+
    // For 64-bit, since we have so many registers, use the ILP scheduler.
    // For 32-bit, use the register pressure specific scheduling.
    // For Atom, always use ILP scheduling.
@@ -15241,10 +15251,11 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  /// The minimum architected relative accuracy is 2^-12. We need one
  /// Newton-Raphson step to have a good float result (24 bits of precision).
  SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
-                                            SelectionDAG &DAG, int Enabled,
-                                            int &RefinementSteps,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
                                              bool &UseOneConstNR) const {
    EVT VT = Op.getValueType();
+  const char *RecipOp;
  
    // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
    // TODO: Add support for AVX512 (v16f32).
@@ -15253,24 +15264,30 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
    // instructions: convert to single, rsqrtss, convert back to double, refine
    // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
    // along with FMA, this could be a throughput win.
-  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
-      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
-      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
-    if (RefinementSteps == ReciprocalEstimate::Unspecified)
-      RefinementSteps = 1;
+  if (VT == MVT::f32 && Subtarget.hasSSE1())
+    RecipOp = "sqrtf";
+  else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget.hasAVX()))
+    RecipOp = "vec-sqrtf";
+  else
+    return SDValue();
  
-    UseOneConstNR = false;
-    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  TargetRecip Recips = getTargetRecipForFunc(DCI.DAG.getMachineFunction());
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  
  /// The minimum architected relative accuracy is 2^-12. We need one
  /// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
-                                            int Enabled,
-                                            int &RefinementSteps) const {
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
    EVT VT = Op.getValueType();
+  const char *RecipOp;
  
    // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
    // TODO: Add support for AVX512 (v16f32).
@@ -15279,22 +15296,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
    // 15 instructions: convert to single, rcpss, convert back to double, refine
    // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
    // along with FMA, this could be a throughput win.
+  if (VT == MVT::f32 && Subtarget.hasSSE1())
+    RecipOp = "divf";
+  else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget.hasAVX()))
+    RecipOp = "vec-divf";
+  else
+    return SDValue();
  
-  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
-      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
-      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
-    // Enable estimate codegen with 1 refinement step for vector division.
-    // Scalar division estimates are disabled because they break too much
-    // real-world code. These defaults are intended to match GCC behavior.
-    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
-      return SDValue();
-
-    if (RefinementSteps == ReciprocalEstimate::Unspecified)
-      RefinementSteps = 1;
+  TargetRecip Recips = getTargetRecipForFunc(DCI.DAG.getMachineFunction());
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
  
-    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  
  /// If we have at least two divisions that use the same divisor, convert to