Use PPC reciprocal estimates with Newton iteration in fast-math mode

author Hal Finkel <hfinkel@anl.gov>

Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)

committer Hal Finkel <hfinkel@anl.gov>

Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
author Hal Finkel <hfinkel@anl.gov>
Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
committer Hal Finkel <hfinkel@anl.gov>
Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td

index a1ea229..98869ea 100644 (file)
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -57,6 +57,16 @@ def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
                                          "Enable the MFOCRF instruction">;
  def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
                                          "Enable the fsqrt instruction">;
+def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
+                                        "Enable the fre instruction">;
+def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
+                                        "Enable the fres instruction">;
+def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
+                                        "Enable the frsqrte instruction">;
+def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
+                                        "Enable the frsqrtes instruction">;
+def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
+                              "Assume higher precision reciprocal estimates">;
  def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
                                          "Enable the stfiwx instruction">;
  def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
@@ -101,30 +111,46 @@ include "PPCInstrInfo.td"
  
  def : Processor<"generic", G3Itineraries, [Directive32]>;
  def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
+                                           FeatureFRES, FeatureFRSQRTE,
                                             FeatureBookE]>;
  def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
+                                           FeatureFRES, FeatureFRSQRTE,
                                             FeatureBookE]>;
  def : Processor<"601", G3Itineraries, [Directive601]>;
  def : Processor<"602", G3Itineraries, [Directive602]>;
-def : Processor<"603", G3Itineraries, [Directive603]>;
-def : Processor<"603e", G3Itineraries, [Directive603]>;
-def : Processor<"603ev", G3Itineraries, [Directive603]>;
-def : Processor<"604", G3Itineraries, [Directive604]>;
-def : Processor<"604e", G3Itineraries, [Directive604]>;
-def : Processor<"620", G3Itineraries, [Directive620]>;
-def : Processor<"750", G4Itineraries, [Directive750]>;
-def : Processor<"g3", G3Itineraries, [Directive750]>;
-def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"603", G3Itineraries, [Directive603,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603e", G3Itineraries, [Directive603,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603ev", G3Itineraries, [Directive603,
+                                         FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604", G3Itineraries, [Directive604,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604e", G3Itineraries, [Directive604,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"620", G3Itineraries, [Directive620,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"750", G4Itineraries, [Directive750,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g3", G3Itineraries, [Directive750,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                            FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                           FeatureFRES, FeatureFRSQRTE]>;
  def : Processor<"970", G5Itineraries,
                    [Directive970, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureMFOCRF, FeatureFSqrt,
+                   FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
                     Feature64Bit /*, Feature64BitRegs */]>;
  def : Processor<"g5", G5Itineraries,
                    [Directive970, FeatureAltivec,
                     FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureFRES, FeatureFRSQRTE,
                     Feature64Bit /*, Feature64BitRegs */]>;
  def : ProcessorModel<"e500mc", PPCE500mcModel,
                    [DirectiveE500mc, FeatureMFOCRF,
@@ -134,43 +160,57 @@ def : ProcessorModel<"e5500", PPCE5500Model,
                     FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
  def : Processor<"a2", PPCA2Itineraries,
                    [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
                     FeatureFPRND, FeatureFPCVT, FeatureISEL,
                     FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
                 /*, Feature64BitRegs */]>;
  def : Processor<"a2q", PPCA2Itineraries,
                    [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
                     FeatureFPRND, FeatureFPCVT, FeatureISEL,
                     FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
                 /*, Feature64BitRegs */, FeatureQPX]>;
  def : Processor<"pwr3", G5Itineraries,
-                  [DirectivePwr3, FeatureAltivec, FeatureMFOCRF,
+                  [DirectivePwr3, FeatureAltivec,
+                   FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
                     FeatureSTFIWX, Feature64Bit]>;
  def : Processor<"pwr4", G5Itineraries,
                    [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
+                   FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
+                   FeatureSTFIWX, Feature64Bit]>;
  def : Processor<"pwr5", G5Itineraries,
                    [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, Feature64Bit]>;
  def : Processor<"pwr5x", G5Itineraries,
                    [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureFPRND,
-                   Feature64Bit]>;
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
  def : Processor<"pwr6", G5Itineraries,
                    [DirectivePwr6, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
-                   FeatureLFIWAX, FeatureFPRND, Feature64Bit
-               /*, Feature64BitRegs */]>;
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
  def : Processor<"pwr6x", G5Itineraries,
                    [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
                     FeatureFPRND, Feature64Bit]>;
  def : Processor<"pwr7", G5Itineraries,
                    [DirectivePwr7, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
-                   FeatureLFIWAX, FeatureFPRND, FeatureFPCVT,
-                   FeatureISEL, FeaturePOPCNTD, FeatureLDBRX,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX,
                     Feature64Bit /*, Feature64BitRegs */]>;
  def : Processor<"ppc", G3Itineraries, [Directive32]>;
  def : Processor<"ppc64", G5Itineraries,
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index ee43fc6..3bda37b 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -150,10 +150,15 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  
    // If we're enabling GP optimizations, use hardware square root
-  if (!Subtarget->hasFSQRT()) {
+  if (!Subtarget->hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget->hasFRSQRTE() && Subtarget->hasFRE()))
      setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+
+  if (!Subtarget->hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
      setOperationAction(ISD::FSQRT, MVT::f32, Expand);
-  }
  
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
@@ -469,6 +474,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
  
      setOperationAction(ISD::MUL, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+
+    if (TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    }
+
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);
      setOperationAction(ISD::MUL, MVT::v8i16, Custom);
      setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -519,6 +530,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setTargetDAGCombine(ISD::BR_CC);
    setTargetDAGCombine(ISD::BSWAP);
  
+  // Use reciprocal estimates.
+  if (TM.Options.UnsafeFPMath) {
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::FSQRT);
+  }
+
    // Darwin long double math library functions have $LDBL128 appended.
    if (Subtarget->isDarwin()) {
      setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
@@ -590,6 +607,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case PPCISD::FCFID:           return "PPCISD::FCFID";
    case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
    case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FRE:             return "PPCISD::FRE";
+  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
    case PPCISD::STFIWX:          return "PPCISD::STFIWX";
    case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
    case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
@@ -6658,6 +6677,153 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  // Target Optimization Hooks
  //===----------------------------------------------------------------------===//
  
+SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N,
+                                             DAGCombinerInfo &DCI,
+                                             bool UseOperand) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRES()) ||
+      (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRE())  ||
+      (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal, we need to find the zero of the function:
+    //   F(X) = A X - 1 [which has a zero at X = 1/A]
+    //     =>
+    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+    //     does not require additional intermediate precision]
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+    if (N->getValueType(0).getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc dl = N->getDebugLoc();
+
+    SDValue FPOne =
+      DAG.getConstantFP(1.0, N->getValueType(0).getScalarType());
+    if (N->getValueType(0).isVector()) {
+      assert(N->getValueType(0).getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0),
+                          FPOne, FPOne, FPOne, FPOne);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRE, dl,
+                              N->getValueType(0),
+                              UseOperand ? N->getOperand(1) :
+                                           SDValue(N, 0));
+    DCI.AddToWorklist(Est.getNode());
+
+    // Newton iterations: Est = Est + Est (1 - Arg * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
+                                   N->getValueType(0),
+                                   UseOperand ? N->getOperand(1) :
+                                                SDValue(N, 0),
+                                   Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl,
+                           N->getValueType(0), FPOne, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl,
+                           N->getValueType(0), Est, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FADD, dl,
+                        N->getValueType(0), Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRSQRTES()) ||
+      (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRSQRTE())  ||
+      (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal sqrt, we need to find the zero of the function:
+    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+    //     =>
+    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+    // As a result, we precompute A/2 prior to the iteration loop.
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+    if (N->getValueType(0).getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc dl = N->getDebugLoc();
+
+    SDValue FPThreeHalfs =
+      DAG.getConstantFP(1.5, N->getValueType(0).getScalarType());
+    if (N->getValueType(0).isVector()) {
+      assert(N->getValueType(0).getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPThreeHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0),
+                                 FPThreeHalfs, FPThreeHalfs,
+                                 FPThreeHalfs, FPThreeHalfs);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl,
+                              N->getValueType(0), N->getOperand(0));
+    DCI.AddToWorklist(Est.getNode());
+
+    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
+    // this entire sequence requires only one FP constant.
+    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                                  FPThreeHalfs, N->getOperand(0));
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    HalfArg = DAG.getNode(ISD::FSUB, dl, N->getValueType(0),
+                          HalfArg, N->getOperand(0));
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
+                                   N->getValueType(0), Est, Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl,
+                           N->getValueType(0), HalfArg, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl,
+                           N->getValueType(0), FPThreeHalfs, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FMUL, dl,
+                        N->getValueType(0), Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
  SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
    const TargetMachine &TM = getTargetMachine();
@@ -6684,7 +6850,44 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
          return N->getOperand(0);
      }
      break;
+  case ISD::FDIV: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
+      SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getNode(), DCI);
+      if (RV.getNode() != 0) {
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    }
+
+    SDValue RV = DAGCombineFastRecip(N, DCI);
+    if (RV.getNode() != 0) {
+      DCI.AddToWorklist(RV.getNode());
+      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                         N->getOperand(0), RV);
+    }
+
+    }
+    break;
+  case ISD::FSQRT: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
+    // reciprocal sqrt.
+    SDValue RV = DAGCombineFastRecipFSQRT(N, DCI);
+    if (RV.getNode() != 0) {
+      DCI.AddToWorklist(RV.getNode());
+      RV = DAGCombineFastRecip(RV.getNode(), DCI, false);
+      if (RV.getNode() != 0)
+        return RV;
+    }
  
+    }
+    break;
    case ISD::SINT_TO_FP:
      if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
        if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h

index 6690899..2ddcb25 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -49,6 +49,9 @@ namespace llvm {
        /// unsigned integers.
        FCTIDUZ, FCTIWUZ,
  
+      /// Reciprocal estimate instructions (unary FP ops).
+      FRE, FRSQRTE,
+
        // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
        // three v4f32 operands and producing a v4f32 result.
        VMADDFP, VNMSUBFP,
@@ -620,6 +623,10 @@ namespace llvm {
  
      SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
      SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue DAGCombineFastRecip(SDNode *N, DAGCombinerInfo &DCI,
+                                bool UseOperand = true) const;
+    SDValue DAGCombineFastRecipFSQRT(SDNode *N, DAGCombinerInfo &DCI) const;
    };
  }
  
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td

index fff91df..18968ba 100644 (file)
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -795,6 +795,9 @@ def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
  def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
            (VPERM $vA, $vB, $vC)>;
  
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
  // Vector shifts
  def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
            (v16i8 (VSLB $vA, $vB))>;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td

index 067f5aa..feca7c0 100644 (file)
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -62,6 +62,9 @@ def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
  // PowerPC specific DAG Nodes.
  //
  
+def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
+def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+
  def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
  def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
  def PPCfcfids : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
@@ -1223,8 +1226,21 @@ def FNEGS  : XForm_26<63, 40, (outs F4RC:$frD), (ins F4RC:$frB),
  def FNEGD  : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
                        "fneg $frD, $frB", FPGeneral,
                        [(set f64:$frD, (fneg f64:$frB))]>;
+
+// Reciprocal estimates.
+def FRE      : XForm_26<63, 24, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fre $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfre f64:$frB))]>;
+def FRES     : XForm_26<59, 24, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "fres $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (PPCfre f32:$frB))]>;
+def FRSQRTE  : XForm_26<63, 26, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "frsqrte $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+def FRSQRTES : XForm_26<59, 26, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "frsqrtes $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
  }
-                      
  
  // XL-Form instructions.  condition register logical ops.
  //
@@ -1687,5 +1703,15 @@ def : Pat<(membarrier (i32 imm /*ll*/),
  
  def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
  
+// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+
  include "PPCInstrAltivec.td"
  include "PPCInstr64Bit.td"
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp

index 1b7150f..a8f2b3f 100644 (file)
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -38,6 +38,11 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
    , HasAltivec(false)
    , HasQPX(false)
    , HasFSQRT(false)
+  , HasFRE(false)
+  , HasFRES(false)
+  , HasFRSQRTE(false)
+  , HasFRSQRTES(false)
+  , HasRecipPrec(false)
    , HasSTFIWX(false)
    , HasLFIWAX(false)
    , HasFPRND(false)
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h

index d5dfa1e..65b4d21 100644 (file)
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -77,6 +77,8 @@ protected:
    bool HasAltivec;
    bool HasQPX;
    bool HasFSQRT;
+  bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
+  bool HasRecipPrec;
    bool HasSTFIWX;
    bool HasLFIWAX;
    bool HasFPRND;
@@ -159,6 +161,11 @@ public:
  
    // Specific obvious features.
    bool hasFSQRT() const { return HasFSQRT; }
+  bool hasFRE() const { return HasFRE; }
+  bool hasFRES() const { return HasFRES; }
+  bool hasFRSQRTE() const { return HasFRSQRTE; }
+  bool hasFRSQRTES() const { return HasFRSQRTES; }
+  bool hasRecipPrec() const { return HasRecipPrec; }
    bool hasSTFIWX() const { return HasSTFIWX; }
    bool hasLFIWAX() const { return HasLFIWAX; }
    bool hasFPRND() const { return HasFPRND; }
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll

new file mode 100644 (file)

index 0000000..cad98c5
--- /dev/null
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -0,0 +1,178 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-SAFE %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare double @llvm.sqrt.f64(double)
+declare float @llvm.sqrt.f32(float)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+
+define double @foo(double %a, double %b) nounwind {
+entry:
+  %x = call double @llvm.sqrt.f64(double %b)
+  %r = fdiv double %a, %x
+  ret double %r
+
+; CHECK: @foo
+; CHECK: frsqrte
+; CHECK: fnmsub
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: blr
+
+; CHECK-SAFE: @foo
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define float @goo(float %a, float %b) nounwind {
+entry:
+  %x = call float @llvm.sqrt.f32(float %b)
+  %r = fdiv float %a, %x
+  ret float %r
+
+; CHECK: @goo
+; CHECK: frsqrtes
+; CHECK: fnmsubs
+; CHECK: fmuls
+; CHECK: fmadds
+; CHECK: fmuls
+; CHECK: fmuls
+; CHECK: blr
+
+; CHECK-SAFE: @goo
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+  %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
+  %r = fdiv <4 x float> %a, %x
+  ret <4 x float> %r
+
+; CHECK: @hoo
+; CHECK: vrsqrtefp
+
+; CHECK-SAFE: @hoo
+; CHECK-SAFE-NOT: vrsqrtefp
+; CHECK-SAFE: blr
+}
+
+define double @foo2(double %a, double %b) nounwind {
+entry:
+  %r = fdiv double %a, %b
+  ret double %r
+
+; CHECK: @foo2
+; CHECK: fre
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: blr
+
+; CHECK-SAFE: @foo2
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define float @goo2(float %a, float %b) nounwind {
+entry:
+  %r = fdiv float %a, %b
+  ret float %r
+
+; CHECK: @goo2
+; CHECK: fres
+; CHECK: fnmsubs
+; CHECK: fmadds
+; CHECK: fmuls
+; CHECK: blr
+
+; CHECK-SAFE: @goo2
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+  %r = fdiv <4 x float> %a, %b
+  ret <4 x float> %r
+
+; CHECK: @hoo2
+; CHECK: vrefp
+
+; CHECK-SAFE: @hoo2
+; CHECK-SAFE-NOT: vrefp
+; CHECK-SAFE: blr
+}
+
+define double @foo3(double %a) nounwind {
+entry:
+  %r = call double @llvm.sqrt.f64(double %a)
+  ret double %r
+
+; CHECK: @foo3
+; CHECK: frsqrte
+; CHECK: fnmsub
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmadd
+; CHECK: fmul
+; CHECK: fre
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: fnmsub
+; CHECK: fmadd
+; CHECK: blr
+
+; CHECK-SAFE: @foo3
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: blr
+}
+
+define float @goo3(float %a) nounwind {
+entry:
+  %r = call float @llvm.sqrt.f32(float %a)
+  ret float %r
+
+; CHECK: @goo3
+; CHECK: frsqrtes
+; CHECK: fnmsubs
+; CHECK: fmuls
+; CHECK: fmadds
+; CHECK: fmuls
+; CHECK: fres
+; CHECK: fnmsubs
+; CHECK: fmadds
+; CHECK: blr
+
+; CHECK-SAFE: @goo3
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @hoo3(<4 x float> %a) nounwind {
+entry:
+  %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
+  ret <4 x float> %r
+
+; CHECK: @hoo3
+; CHECK: vrsqrtefp
+; CHECK: vrefp
+
+; CHECK-SAFE: @hoo3
+; CHECK-SAFE-NOT: vrsqrtefp
+; CHECK-SAFE: blr
+}
+
author	Hal Finkel <hfinkel@anl.gov>
	Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
committer	Hal Finkel <hfinkel@anl.gov>
	Wed, 3 Apr 2013 04:01:11 +0000 (04:01 +0000)
lib/Target/PowerPC/PPC.td		patch \| blob \| history
lib/Target/PowerPC/PPCISelLowering.cpp		patch \| blob \| history
lib/Target/PowerPC/PPCISelLowering.h		patch \| blob \| history
lib/Target/PowerPC/PPCInstrAltivec.td		patch \| blob \| history
lib/Target/PowerPC/PPCInstrInfo.td		patch \| blob \| history
lib/Target/PowerPC/PPCSubtarget.cpp		patch \| blob \| history
lib/Target/PowerPC/PPCSubtarget.h		patch \| blob \| history
test/CodeGen/PowerPC/recipest.ll	[new file with mode: 0644]	patch \| blob