[AArch64] Use the reciprocal estimation machinery

author Evandro Menezes <e.menezes@samsung.com>

Wed, 4 May 2016 20:18:27 +0000 (20:18 +0000)

committer Evandro Menezes <e.menezes@samsung.com>

Wed, 4 May 2016 20:18:27 +0000 (20:18 +0000)
author Evandro Menezes <e.menezes@samsung.com>
Wed, 4 May 2016 20:18:27 +0000 (20:18 +0000)
committer Evandro Menezes <e.menezes@samsung.com>
Wed, 4 May 2016 20:18:27 +0000 (20:18 +0000)
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index 10a2b9e..575f9d9 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -970,6 +970,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
    case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
    case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
+  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
+  case AArch64ISD::FRECPE:            return "AArch64ISD::FRECPE";
    }
    return nullptr;
  }
@@ -4624,6 +4626,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  //                          AArch64 Optimization Hooks
  //===----------------------------------------------------------------------===//
  
+/// getEstimate - Return the appropriate estimate DAG for either the reciprocal
+/// or the reciprocal square root.
+static SDValue getEstimate(const AArch64Subtarget &ST,
+  const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
+  const SDValue &Operand, unsigned &ExtraSteps) {
+  if (!ST.hasNEON())
+    return SDValue();
+
+  EVT VT = Operand.getValueType();
+
+  std::string RecipOp;
+  RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt";
+  RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp;
+  RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f";
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  ExtraSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+  DAGCombinerInfo &DCI, unsigned &ExtraSteps) const {
+  return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps);
+}
+
+SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
+  DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
+  UseOneConst = true;
+  return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
+}
+
  //===----------------------------------------------------------------------===//
  //                          AArch64 Inline Assembly Support
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h

index cf1c122..65e2614 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,6 +187,10 @@ enum NodeType : unsigned {
    SMULL,
    UMULL,
  
+  // Reciprocal estimates.
+  FRECPE,
+  FRSQRTE,
+
    // NEON Load/Store with post-increment base updates
    LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
    LD3post,
@@ -511,6 +515,11 @@ private:
  
    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                          std::vector<SDNode *> *Created) const override;
+  SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps,
+                           bool &UseOneConstNR) const override;
+  SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps) const override;
    unsigned combineRepeatedFPDivisors() const override;
  
    ConstraintType getConstraintType(StringRef Constraint) const override;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td

index 1d37b6a..9e39549 100644 (file)
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -283,6 +283,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
  def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
  def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
  
+def AArch64frecpe   : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frsqrte  : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+
  def AArch64saddv    : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
  def AArch64uaddv    : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
  def AArch64sminv    : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -3401,6 +3404,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
  def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
            (FRECPEv1i64 FPR64:$Rn)>;
  
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+          (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+          (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+          (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+          (FRECPEv2f64 FPR128:$Rn)>;
+
  def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
            (FRECPXv1i32 FPR32:$Rn)>;
  def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3413,6 +3429,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
  def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
            (FRSQRTEv1i64 FPR64:$Rn)>;
  
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+          (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+          (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+          (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+          (FRSQRTEv2f64 FPR128:$Rn)>;
+
  // If an integer is about to be converted to a floating point value,
  // just load it on the floating point unit.
  // Here are the patterns for 8 and 16-bits to float.
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp

index 33f65ce..f184efe 100644 (file)
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -136,6 +136,30 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) {
    return "E-m:e-i64:64-i128:128-n32:64-S128";
  }
  
+// Helper function to set up the defaults for reciprocals.
+static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
+{
+  // For the estimates, convergence is quadratic, so essentially the number of
+  // digits is doubled after each iteration.  In ARMv8, the minimum architected
+  // accuracy of the initial estimate is 2^-8.  Therefore, the numbers of extra
+  // steps to refine the result for float (23 mantissa bits) and for double
+  // (52 mantissa bits) are 2 and 3, respectively.
+  unsigned ExtraStepsF = 2,
+           ExtraStepsD = ExtraStepsF + 1;
+  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
+  bool UseRsqrt = ST.isExynosM1();
+
+  TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
+  TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD);
+
+  TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD);
+  TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD);
+}
+
  /// TargetMachine ctor - Create an AArch64 architecture model.
  ///
  AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
@@ -149,7 +173,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
      : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS,
                          Options, RM, CM, OL),
        TLOF(createTLOF(getTargetTriple())),
-      isLittle(LittleEndian) {
+      Subtarget(TT, CPU, FS, *this, LittleEndian) {
+  initReciprocals(*this, Subtarget);
    initAsmInfo();
  }
  
@@ -189,7 +214,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
      // function that reside in TargetOptions.
      resetTargetOptions(F);
      I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
-                                            isLittle);
+                                            Subtarget.isLittleEndian());
  #ifndef LLVM_BUILD_GLOBAL_ISEL
     GISelAccessor *GISel = new GISelAccessor();
  #else
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h

index 8d49a29..aac98a2 100644 (file)
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -46,7 +46,7 @@ public:
    }
  
  private:
-  bool isLittle;
+  AArch64Subtarget Subtarget;
  };
  
  // AArch64leTargetMachine - AArch64 little endian target machine.
diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll

new file mode 100644 (file)

index 0000000..710739b
--- /dev/null
+++ b/test/CodeGen/AArch64/recp-fastmath.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!div,!vec-div | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=div,vec-div   | FileCheck %s
+
+define float @frecp(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; FAULT-LABEL: frecp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: frecp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: fmov
+}
+
+define <2 x float> @f2recp(<2 x float> %x) #0 {
+  %div = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+  ret <2 x float> %div
+
+; FAULT-LABEL: f2recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: f2recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+define <4 x float> @f4recp(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; FAULT-LABEL: f4recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: f4recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+define double @drecp(double %x) #0 {
+  %div = fdiv fast double 1.0, %x
+  ret double %div
+
+; FAULT-LABEL: drecp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: drecp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: fmov
+}
+
+define <2 x double> @d2recp(<2 x double> %x) #0 {
+  %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %x
+  ret <2 x double> %div
+
+; FAULT-LABEL: d2recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: d2recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll

new file mode 100644 (file)

index 0000000..8bc85a5
--- /dev/null
+++ b/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -0,0 +1,158 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!sqrt,!vec-sqrt | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=sqrt,vec-sqrt   | FileCheck %s
+
+declare float @llvm.sqrt.f32(float) #1
+declare double @llvm.sqrt.f64(double) #1
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #1
+
+define float @fsqrt(float %a) #0 {
+  %1 = tail call fast float @llvm.sqrt.f32(float %a)
+  ret float %1
+
+; FAULT-LABEL: fsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: fsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x float> @f2sqrt(<2 x float> %a) #0 {
+  %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+  ret <2 x float> %1
+
+; FAULT-LABEL: f2sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f2sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define <4 x float> @f4sqrt(<4 x float> %a) #0 {
+  %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+  ret <4 x float> %1
+
+; FAULT-LABEL: f4sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f4sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define double @dsqrt(double %a) #0 {
+  %1 = tail call fast double @llvm.sqrt.f64(double %a)
+  ret double %1
+
+; FAULT-LABEL: dsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: dsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x double> @d2sqrt(<2 x double> %a) #0 {
+  %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+  ret <2 x double> %1
+
+; FAULT-LABEL: d2sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: d2sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define float @frsqrt(float %a) #0 {
+  %1 = tail call fast float @llvm.sqrt.f32(float %a)
+  %2 = fdiv fast float 1.000000e+00, %1
+  ret float %2
+
+; FAULT-LABEL: frsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: frsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
+  %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+  %2 = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %1
+  ret <2 x float> %2
+
+; FAULT-LABEL: f2rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f2rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
+  %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+  %2 = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %1
+  ret <4 x float> %2
+
+; FAULT-LABEL: f4rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f4rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define double @drsqrt(double %a) #0 {
+  %1 = tail call fast double @llvm.sqrt.f64(double %a)
+  %2 = fdiv fast double 1.000000e+00, %1
+  ret double %2
+
+; FAULT-LABEL: drsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: drsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
+  %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+  %2 = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, %1
+  ret <2 x double> %2
+
+; FAULT-LABEL: d2rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: d2rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
author	Evandro Menezes <e.menezes@samsung.com>
	Wed, 4 May 2016 20:18:27 +0000 (20:18 +0000)
committer	Evandro Menezes <e.menezes@samsung.com>
	Wed, 4 May 2016 20:18:27 +0000 (20:18 +0000)
lib/Target/AArch64/AArch64ISelLowering.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64ISelLowering.h		patch \| blob \| history
lib/Target/AArch64/AArch64InstrInfo.td		patch \| blob \| history
lib/Target/AArch64/AArch64TargetMachine.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64TargetMachine.h		patch \| blob \| history
test/CodeGen/AArch64/recp-fastmath.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AArch64/sqrt-fastmath.ll	[new file with mode: 0644]	patch \| blob