From: Changpeng Fang Date: Fri, 24 Jan 2020 00:57:43 +0000 (-0800) Subject: AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=2531535984ad989ce88aeee23cb92a827da6686e;p=android-x86%2Fexternal-llvm-project.git AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare Summary: RCP has an accuracy limit. If an fdiv's !fpmath metadata requires high accuracy, rcp may not meet the requirement. However, by DAG lowering the fpmath information has been lost, and thus we may generate either inaccurate rcp-based computation or slow code for fdiv. This patch implements the fdiv optimizations in AMDGPUCodeGenPrepare, where the !fpmath metadata is still available. FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on unsafe-fp-math, fast math flags, denormals and the fpmath accuracy request. RCP Optimizations: 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with denormals flushed. a/b -> a*rcp(b) when fast unsafe rcp is legal. Use fdiv.fast: a/b -> fdiv.fast(a, b) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals flushed. 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and fpmath >= 2.5ULP with denormals. Worked IR examples of these rules are sketched at the end, after the diff. Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D71293 --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 3ac634b6a47..76f8d5e8c32 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -606,12 +606,64 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { return true; } -static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { +// Perform RCP optimizations: +// +// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with +// denormals flushed. +// +// a/b -> a*rcp(b) when fast unsafe rcp is legal. +static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal, + IRBuilder<> Builder, MDNode *FPMath, Module *Mod, + bool HasDenormals, bool NeedHighAccuracy) { + + Type *Ty = Den->getType(); + if (!FastUnsafeRcpLegal && Ty->isFloatTy() && + (HasDenormals || NeedHighAccuracy)) + return nullptr; + + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty); + if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { + if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) { + if (CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals. + + // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't + // insert rsq intrinsic here. + + // 1.0 / x -> rcp(x) + return Builder.CreateCall(Decl, { Den }); + } + + // Same as for 1.0, but expand the sign out of the constant. + if (CLHS->isExactlyValue(-1.0)) { + // -1.0 / x -> rcp (fneg x) + Value *FNeg = Builder.CreateFNeg(Den); + return Builder.CreateCall(Decl, { FNeg }); + } + } + } + + if (FastUnsafeRcpLegal) { + // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y) + Value *Recip = Builder.CreateCall(Decl, { Den }); + return Builder.CreateFMul(Num, Recip, "", FPMath); + } + return nullptr; +} + +static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal, + bool HasDenormals) { const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); if (!CNum) return HasDenormals; - if (UnsafeDiv) + if (FastUnsafeRcpLegal) return true; bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); @@ -620,44 +672,57 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { return HasDenormals ^ IsOne; } -// Insert an intrinsic for fast fdiv for safe math situations where we can -// reduce precision. Leave fdiv for situations where the generic node is -// expected to be optimized. + +// Optimization is performed based on fpmath, fast math flags, as well as +// denormals, to lower fdiv using either rcp or fdiv.fast. +// +// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on +// unsafe-fp-math, fast math flags, denormals and fpmath +// accuracy request. +// +// RCP Optimizations: +// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with +// denormals flushed. +// a/b -> a*rcp(b) when fast unsafe rcp is legal. +// +// Use fdiv.fast: +// a/b -> fdiv.fast(a, b) when RCP optimization is not performed and +// fpmath >= 2.5ULP with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and +// fpmath >= 2.5ULP with denormals. bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { - Type *Ty = FDiv.getType(); - if (!Ty->getScalarType()->isFloatTy()) - return false; + Type *Ty = FDiv.getType()->getScalarType(); - MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); - if (!FPMath) + // No intrinsic for fdiv16 if target does not support f16. + if (Ty->isHalfTy() && !ST->has16BitInsts()) return false; const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv); - float ULP = FPOp->getFPAccuracy(); - if (ULP < 2.5f) - return false; + MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); + const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f; FastMathFlags FMF = FPOp->getFastMathFlags(); - bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || - FMF.allowReciprocal(); + // Determine whether it is ok to use rcp based on unsafe-fp-math, + // fast math flags, denormals and accuracy request. + const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() || + (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy) + || FMF.approxFunc())); - // With UnsafeDiv node will be optimized to just rcp and mul. - if (UnsafeDiv) - return false; + // Use fdiv.fast only for f32, when fpmath >= 2.5ULP and rcp is not used.
+ const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy && + !FastUnsafeRcpLegal; - IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(FMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - Value *Num = FDiv.getOperand(0); Value *Den = FDiv.getOperand(1); Value *NewFDiv = nullptr; - - if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) { NewFDiv = UndefValue::get(VT); // FIXME: Doesn't do the right thing for cases where the vector is partially @@ -665,19 +730,32 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); Value *DenEltI = Builder.CreateExtractElement(Den, I); - Value *NewElt; - - if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) { - NewElt = Builder.CreateFDiv(NumEltI, DenEltI); - } else { - NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); + Value *NewElt = nullptr; + if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal, + HasFP32Denormals)) { + Function *Decl = + Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); + NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath); } + if (!NewElt) // Try rcp. + NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder, + FPMath, Mod, HasFP32Denormals, NeedHighAccuracy); + if (!NewElt) + NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath); NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } - } else { - if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals)) - NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } else { // Scalar. + if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal, + HasFP32Denormals)) { + Function *Decl = + Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); + NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath); + } + if (!NewFDiv) { // Try rcp. + NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath, + Mod, HasFP32Denormals, NeedHighAccuracy); + } } if (NewFDiv) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2f7f75530bb..84f26cccd7d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7474,49 +7474,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); - if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) + bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath || + (Flags.hasAllowReciprocal() && + ((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) || + VT == MVT::f16 || + Flags.hasApproximateFuncs())); + + // Do rcp optimization only when fast unsafe rcp is legal here. + // NOTE: We already performed RCP optimization to insert intrinsics in + // AMDGPUCodeGenPrepare. Ideally there should be no opportunity here for + // rcp optimization. + // However, there are cases like FREM, which is expanded into a sequence + // of instructions including FDIV, which may expose new opportunities.
+ if (!FastUnsafeRcpLegal) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { - if (CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. - // - // v_rcp_f16 and v_rsq_f16 DO support denormals. - - // 1.0 / sqrt(x) -> rsq(x) - - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - } + if (CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals. - // Same as for 1.0, but expand the sign out of the constant. - if (CLHS->isExactlyValue(-1.0)) { - // -1.0 / x -> rcp (fneg x) - SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); - } + // 1.0 / sqrt(x) -> rsq(x) + + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); } - } - if (Unsafe) { - // Turn into multiply by the reciprocal. - // x / y -> x * (1.0 / y) - SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); + // Same as for 1.0, but expand the sign out of the constant. + if (CLHS->isExactlyValue(-1.0)) { + // -1.0 / x -> rcp (fneg x) + SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); + } } - return SDValue(); + // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, @@ -8663,6 +8668,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, N->getFlags()); } + if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, + N0.getOperand(0), N->getFlags()); + } + return AMDGPUTargetLowering::performRcpCombine(N, DCI); } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index 0c7160df2b9..6ead8b9ad13 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -16,8 +16,10 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 -; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 -; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %[[FAST_RCP:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]], !fpmath !0 +; CHECK: %[[ARCP_RCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: arcp.md.25ulp = fmul arcp float %a, %[[ARCP_RCP]], !fpmath !0 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -45,12 +47,12 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float ; CHECK-LABEL: @rcp_fdiv_fpmath( ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} -; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 +; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 -; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} -; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 +; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x +; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x) define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { %no.md = fdiv float 1.0, %x store volatile float %no.md, float addrspace(1)* %out @@ -76,10 +78,58 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) # ret void } +; CHECK-LABEL: @rcp_fdiv_arcp_denormal( +; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0 +; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2 +; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) +; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 { + + %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0 + store volatile float %arcp.low.accuracy, float addrspace(1)* %out +
+ %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2 + store volatile float %arcp.high.accuracy, float addrspace(1)* %out + + %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0 + store volatile float %arcp.low.afn, float addrspace(1)* %out + + %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2 + store volatile float %arcp.high.afn, float addrspace(1)* %out + + ret void +} + ; CHECK-LABEL: @fdiv_fpmath_vector( -; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 +; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float %[[NO_A0]], %[[NO_B0]] +; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0 +; CHECK: %[[NO_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[NO_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float %[[NO_A1]], %[[NO_B1]] +; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]], !fpmath !1 +; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0 +; CHECK: %[[HALF_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]], !fpmath !1 +; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]], !fpmath !2 +; CHECK: %[[ONE_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ONE_FDIV0]], i64 0 +; CHECK: %[[ONE_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]], !fpmath !2 +; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 @@ -106,12 +156,52 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 } ; CHECK-LABEL: @rcp_fdiv_fpmath_vector( -; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} -; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} -; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0 -; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0 +; CHECK: %[[NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]] +; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0 +; CHECK: %[[NO1:[0-9]+]] =
extractelement <2 x float> %x, i64 1 +; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[NO1]] +; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[HALF0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1 +; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0 +; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1 +; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 +; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]] +; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]]) +; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0 +; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]]) +; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1 +; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]]) +; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0 +; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]]) +; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_RCP1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]]) +; CHECK: %[[FAST_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_25_RCP0]], i64 0 +; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_25_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]]) +; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_RCP1]], i64 1 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x @@ -136,12 +226,48 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out } ; CHECK-LABEL:
@rcp_fdiv_fpmath_vector_nonsplat( -; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x -; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x -; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0 -; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0 -; CHECK: store volatile <2 x float> %fast.25ulp +; CHECK: %[[NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]] +; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0 +; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 2.000000e+00, %[[NO1]] +; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 +; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]] +; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]]) +; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0 +; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]]) +; CHECK: %[[FAST_NO_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_NO_RCP1]] +; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + +; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]]) +; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0 +; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]]) +; CHECK: %[[ARCP_25_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_25_RCP1]] +; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + +; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]]) +; CHECK: %[[FAST_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_25_RCP0]], i64 0 +; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: %[[FAST_25_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]]) +; CHECK: %[[FAST_25_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_25_RCP1]] +; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_MUL1]], i64 1 +; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float>
addrspace(1)* %out define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -161,12 +287,29 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace ret void } -; FIXME: Should be able to get fdiv for 1.0 component ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( -; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 +; CHECK: %[[ARCP_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 +; CHECK: %[[ARCP_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 +; CHECK: %[[ARCP_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B0]]) +; CHECK: %[[ARCP_MUL0:[0-9]+]] = fmul arcp float %[[ARCP_A0]], %[[ARCP_RCP0]], !fpmath !0 +; CHECK: %[[ARCP_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_MUL0]], i64 0 +; CHECK: %[[ARCP_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 +; CHECK: %[[ARCP_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 +; CHECK: %[[ARCP_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B1]]) +; CHECK: %[[ARCP_MUL1:[0-9]+]] = fmul arcp float %[[ARCP_A1]], %[[ARCP_RCP1]], !fpmath !0 +; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_INS0]], float %[[ARCP_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %arcp.25ulp -; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 +; CHECK: %[[FAST_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0 +; CHECK: %[[FAST_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0 +; CHECK: %[[FAST_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B0]]) +; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]], !fpmath !0 +; CHECK: %[[FAST_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_MUL0]], i64 0 +; CHECK: %[[FAST_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1 +; CHECK: %[[FAST_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1 +; CHECK: %[[FAST_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B1]]) +; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]], !fpmath !0 +; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 @@ -186,8 +329,9 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 -; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 -; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0 +; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index bd4deb14aad..74ad6215905 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@
-284,6 +284,68 @@ define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, ret void } +; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt: + +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX10: s_denorm_mode 15 +; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX10: s_denorm_mode 12 +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], + +define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 { +entry: + %fdiv = fdiv float 1.000000e+00, %a + store float %fdiv, float addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt: + +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; PREGFX10-NOT: s_setreg +; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; PREGFX10-NOT: s_setreg + +; GFX10-NOT: s_denorm_mode +; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] +; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] +; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] +; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]] +; GFX10-NOT: s_denorm_mode + +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], +define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 { +entry: + %fdiv = fdiv float 1.000000e+00, %a + store float %fdiv, float addrspace(1)* %out + ret void +} + + attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" } attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" } attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" } diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 01499e681ea..c02a21efce5 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -348,7 +348,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void
@div_1_by_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %div = fdiv fast float 1.000000e+00, %load + %div = fdiv fast float 1.000000e+00, %load, !fpmath !0 store float %div, float addrspace(1)* %arg, align 4 ret void } @@ -359,7 +359,7 @@ define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) { ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %div = fdiv fast float -1.000000e+00, %load + %div = fdiv fast float -1.000000e+00, %load, !fpmath !0 store float %div, float addrspace(1)* %arg, align 4 ret void } @@ -370,7 +370,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) { ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %neg = fsub float -0.000000e+00, %load + %neg = fsub float -0.000000e+00, %load, !fpmath !0 %div = fdiv fast float 1.000000e+00, %neg store float %div, float addrspace(1)* %arg, align 4 ret void @@ -382,22 +382,18 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) { ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 - %neg = fsub float -0.000000e+00, %load + %neg = fsub float -0.000000e+00, %load, !fpmath !0 %div = fdiv fast float -1.000000e+00, %neg store float %div, float addrspace(1)* %arg, align 4 ret void } ; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv float 1.000000e+00, %load @@ -406,15 +402,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg } ; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1],
0x0 -; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv float -1.000000e+00, %load @@ -423,15 +415,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1) } ; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %neg = fsub float -0.000000e+00, %load %div = fdiv float 1.000000e+00, %neg @@ -441,15 +429,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1) } ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded: -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_div_scale_f32 -; GCN-DENORM: v_div_fmas_f32 -; GCN-DENORM: v_div_fixup_f32 - -; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32_e32 +; GCN-DAG: v_div_scale_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %neg = fsub float -0.000000e+00, %load diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 5dda92dbd5e..133afd4d32c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -219,13 +219,30 @@ define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* % ; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]], ; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]] ; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]] +define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { +.entry: + %tmp7 = fdiv float 1.000000e+00, %tmp6 + %tmp8 = fmul float 0.000000e+00, %tmp7 + %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 + %.i188 = fadd float %tmp9, 0.000000e+00 + %tmp10 = fcmp uge float %.i188, %tmp2 + %tmp11 = fsub float -0.000000e+00, %.i188 + %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 + %tmp12 = fcmp ule float %.i092, 0.000000e+00 + %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000 + ret float %.i198 +} + +; This is a workaround because -enable-no-signed-zeros-fp-math does not set up +; the function attribute unsafe-fp-math automatically. Combine with the previous +; test when that is done.
+; GCN-LABEL: {{^}}fneg_fadd_0_nsz: ; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], ; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], ; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]] - -define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { +define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 @@ -2524,3 +2541,4 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll index cf3b3ccbc07..25f110f9126 100644 --- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -11,7 +11,7 @@ define float @v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32(float %a) #0 { ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_med3_f32 v0, |v0|, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.fabs.f32(float %a.nnan.add) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -22,10 +22,10 @@ define float @v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32(float %a) #0 { ; GCN-LABEL: v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, -v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = fsub float -0.0, %a.nnan.add %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -71,7 +71,7 @@ define float @v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32(float %a, flo ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.copysign.f32(float %a.nnan.add, float %sign) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -101,7 +101,7 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b.nnan.add) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) @@ -166,7 +166,7 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float
@llvm.minnum.f32(float %a.nnan.add, float %b) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -182,7 +182,7 @@ define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b.nnan.add) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) @@ -215,7 +215,7 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b) %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) @@ -232,7 +232,7 @@ define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 %cmp = icmp eq i32 %c, 0 %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b.nnan.add @@ -269,7 +269,7 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %a.nnan.add = fdiv nnan float 1.0, %a + %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %cmp = icmp eq i32 %c, 0 %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) @@ -669,3 +669,5 @@ declare float @llvm.amdgcn.cubeid(float, float, float) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } + +!0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll index ad2d84b7911..a3c08038b87 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -37,7 +37,7 @@ define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 { ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { - %rcp = fdiv float 1.0, %src + %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -47,7 +47,7 @@ define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %o ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { - %rcp = fdiv float 1.0, %src + %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -61,8 +61,7 @@ define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* } ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: -; SI: v_sqrt_f32_e32 -; SI: v_rcp_f32_e32 +; SI: v_rsq_f32_e32 define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)*
%out, float %src) #1 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) @@ -144,3 +143,5 @@ attributes #1 = { nounwind "unsafe-fp-math"="false" "target-features"="-fp32-den attributes #2 = { nounwind "unsafe-fp-math"="true" "target-features"="-fp32-denormals" } attributes #3 = { nounwind "unsafe-fp-math"="false" "target-features"="+fp32-denormals" } attributes #4 = { nounwind "unsafe-fp-math"="true" "target-features"="+fp32-denormals" } + +!0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index d6b717411de..9f717df480f 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -112,7 +112,7 @@ bb: bb19: ; preds = %bb %tmp20 = uitofp i32 %arg6 to float - %tmp21 = fdiv float 1.000000e+00, %tmp20 + %tmp21 = fdiv float 1.000000e+00, %tmp20, !fpmath !0 %tmp22 = and i32 %arg6, 16777215 br label %bb23 @@ -258,3 +258,5 @@ declare float @llvm.fmuladd.f32(float, float, float) #1 attributes #0 = { nounwind willreturn } attributes #1 = { nounwind readnone speculatable } + +!0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index fbdaeb82929..b7552b06f08 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -10,7 +10,7 @@ ; EG: RECIP_IEEE define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { - %rcp = fdiv float 1.0, %src + %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -71,7 +71,7 @@ define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %ou ; EG: RECIP_IEEE define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) - %rcp = fdiv float 1.0, %src.fabs + %rcp = fdiv float 1.0, %src.fabs, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -83,7 +83,7 @@ define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src ; EG: RECIP_IEEE define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { - %rcp = fdiv float -1.0, %src + %rcp = fdiv float -1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -95,7 +95,7 @@ define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs - %rcp = fdiv float 1.0, %src.fabs.fneg + %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void } @@ -109,7 +109,7 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs - %rcp = fdiv float 1.0, %src.fabs.fneg + %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 store volatile float %rcp, float addrspace(1)* %out, align 4 %other = fmul float %src, %src.fabs.fneg diff --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll index 6fb680e6298..badaae3af23 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll +++
b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) { %load = load i32, i32 addrspace(1)* %in, align 4 %cvt = uitofp i32 %load to float - %div = fdiv float 1.000000e+00, %cvt + %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -15,7 +15,9 @@ define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* % define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) { %load = load i32, i32 addrspace(1)* %in, align 4 %cvt = sitofp i32 %load to float - %div = fdiv float 1.000000e+00, %cvt + %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } + +!0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll index 40c3c94246e..8480f344601 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.ll @@ -11,7 +11,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt + %div = fdiv float 1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -33,7 +33,7 @@ define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double add ; SI: s_endpgm define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt + %div = fdiv float 1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -41,15 +41,17 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float ; Recognize that this is rsqrt(a) * rcp(b) * c, ; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. +; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
+ ; SI-LABEL: @rsqrt_fmul ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] -; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] -; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] +; SI-UNSAFE-DAG: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], [[A]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[SQRT]], [[B]] +; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[MUL]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[RCP]] ; SI-UNSAFE: buffer_store_dword [[RESULT]] ; SI-SAFE-NOT: v_rsq_f32 @@ -78,13 +80,13 @@ define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] ; SI-SAFE: buffer_store_dword [[RSQ]] -; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}} -; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] -; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] +; SI-UNSAFE: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], v{{[0-9]+}} +; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] +; SI-UNSAFE: buffer_store_dword [[RSQ]] define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) - %div = fdiv float -1.0, %sqrt + %div = fdiv float -1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -109,14 +111,14 @@ define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double ; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] ; SI-SAFE: buffer_store_dword [[RSQ]] -; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}} -; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] -; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] +; SI-UNSAFE: v_sqrt_f32_e64 [[SQRT:v[0-9]+]], -v{{[0-9]+}} +; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] +; SI-UNSAFE: buffer_store_dword [[RSQ]] define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %val.fneg = fsub float -0.0, %val %sqrt = call float @llvm.sqrt.f32(float %val.fneg) - %div = fdiv float -1.0, %sqrt + %div = fdiv float -1.0, %sqrt, !fpmath !0 store float %div, float addrspace(1)* %out, align 4 ret void } @@ -136,3 +138,5 @@ define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, do store double %div, double addrspace(1)* %out, align 4 ret void } + +!0 = !{float 2.500000e+00}
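Worked IR examples of the rules in the summary (an illustrative sketch, not part of the committed diff; the rewritten forms follow the CHECK lines in amdgpu-codegenprepare-fdiv.ll, assuming f32 with denormals flushed and !0 = !{float 2.500000e+00}):

  ; a/b with 2.5 ulp requested but no fast-math flags: rcp is not legal,
  ; so AMDGPUCodeGenPrepare emits fdiv.fast instead.
  %md.25ulp = fdiv float %a, %b, !fpmath !0
  ; becomes:
  %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0

  ; 1/x with 2.5 ulp requested: rcp's worst-case 1 ulp error is accurate
  ; enough when denormals are flushed.
  %rcp.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
  ; becomes:
  %rcp.25ulp = call float @llvm.amdgcn.rcp.f32(float %x)

  ; a/b with fast flags: fast unsafe rcp is legal, so multiply by the
  ; reciprocal.
  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
  ; becomes:
  %0 = call fast float @llvm.amdgcn.rcp.f32(float %b)
  %fast.md.25ulp = fmul fast float %a, %0, !fpmath !0

  !0 = !{float 2.500000e+00}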
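For the new performRcpCombine case, rcp of sqrt is folded to rsq in the DAG for f32/f16, which is why safe_rsq_rcp_pat_f32 above now checks for v_rsq_f32_e32. A minimal sketch of the pattern it matches (hypothetical input, not taken from the patch):

  %sqrt = call float @llvm.sqrt.f32(float %src)
  %rsq = call float @llvm.amdgcn.rcp.f32(float %sqrt)
  ; after ISel this pair selects to a single v_rsq_f32_e32 instead of
  ; v_sqrt_f32_e32 followed by v_rcp_f32_e32.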