From 4bb99910b03716acacbace2bfc522bb4a8e49094 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 22 Dec 2016 03:05:41 +0000 Subject: [PATCH] AMDGPU: Custom lower f16 fdiv git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290301 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 22 +++++++++++++++++- lib/Target/AMDGPU/SIISelLowering.h | 1 + test/CodeGen/AMDGPU/fdiv.f16.ll | 44 +++++++++++++++++++++++------------- 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index b9302582fa8..5411ccf7400 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -299,7 +299,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. 
setOperationAction(ISD::FMA, MVT::f16, Legal);
@@ -3008,6 +3008,35 @@ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                      GlueChain.getValue(2));
 }
 
+/// Custom lowering for an f16 ISD::FDIV node.
+///
+/// Both operands are extended to f32, the dividend is multiplied by
+/// AMDGPUISD::RCP of the divisor, and the product is rounded back to f16.
+/// The rounded quotient is then passed to AMDGPUISD::DIV_FIXUP together with
+/// the original f16 divisor and dividend, and that node is returned.
+SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src0 = Op.getOperand(0);
+  SDValue Src1 = Op.getOperand(1);
+
+  // Widen both f16 operands (Src0 = dividend, Src1 = divisor) to f32.
+  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+
+  // Form the quotient in f32 as Src0 * rcp(Src1).
+  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
+  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
+
+  // Round back to f16. NOTE(review): the 0 flag presumably tells FP_ROUND the
+  // truncation may change the value (see ISD::FP_ROUND) -- confirm.
+  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+
+  // Operand order is (quotient, divisor, dividend). NOTE(review): DIV_FIXUP
+  // presumably patches up special-case inputs -- confirm against the ISA docs.
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+}
+
 // Faster 2.5 ULP division that does not support denormals.
 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
@@ -3201,6 +3230,9 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
   if (VT == MVT::f64)
     return LowerFDIV64(Op, DAG);
 
+  if (VT == MVT::f16)
+    return LowerFDIV16(Op, DAG);
+
   llvm_unreachable("Unexpected type for fdiv");
 }
 
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index cb6d5364793..b4d87d9406f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -37,6 +37,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git
a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index bad04326193..da791f7e665 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -4,27 +4,43 @@
 ; Make sure fdiv is promoted to f32.
 
 ; GCN-LABEL: {{^}}fdiv_f16
-; GCN: v_cvt_f32_f16
-; GCN: v_cvt_f32_f16
-; GCN: v_div_scale_f32
-; GCN-DAG: v_div_scale_f32
-; GCN-DAG: v_rcp_f32
-; GCN: v_fma_f32
-; GCN: v_fma_f32
-; GCN: v_mul_f32
-; GCN: v_fma_f32
-; GCN: v_fma_f32
-; GCN: v_fma_f32
-; GCN: v_div_fmas_f32
-; GCN: v_div_fixup_f32
-; GCN: v_cvt_f16_f32
+; SI: v_cvt_f32_f16
+; SI: v_cvt_f32_f16
+; SI: v_div_scale_f32
+; SI-DAG: v_div_scale_f32
+; SI-DAG: v_rcp_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_mul_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_div_fmas_f32
+; SI: v_div_fixup_f32
+; SI: v_cvt_f16_f32
+
+; The VI checks expect the custom f16 lowering: both inputs are converted to
+; f32, multiplied via v_rcp_f32, converted back and fed to v_div_fixup_f16.
+; VI: buffer_load_ushort [[LHS:v[0-9]+]]
+; VI: buffer_load_ushort [[RHS:v[0-9]+]]
+
+; VI-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
+; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
+
+; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
+; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
+; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
+; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
+; VI: buffer_store_short [[RESULT]]
 define void @fdiv_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
+  ; NOTE(review): the loads are presumably volatile so they stay in source
+  ; order and the LHS/RHS captures above bind to %a and %b -- confirm.
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fdiv half %a.val, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void
-- 
2.11.0