From 6bc216ccf6a7fc8e9f500fb44b12f045995b4c3d Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 23 Feb 2017 22:38:24 +0000 Subject: [PATCH] [NVPTX] Added support for .f16x2 instructions. This patch enables support for .f16x2 operations. Added new register type Float16x2. Added support for .f16x2 instructions. Added handling of vectorized loads/stores of v2f16 values. Differential Revision: https://reviews.llvm.org/D30057 Differential Revision: https://reviews.llvm.org/D30310 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296032 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 3 + lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 2 + lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 346 ++++- lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 + lib/Target/NVPTX/NVPTXISelLowering.cpp | 332 +++-- lib/Target/NVPTX/NVPTXISelLowering.h | 6 +- lib/Target/NVPTX/NVPTXInstrInfo.cpp | 2 + lib/Target/NVPTX/NVPTXInstrInfo.td | 124 +- lib/Target/NVPTX/NVPTXIntrinsics.td | 8 + lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 4 + lib/Target/NVPTX/NVPTXRegisterInfo.td | 2 + test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 34 + test/CodeGen/NVPTX/f16-instructions.ll | 16 +- test/CodeGen/NVPTX/f16x2-instructions.ll | 1433 +++++++++++++++++++++ test/CodeGen/NVPTX/param-load-store.ll | 182 ++- 15 files changed, 2343 insertions(+), 153 deletions(-) create mode 100644 test/CodeGen/NVPTX/f16x2-instructions.ll diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp index 04ae3c2533d..b774fe169d7 100644 --- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp +++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp @@ -64,6 +64,9 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { case 7: OS << "%h"; break; + case 8: + OS << "%hh"; + break; } unsigned VReg = RegNo & 0x0FFFFFFF; diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 
b9473514e5e..e5942997c20 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -363,6 +363,8 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { Ret = (6 << 28); } else if (RC == &NVPTX::Float16RegsRegClass) { Ret = (7 << 28); + } else if (RC == &NVPTX::Float16x2RegsRegClass) { + Ret = (8 << 28); } else { report_fatal_error("Bad register class"); } diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 2aef67b9caf..7da621ccdc3 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -84,6 +84,14 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStore(N)) return; break; + case ISD::EXTRACT_VECTOR_ELT: + if (tryEXTRACT_VECTOR_ELEMENT(N)) + return; + break; + case NVPTXISD::SETP_F16X2: + SelectSETP_F16X2(N); + return; + case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: if (tryLoadVector(N)) @@ -516,6 +524,127 @@ bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) { return true; } +// Map ISD::CondCode value to appropriate CmpMode expected by +// NVPTXInstPrinter::printCmpMode() +static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) { + using NVPTX::PTXCmpMode::CmpMode; + unsigned PTXCmpMode = [](ISD::CondCode CC) { + switch (CC) { + default: + llvm_unreachable("Unexpected condition code."); + case ISD::SETOEQ: + return CmpMode::EQ; + case ISD::SETOGT: + return CmpMode::GT; + case ISD::SETOGE: + return CmpMode::GE; + case ISD::SETOLT: + return CmpMode::LT; + case ISD::SETOLE: + return CmpMode::LE; + case ISD::SETONE: + return CmpMode::NE; + case ISD::SETO: + return CmpMode::NUM; + case ISD::SETUO: + return CmpMode::NotANumber; + case ISD::SETUEQ: + return CmpMode::EQU; + case ISD::SETUGT: + return CmpMode::GTU; + case ISD::SETUGE: + return CmpMode::GEU; + case ISD::SETULT: + return CmpMode::LTU; + case ISD::SETULE: + return CmpMode::LEU; + case ISD::SETUNE: + return CmpMode::NEU; + case ISD::SETEQ: + return 
CmpMode::EQ; + case ISD::SETGT: + return CmpMode::GT; + case ISD::SETGE: + return CmpMode::GE; + case ISD::SETLT: + return CmpMode::LT; + case ISD::SETLE: + return CmpMode::LE; + case ISD::SETNE: + return CmpMode::NE; + } + }(CondCode.get()); + + if (FTZ) + PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG; + + return PTXCmpMode; +} + +bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) { + unsigned PTXCmpMode = + getPTXCmpMode(*cast(N->getOperand(2)), useF32FTZ()); + SDLoc DL(N); + SDNode *SetP = CurDAG->getMachineNode( + NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0), + N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32)); + ReplaceNode(N, SetP); + return true; +} + +// Find all instances of extract_vector_elt that use this v2f16 vector +// and coalesce them into a scattering move instruction. +bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { + SDValue Vector = N->getOperand(0); + + // We only care about f16x2 as it's the only real vector type we + // need to deal with. + if (Vector.getSimpleValueType() != MVT::v2f16) + return false; + + // Find and record all uses of this vector that extract element 0 or 1. + SmallVector E0, E1; + for (const auto &U : Vector.getNode()->uses()) { + if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + continue; + if (U->getOperand(0) != Vector) + continue; + if (const ConstantSDNode *IdxConst = + dyn_cast(U->getOperand(1))) { + if (IdxConst->getZExtValue() == 0) + E0.push_back(U); + else if (IdxConst->getZExtValue() == 1) + E1.push_back(U); + else + llvm_unreachable("Invalid vector index."); + } + } + + // There's no point scattering f16x2 if we only ever access one + // element of it. + if (E0.empty() || E1.empty()) + return false; + + unsigned Op = NVPTX::SplitF16x2; + // If the vector has been BITCAST'ed from i32, we can use original + // value directly and avoid register-to-register move. 
+ SDValue Source = Vector; + if (Vector->getOpcode() == ISD::BITCAST) { + Op = NVPTX::SplitI32toF16x2; + Source = Vector->getOperand(0); + } + // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) + // into f16,f16 SplitF16x2(V) + SDNode *ScatterOp = + CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source); + for (auto *Node : E0) + ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0)); + for (auto *Node : E1) + ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1)); + + return true; +} + static unsigned int getCodeAddrSpace(MemSDNode *N) { const Value *Src = N->getMemOperand()->getValue(); @@ -689,29 +818,26 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) isVolatile = false; - // Vector Setting - MVT SimpleVT = LoadedVT.getSimpleVT(); - unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; - if (SimpleVT.isVector()) { - unsigned num = SimpleVT.getVectorNumElements(); - if (num == 2) - vecType = NVPTX::PTXLdStInstCode::V2; - else if (num == 4) - vecType = NVPTX::PTXLdStInstCode::V4; - else - return false; - } - // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the // type is integer // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float + MVT SimpleVT = LoadedVT.getSimpleVT(); MVT ScalarVT = SimpleVT.getScalarType(); // Read at least 8 bits (predicates are stored as 8-bit values) unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits()); unsigned int fromType; + + // Vector Setting + unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + if (SimpleVT.isVector()) { + assert(LoadedVT == MVT::v2f16 && "Unexpected vector type"); + // v2f16 is loaded using ld.b32 + fromTypeWidth = 32; + } + if ((LD->getExtensionType() == ISD::SEXTLOAD)) fromType = NVPTX::PTXLdStInstCode::Signed; else if (ScalarVT.isFloatingPoint()) @@ -746,6 +872,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case MVT::f16: Opcode = NVPTX::LD_f16_avar; 
break; + case MVT::v2f16: + Opcode = NVPTX::LD_f16x2_avar; + break; case MVT::f32: Opcode = NVPTX::LD_f32_avar; break; @@ -777,6 +906,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case MVT::f16: Opcode = NVPTX::LD_f16_asi; break; + case MVT::v2f16: + Opcode = NVPTX::LD_f16x2_asi; + break; case MVT::f32: Opcode = NVPTX::LD_f32_asi; break; @@ -809,6 +941,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case MVT::f16: Opcode = NVPTX::LD_f16_ari_64; break; + case MVT::v2f16: + Opcode = NVPTX::LD_f16x2_ari_64; + break; case MVT::f32: Opcode = NVPTX::LD_f32_ari_64; break; @@ -835,6 +970,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case MVT::f16: Opcode = NVPTX::LD_f16_ari; break; + case MVT::v2f16: + Opcode = NVPTX::LD_f16x2_ari; + break; case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; @@ -867,6 +1005,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case MVT::f16: Opcode = NVPTX::LD_f16_areg_64; break; + case MVT::v2f16: + Opcode = NVPTX::LD_f16x2_areg_64; + break; case MVT::f32: Opcode = NVPTX::LD_f32_areg_64; break; @@ -893,6 +1034,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case MVT::f16: Opcode = NVPTX::LD_f16_areg; break; + case MVT::v2f16: + Opcode = NVPTX::LD_f16x2_areg; + break; case MVT::f32: Opcode = NVPTX::LD_f32_areg; break; @@ -968,7 +1112,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { if (ExtensionType == ISD::SEXTLOAD) FromType = NVPTX::PTXLdStInstCode::Signed; else if (ScalarVT.isFloatingPoint()) - FromType = NVPTX::PTXLdStInstCode::Float; + FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped + : NVPTX::PTXLdStInstCode::Float; else FromType = NVPTX::PTXLdStInstCode::Unsigned; @@ -987,6 +1132,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { EVT EltVT = N->getValueType(0); + // v8f16 is a special case. PTX doesn't have ld.v8.f16 + // instruction. Instead, we split the vector into v2f16 chunks and + // load them with ld.v4.b32. 
+ if (EltVT == MVT::v2f16) { + assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode."); + EltVT = MVT::i32; + FromType = NVPTX::PTXLdStInstCode::Untyped; + FromTypeWidth = 32; + } + if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: @@ -1007,6 +1162,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::LDV_i64_v2_avar; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v2_avar; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v2_avar; break; @@ -1028,6 +1186,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::LDV_i32_v4_avar; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v4_avar; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v4_avar; break; @@ -1060,6 +1221,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::LDV_i64_v2_asi; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v2_asi; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v2_asi; break; @@ -1081,6 +1245,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::LDV_i32_v4_asi; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v4_asi; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v4_asi; break; @@ -1114,6 +1281,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::LDV_i64_v2_ari_64; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v2_ari_64; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v2_ari_64; break; @@ -1135,6 +1305,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::LDV_i32_v4_ari_64; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v4_ari_64; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v4_ari_64; break; @@ -1161,6 +1334,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::LDV_i64_v2_ari; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v2_ari; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v2_ari; break; @@ -1182,6 +1358,9 
@@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::LDV_i32_v4_ari; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v4_ari; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v4_ari; break; @@ -1216,6 +1395,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::LDV_i64_v2_areg_64; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v2_areg_64; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v2_areg_64; break; @@ -1237,6 +1419,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::LDV_i32_v4_areg_64; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v4_areg_64; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v4_areg_64; break; @@ -1263,6 +1448,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::LDV_i64_v2_areg; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v2_areg; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v2_areg; break; @@ -1284,6 +1472,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::LDV_i32_v4_areg; break; + case MVT::f16: + Opcode = NVPTX::LDV_f16_v4_areg; + break; case MVT::f32: Opcode = NVPTX::LDV_f32_v4_areg; break; @@ -2151,21 +2342,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; - if (SimpleVT.isVector()) { - unsigned num = SimpleVT.getVectorNumElements(); - if (num == 2) - vecType = NVPTX::PTXLdStInstCode::V2; - else if (num == 4) - vecType = NVPTX::PTXLdStInstCode::V4; - else - return false; - } // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' // MVT ScalarVT = SimpleVT.getScalarType(); unsigned toTypeWidth = ScalarVT.getSizeInBits(); + if (SimpleVT.isVector()) { + assert(StoreVT == MVT::v2f16 && "Unexpected vector type"); + // v2f16 is stored using st.b32 + toTypeWidth = 32; + } + unsigned int toType; if (ScalarVT.isFloatingPoint()) // f16 uses 
.b16 as its storage type. @@ -2200,6 +2388,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case MVT::f16: Opcode = NVPTX::ST_f16_avar; break; + case MVT::v2f16: + Opcode = NVPTX::ST_f16x2_avar; + break; case MVT::f32: Opcode = NVPTX::ST_f32_avar; break; @@ -2232,6 +2423,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case MVT::f16: Opcode = NVPTX::ST_f16_asi; break; + case MVT::v2f16: + Opcode = NVPTX::ST_f16x2_asi; + break; case MVT::f32: Opcode = NVPTX::ST_f32_asi; break; @@ -2265,6 +2459,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case MVT::f16: Opcode = NVPTX::ST_f16_ari_64; break; + case MVT::v2f16: + Opcode = NVPTX::ST_f16x2_ari_64; + break; case MVT::f32: Opcode = NVPTX::ST_f32_ari_64; break; @@ -2291,6 +2488,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case MVT::f16: Opcode = NVPTX::ST_f16_ari; break; + case MVT::v2f16: + Opcode = NVPTX::ST_f16x2_ari; + break; case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; @@ -2324,6 +2524,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case MVT::f16: Opcode = NVPTX::ST_f16_areg_64; break; + case MVT::v2f16: + Opcode = NVPTX::ST_f16x2_areg_64; + break; case MVT::f32: Opcode = NVPTX::ST_f32_areg_64; break; @@ -2350,6 +2553,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case MVT::f16: Opcode = NVPTX::ST_f16_areg; break; + case MVT::v2f16: + Opcode = NVPTX::ST_f16x2_areg; + break; case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; @@ -2411,7 +2617,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned ToTypeWidth = ScalarVT.getSizeInBits(); unsigned ToType; if (ScalarVT.isFloatingPoint()) - ToType = NVPTX::PTXLdStInstCode::Float; + ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped + : NVPTX::PTXLdStInstCode::Float; else ToType = NVPTX::PTXLdStInstCode::Unsigned; @@ -2438,6 +2645,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return false; } + // v8f16 is a special case. PTX doesn't have st.v8.f16 + // instruction. 
Instead, we split the vector into v2f16 chunks and + // store them with st.v4.b32. + if (EltVT == MVT::v2f16) { + assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode."); + EltVT = MVT::i32; + ToType = NVPTX::PTXLdStInstCode::Untyped; + ToTypeWidth = 32; + } + StOps.push_back(getI32Imm(IsVolatile, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); @@ -2464,6 +2681,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::STV_i64_v2_avar; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v2_avar; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v2_avar; break; @@ -2513,6 +2733,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::STV_i64_v2_asi; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v2_asi; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v2_asi; break; @@ -2534,6 +2757,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::STV_i32_v4_asi; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v4_asi; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v4_asi; break; @@ -2564,6 +2790,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::STV_i64_v2_ari_64; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v2_ari_64; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v2_ari_64; break; @@ -2585,6 +2814,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::STV_i32_v4_ari_64; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v4_ari_64; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v4_ari_64; break; @@ -2611,6 +2843,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::STV_i64_v2_ari; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v2_ari; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v2_ari; break; @@ -2632,6 +2867,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i32: Opcode = 
NVPTX::STV_i32_v4_ari; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v4_ari; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v4_ari; break; @@ -2662,6 +2900,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::STV_i64_v2_areg_64; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v2_areg_64; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v2_areg_64; break; @@ -2683,6 +2924,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::STV_i32_v4_areg_64; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v4_areg_64; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v4_areg_64; break; @@ -2709,6 +2953,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i64: Opcode = NVPTX::STV_i64_v2_areg; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v2_areg; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v2_areg; break; @@ -2730,6 +2977,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case MVT::i32: Opcode = NVPTX::STV_i32_v4_areg; break; + case MVT::f16: + Opcode = NVPTX::STV_f16_v4_areg; + break; case MVT::f32: Opcode = NVPTX::STV_f32_v4_areg; break; @@ -2804,6 +3054,9 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { case MVT::f16: Opc = NVPTX::LoadParamMemF16; break; + case MVT::v2f16: + Opc = NVPTX::LoadParamMemF16x2; + break; case MVT::f32: Opc = NVPTX::LoadParamMemF32; break; @@ -2831,6 +3084,12 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { case MVT::i64: Opc = NVPTX::LoadParamMemV2I64; break; + case MVT::f16: + Opc = NVPTX::LoadParamMemV2F16; + break; + case MVT::v2f16: + Opc = NVPTX::LoadParamMemV2F16x2; + break; case MVT::f32: Opc = NVPTX::LoadParamMemV2F32; break; @@ -2855,6 +3114,12 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { case MVT::i32: Opc = NVPTX::LoadParamMemV4I32; break; + case MVT::f16: + Opc = NVPTX::LoadParamMemV4F16; + break; + case MVT::v2f16: + Opc = NVPTX::LoadParamMemV4F16x2; + break; case MVT::f32: Opc = NVPTX::LoadParamMemV4F32; break; 
@@ -2942,6 +3207,9 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { case MVT::f16: Opcode = NVPTX::StoreRetvalF16; break; + case MVT::v2f16: + Opcode = NVPTX::StoreRetvalF16x2; + break; case MVT::f32: Opcode = NVPTX::StoreRetvalF32; break; @@ -2969,6 +3237,12 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { case MVT::i64: Opcode = NVPTX::StoreRetvalV2I64; break; + case MVT::f16: + Opcode = NVPTX::StoreRetvalV2F16; + break; + case MVT::v2f16: + Opcode = NVPTX::StoreRetvalV2F16x2; + break; case MVT::f32: Opcode = NVPTX::StoreRetvalV2F32; break; @@ -2993,6 +3267,12 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { case MVT::i32: Opcode = NVPTX::StoreRetvalV4I32; break; + case MVT::f16: + Opcode = NVPTX::StoreRetvalV4F16; + break; + case MVT::v2f16: + Opcode = NVPTX::StoreRetvalV4F16x2; + break; case MVT::f32: Opcode = NVPTX::StoreRetvalV4F32; break; @@ -3000,8 +3280,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { break; } - SDNode *Ret = - CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); + SDNode *Ret = CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); @@ -3078,6 +3357,9 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { case MVT::f16: Opcode = NVPTX::StoreParamF16; break; + case MVT::v2f16: + Opcode = NVPTX::StoreParamF16x2; + break; case MVT::f32: Opcode = NVPTX::StoreParamF32; break; @@ -3105,6 +3387,12 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { case MVT::i64: Opcode = NVPTX::StoreParamV2I64; break; + case MVT::f16: + Opcode = NVPTX::StoreParamV2F16; + break; + case MVT::v2f16: + Opcode = NVPTX::StoreParamV2F16x2; + break; case MVT::f32: Opcode = NVPTX::StoreParamV2F32; break; @@ -3129,6 +3417,12 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { case MVT::i32: Opcode = NVPTX::StoreParamV4I32; break; + case MVT::f16: + Opcode = NVPTX::StoreParamV4F16; + break; + 
case MVT::v2f16: + Opcode = NVPTX::StoreParamV4F16x2; + break; case MVT::f32: Opcode = NVPTX::StoreParamV4F32; break; diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 889575cdf7c..8fc38e7c461 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -71,6 +71,8 @@ private: bool trySurfaceIntrinsic(SDNode *N); bool tryBFE(SDNode *N); bool tryConstantFP16(SDNode *N); + bool SelectSETP_F16X2(SDNode *N); + bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 27d9f34850c..c2877c34f63 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -146,6 +146,9 @@ static bool IsPTXVectorType(MVT VT) { case MVT::v2i32: case MVT::v4i32: case MVT::v2i64: + case MVT::v2f16: + case MVT::v4f16: + case MVT::v8f16: // <4 x f16x2> case MVT::v2f32: case MVT::v4f32: case MVT::v2f64: @@ -170,13 +173,24 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { EVT VT = TempVTs[i]; uint64_t Off = TempOffsets[i]; - if (VT.isVector()) - for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) { - ValueVTs.push_back(VT.getVectorElementType()); + // Split vectors into individual elements, except for v2f16, which + // we will pass as a single scalar. + if (VT.isVector()) { + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + // Vectors with an even number of f16 elements will be passed to + // us as an array of v2f16 elements. We must match this so we + // stay in sync with Ins/Outs. 
+ if (EltVT == MVT::f16 && NumElts % 2 == 0) { + EltVT = MVT::v2f16; + NumElts /= 2; + } + for (unsigned j = 0; j != NumElts; ++j) { + ValueVTs.push_back(EltVT); if (Offsets) - Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize()); + Offsets->push_back(Off + j * EltVT.getStoreSize()); } - else { + } else { ValueVTs.push_back(VT); if (Offsets) Offsets->push_back(Off); @@ -331,6 +345,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, else setSchedulingPreference(Sched::Source); + auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, + LegalizeAction NoF16Action) { + setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); + }; + addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); @@ -338,13 +357,20 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass); + addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass); + + // Conversion to/from FP16/FP16x2 is always legal. + setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::SETCC, MVT::f16, - STI.allowFP16Math() ? Legal : Promote); + setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); + setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); // Operations not directly supported by NVPTX. - setOperationAction(ISD::SELECT_CC, MVT::f16, - STI.allowFP16Math() ? 
Expand : Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); @@ -352,8 +378,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::BR_CC, MVT::f16, - STI.allowFP16Math() ? Expand : Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v2f16, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -493,58 +519,53 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine(ISD::SREM); setTargetDAGCombine(ISD::UREM); - if (!STI.allowFP16Math()) { - // Promote fp16 arithmetic if fp16 hardware isn't available or the - // user passed --nvptx-no-fp16-math. The flag is useful because, - // although sm_53+ GPUs have some sort of FP16 support in - // hardware, only sm_53 and sm_60 have full implementation. Others - // only have token amount of hardware and are likely to run faster - // by using fp32 units instead. - setOperationAction(ISD::FADD, MVT::f16, Promote); - setOperationAction(ISD::FMUL, MVT::f16, Promote); - setOperationAction(ISD::FSUB, MVT::f16, Promote); - setOperationAction(ISD::FMA, MVT::f16, Promote); + // setcc for f16x2 needs special handling to prevent legalizer's + // attempt to scalarize it due to v2i1 not being legal. + if (STI.allowFP16Math()) + setTargetDAGCombine(ISD::SETCC); + + // Promote fp16 arithmetic if fp16 hardware isn't available or the + // user passed --nvptx-no-fp16-math. 
The flag is useful because, + // although sm_53+ GPUs have some sort of FP16 support in + // hardware, only sm_53 and sm_60 have full implementation. Others + // only have token amount of hardware and are likely to run faster + // by using fp32 units instead. + for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { + setFP16OperationAction(Op, MVT::f16, Legal, Promote); + setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); } - // There's no neg.f16 instruction. + + // There's no neg.f16 instruction. Expand to (0-x). setOperationAction(ISD::FNEG, MVT::f16, Expand); + setOperationAction(ISD::FNEG, MVT::v2f16, Expand); + + // (would be) Library functions. - // Library functions. These default to Expand, but we have instructions - // for them. - setOperationAction(ISD::FCEIL, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + // These map to conversion instructions for scalar FP types. 
+ for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, + ISD::FROUND, ISD::FTRUNC}) { + setOperationAction(Op, MVT::f16, Legal); + setOperationAction(Op, MVT::f32, Legal); + setOperationAction(Op, MVT::f64, Legal); + setOperationAction(Op, MVT::v2f16, Expand); + } // 'Expand' implements FCOPYSIGN without calling an external library. setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - // FP16 does not support these nodes in hardware, but we can perform - // these ops using single-precision hardware. - setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FSQRT, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FABS, MVT::f16, Promote); + // These map to corresponding instructions for f32/f64. f16 must be + // promoted to f32. v2f16 is expanded to f16, which is then promoted + // to f32. 
+ for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, + ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { + setOperationAction(Op, MVT::f16, Promote); + setOperationAction(Op, MVT::f32, Legal); + setOperationAction(Op, MVT::f64, Legal); + setOperationAction(Op, MVT::v2f16, Expand); + } setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); setOperationAction(ISD::FMINNAN, MVT::f16, Promote); @@ -660,6 +681,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::FUN_SHFR_CLAMP"; case NVPTXISD::IMAD: return "NVPTXISD::IMAD"; + case NVPTXISD::SETP_F16X2: + return "NVPTXISD::SETP_F16X2"; case NVPTXISD::Dummy: return "NVPTXISD::Dummy"; case NVPTXISD::MUL_WIDE_SIGNED: @@ -1158,7 +1181,8 @@ TargetLoweringBase::LegalizeTypeAction NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) return TypeSplitVector; - + if (VT == MVT::v2f16) + return TypeLegal; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1723,7 +1747,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool ExtendIntegerRetVal = RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { bool needTruncate = false; EVT TheLoadType = VTs[i]; EVT EltType = Ins[i].VT; @@ -1765,11 +1789,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, llvm_unreachable("Invalid vector info."); } - SDValue VectorOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), - InFlag}; + SDValue LoadOperands[] = { + Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), VectorOps, TheLoadType, + Op, dl, 
DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, MachinePointerInfo(), EltAlign); for (unsigned j = 0; j < NumElts; ++j) { @@ -1823,6 +1847,55 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +// We can init constant f16x2 with a single .b32 move. Normally it +// would get lowered as two constant loads and vector-packing move. +// mov.b16 %h1, 0x4000; +// mov.b16 %h2, 0x3C00; +// mov.b32 %hh2, {%h2, %h1}; +// Instead we want just a constant move: +// mov.b32 %hh2, 0x40003C00 +// +// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 +// generates good SASS in both cases. +SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + //return Op; + if (!(Op->getValueType(0) == MVT::v2f16 && + isa(Op->getOperand(0)) && + isa(Op->getOperand(1)))) + return Op; + + APInt E0 = + cast(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); + APInt E1 = + cast(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); + SDValue Const = + DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); + return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); +} + +SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Index = Op->getOperand(1); + // Constant index will be matched by tablegen. + if (isa(Index.getNode())) + return Op; + + // Extract individual elements and select one of them. 
+ SDValue Vector = Op->getOperand(0); + EVT VectorVT = Vector.getValueType(); + assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); + EVT EltVT = VectorVT.getVectorElementType(); + + SDLoc dl(Op.getNode()); + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, + DAG.getIntPtrConstant(0, dl)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, + DAG.getIntPtrConstant(1, dl)); + return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, + ISD::CondCode::SETEQ); +} + /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift /// amount, or @@ -1956,8 +2029,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_W_CHAIN: return Op; case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return Op; + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::STORE: @@ -2054,12 +2130,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { case MVT::v2i16: case MVT::v2i32: case MVT::v2i64: + case MVT::v2f16: case MVT::v2f32: case MVT::v2f64: case MVT::v4i8: case MVT::v4i16: case MVT::v4i32: + case MVT::v4f16: case MVT::v4f32: + case MVT::v8f16: // <4 x f16x2> // This is a "native" vector type break; } @@ -2090,6 +2169,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { if (EltVT.getSizeInBits() < 16) NeedExt = true; + bool StoreF16x2 = false; switch (NumElts) { default: return SDValue(); @@ -2099,6 +2179,14 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { case 4: Opcode = NVPTXISD::StoreV4; break; + case 8: + // v8f16 is a special case. PTX doesn't have st.v8.f16 + // instruction. Instead, we split the vector into v2f16 chunks and + // store them with st.v4.b32. 
+ assert(EltVT == MVT::f16 && "Wrong type for the vector."); + Opcode = NVPTXISD::StoreV4; + StoreF16x2 = true; + break; } SmallVector Ops; @@ -2106,23 +2194,36 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { // First is the chain Ops.push_back(N->getOperand(0)); - // Then the split values - for (unsigned i = 0; i < NumElts; ++i) { - SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, - DAG.getIntPtrConstant(i, DL)); - if (NeedExt) - ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); - Ops.push_back(ExtVal); + if (StoreF16x2) { + // Combine f16,f16 -> v2f16 + NumElts /= 2; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, + DAG.getIntPtrConstant(i * 2, DL)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, + DAG.getIntPtrConstant(i * 2 + 1, DL)); + SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); + Ops.push_back(V2); + } + } else { + // Then the split values + for (unsigned i = 0; i < NumElts; ++i) { + SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(i, DL)); + if (NeedExt) + ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); + Ops.push_back(ExtVal); + } } // Then any remaining arguments Ops.append(N->op_begin() + 2, N->op_end()); - SDValue NewSt = DAG.getMemIntrinsicNode( - Opcode, DL, DAG.getVTList(MVT::Other), Ops, - MemSD->getMemoryVT(), MemSD->getMemOperand()); + SDValue NewSt = + DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); - //return DCI.CombineTo(N, NewSt, true); + // return DCI.CombineTo(N, NewSt, true); return NewSt; } @@ -2282,7 +2383,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SmallVector VTs; SmallVector Offsets; ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); - assert(VTs.size() > 0 && "empty aggregate type not expected"); + assert(VTs.size() > 0 
&& "Unexpected empty type."); auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); @@ -2299,7 +2400,15 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( unsigned NumElts = parti - VecIdx + 1; EVT EltVT = VTs[parti]; // i1 is loaded/stored as i8. - EVT LoadVT = EltVT == MVT::i1 ? MVT::i8 : EltVT; + EVT LoadVT = EltVT; + if (EltVT == MVT::i1) + LoadVT = MVT::i8; + else if (EltVT == MVT::v2f16) + // getLoad needs a vector type, but it can't handle + // vectors which contain v2f16 elements. So we must load + // using i32 here and then bitcast back. + LoadVT = MVT::i32; + EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); SDValue VecAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, @@ -2319,15 +2428,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // We've loaded i1 as an i8 and now must truncate it back to i1 if (EltVT == MVT::i1) Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); - // Extend the element if necesary (e.g an i8 is loaded + // v2f16 was loaded as an i32. Now we must bitcast it back. + else if (EltVT == MVT::v2f16) + Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); + // Extend the element if necesary (e.g. an i8 is loaded // into an i16 register) - if (Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { + if (Ins[InsIdx].VT.isInteger() && + Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); } InVals.push_back(Elt); } + // Reset vector tracking state. VecIdx = -1; } @@ -2399,7 +2513,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; SmallVector StoreOperands; - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { // New load/store. Record chain and offset operands. 
if (VectorInfo[i] & PVF_FIRST) { assert(StoreOperands.empty() && "Orphaned operand list."); @@ -4168,6 +4282,27 @@ static SDValue PerformSHLCombine(SDNode *N, return SDValue(); } +static SDValue PerformSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT CCType = N->getValueType(0); + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + + if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16) + return SDValue(); + + SDLoc DL(N); + // setp.f16x2 returns two scalar predicates, which we need to + // convert back to v2i1. The returned result will be scalarized by + // the legalizer, but the comparison will remain a single vector + // instruction. + SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL, + DCI.DAG.getVTList(MVT::i1, MVT::i1), + {A, B, N->getOperand(2)}); + return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), + CCNode.getValue(1)); +} + SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); @@ -4185,6 +4320,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::UREM: case ISD::SREM: return PerformREMCombine(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI); } return SDValue(); } @@ -4208,12 +4345,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, case MVT::v2i16: case MVT::v2i32: case MVT::v2i64: + case MVT::v2f16: case MVT::v2f32: case MVT::v2f64: case MVT::v4i8: case MVT::v4i16: case MVT::v4i32: + case MVT::v4f16: case MVT::v4f32: + case MVT::v8f16: // <4 x f16x2> // This is a "native" vector type break; } @@ -4247,6 +4387,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, unsigned Opcode = 0; SDVTList LdResVTs; + bool LoadF16x2 = false; switch (NumElts) { default: @@ -4261,6 +4402,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, LdResVTs = DAG.getVTList(ListVTs); break; } + case 8: { + // v8f16 is a special case. 
PTX doesn't have ld.v8.f16 + // instruction. Instead, we split the vector into v2f16 chunks and + // load them with ld.v4.b32. + assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); + LoadF16x2 = true; + Opcode = NVPTXISD::LoadV4; + EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, + MVT::Other}; + LdResVTs = DAG.getVTList(ListVTs); + break; + } } // Copy regular operands @@ -4274,13 +4427,26 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, LD->getMemoryVT(), LD->getMemOperand()); - SmallVector ScalarRes; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Res = NewLD.getValue(i); - if (NeedTrunc) - Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); - ScalarRes.push_back(Res); + SmallVector ScalarRes; + if (LoadF16x2) { + // Split v2f16 subvectors back into individual elements. + NumElts /= 2; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue SubVector = NewLD.getValue(i); + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, + DAG.getIntPtrConstant(0, DL)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, + DAG.getIntPtrConstant(1, DL)); + ScalarRes.push_back(E0); + ScalarRes.push_back(E1); + } + } else { + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } } SDValue LoadChain = NewLD.getValue(NumElts); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index ab78d8a936b..9d7b70d80c1 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -56,6 +56,7 @@ enum NodeType : unsigned { MUL_WIDE_SIGNED, MUL_WIDE_UNSIGNED, IMAD, + SETP_F16X2, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -73,7 +74,7 @@ enum NodeType : unsigned { StoreParamV2, StoreParamV4, StoreParamS32, // to sext and store a <32bit value, not used currently - 
StoreParamU32, // to zext and store a <32bit value, not used currently + StoreParamU32, // to zext and store a <32bit value, not used currently StoreRetval, StoreRetvalV2, StoreRetvalV4, @@ -549,14 +550,15 @@ private: const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTOREf16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 67e6e252eb9..3026f0be242 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -55,6 +55,8 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (DestRC == &NVPTX::Float16RegsRegClass) { Op = (SrcRC == &NVPTX::Float16RegsRegClass ? NVPTX::FMOV16rr : NVPTX::BITCONVERT_16_I2F); + } else if (DestRC == &NVPTX::Float16x2RegsRegClass) { + Op = NVPTX::IMOV32rr; } else if (DestRC == &NVPTX::Float32RegsRegClass) { Op = (SrcRC == &NVPTX::Float32RegsRegClass ? 
NVPTX::FMOV32rr : NVPTX::BITCONVERT_32_I2F); diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 3345ce8d3cb..13d86d31c04 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -102,6 +102,9 @@ def CmpNAN_FTZ : PatLeaf<(i32 0x111)>; def CmpMode : Operand { let PrintMethod = "printCmpMode"; } +def VecElement : Operand { + let PrintMethod = "printVecElement"; +} //===----------------------------------------------------------------------===// // NVPTX Instruction Predicate Definitions @@ -305,6 +308,19 @@ multiclass F3_fma_component { [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, Requires<[useFP16Math, allowFMA]>; + def f16x2rr_ftz : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, allowFMA, doF32FTZ]>; + def f16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, allowFMA]>; + // These have strange names so we don't perturb existing mir tests. 
def _rnf64rr : NVPTXInst<(outs Float64Regs:$dst), @@ -354,6 +370,18 @@ multiclass F3_fma_component { !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"), [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, Requires<[useFP16Math, noFMA]>; + def _rnf16x2rr_ftz : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, noFMA, doF32FTZ]>; + def _rnf16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, noFMA]>; } // Template for operations which take two f32 or f64 operands. Provides three @@ -991,15 +1019,17 @@ multiclass FMA Requires<[Pred]>; } -multiclass FMA_F16 { +multiclass FMA_F16 { def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, Requires<[useFP16Math, Pred]>; } -defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, f16imm, doF32FTZ>; -defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, f16imm, true>; +defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>; +defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>; +defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>; +defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>; defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>; defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>; @@ -1390,9 +1420,17 @@ defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; def SETP_f16rr : NVPTXInst<(outs Int1Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp), - "setp${cmp:base}${cmp:ftz}.f16 $dst, $a, $b;", + 
"setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;", []>, Requires<[useFP16Math]>; +def SETP_f16x2rr : + NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), + (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp), + "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;", + []>, + Requires<[useFP16Math]>; + + // FIXME: This doesn't appear to be correct. The "set" mnemonic has the form // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination // reg, either u32, s32, or f32. Anyway these aren't used at the moment. @@ -1488,6 +1526,13 @@ defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>; defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>; defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>; +def SELP_f16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Float16x2Regs:$dst, + (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>; + //----------------------------------- // Data Movement (Load / Store, Move) //----------------------------------- @@ -2061,10 +2106,15 @@ def LoadParamMemV4I32 : LoadParamV4MemInst; def LoadParamMemV4I16 : LoadParamV4MemInst; def LoadParamMemV4I8 : LoadParamV4MemInst; def LoadParamMemF16 : LoadParamMemInst; +def LoadParamMemF16x2 : LoadParamMemInst; def LoadParamMemF32 : LoadParamMemInst; def LoadParamMemF64 : LoadParamMemInst; +def LoadParamMemV2F16 : LoadParamV2MemInst; +def LoadParamMemV2F16x2: LoadParamV2MemInst; def LoadParamMemV2F32 : LoadParamV2MemInst; def LoadParamMemV2F64 : LoadParamV2MemInst; +def LoadParamMemV4F16 : LoadParamV4MemInst; +def LoadParamMemV4F16x2: LoadParamV4MemInst; def LoadParamMemV4F32 : LoadParamV4MemInst; def StoreParamI64 : StoreParamInst; @@ -2082,10 +2132,15 @@ def StoreParamV4I16 : StoreParamV4Inst; def StoreParamV4I8 : StoreParamV4Inst; def StoreParamF16 : StoreParamInst; +def StoreParamF16x2 : StoreParamInst; def StoreParamF32 : StoreParamInst; def StoreParamF64 : 
StoreParamInst; +def StoreParamV2F16 : StoreParamV2Inst; +def StoreParamV2F16x2 : StoreParamV2Inst; def StoreParamV2F32 : StoreParamV2Inst; def StoreParamV2F64 : StoreParamV2Inst; +def StoreParamV4F16 : StoreParamV4Inst; +def StoreParamV4F16x2 : StoreParamV4Inst; def StoreParamV4F32 : StoreParamV4Inst; def StoreRetvalI64 : StoreRetvalInst; @@ -2103,9 +2158,14 @@ def StoreRetvalV4I8 : StoreRetvalV4Inst; def StoreRetvalF64 : StoreRetvalInst; def StoreRetvalF32 : StoreRetvalInst; def StoreRetvalF16 : StoreRetvalInst; +def StoreRetvalF16x2 : StoreRetvalInst; def StoreRetvalV2F64 : StoreRetvalV2Inst; def StoreRetvalV2F32 : StoreRetvalV2Inst; +def StoreRetvalV2F16 : StoreRetvalV2Inst; +def StoreRetvalV2F16x2: StoreRetvalV2Inst; def StoreRetvalV4F32 : StoreRetvalV4Inst; +def StoreRetvalV4F16 : StoreRetvalV4Inst; +def StoreRetvalV4F16x2: StoreRetvalV4Inst; def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; @@ -2252,6 +2312,7 @@ let mayLoad=1, hasSideEffects=0 in { defm LD_i32 : LD; defm LD_i64 : LD; defm LD_f16 : LD; + defm LD_f16x2 : LD; defm LD_f32 : LD; defm LD_f64 : LD; } @@ -2301,6 +2362,7 @@ let mayStore=1, hasSideEffects=0 in { defm ST_i32 : ST; defm ST_i64 : ST; defm ST_f16 : ST; + defm ST_f16x2 : ST; defm ST_f32 : ST; defm ST_f64 : ST; } @@ -2387,6 +2449,7 @@ let mayLoad=1, hasSideEffects=0 in { defm LDV_i16 : LD_VEC; defm LDV_i32 : LD_VEC; defm LDV_i64 : LD_VEC; + defm LDV_f16 : LD_VEC; defm LDV_f32 : LD_VEC; defm LDV_f64 : LD_VEC; } @@ -2480,17 +2543,18 @@ let mayStore=1, hasSideEffects=0 in { defm STV_i16 : ST_VEC; defm STV_i32 : ST_VEC; defm STV_i64 : ST_VEC; + defm STV_f16 : ST_VEC; + defm STV_f16x2 : ST_VEC; defm STV_f32 : ST_VEC; defm STV_f64 : ST_VEC; } - //---- Conversion ---- class F_BITCONVERT : NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), - !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")), + !strconcat("mov.b", !strconcat(SzStr, " \t$d, 
$a;")), [(set regclassOut:$d, (bitconvert regclassIn:$a))]>; def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>; @@ -2499,6 +2563,8 @@ def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>; def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>; def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>; +def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>; +def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>; // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where // we cannot specify floating-point literals in isel patterns. Therefore, we @@ -2741,6 +2807,9 @@ def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b), def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), (SELP_b64rr Int64Regs:$a, Int64Regs:$b, (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b), + (SELP_f16rr Float16Regs:$a, Float16Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), (SELP_f32rr Float32Regs:$a, Float32Regs:$b, (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; @@ -2779,6 +2848,49 @@ let hasSideEffects = 0 in { def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), (ins Float64Regs:$s), "mov.b64 \t{{$d1, $d2}}, $s;", []>; + +} + +let hasSideEffects = 0 in { + // Extract element of f16x2 register. PTX does not provide any way + // to access elements of f16x2 vector directly, so we need to + // extract it using a temporary register. 
+ def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst), + (ins Float16x2Regs:$src), + "{{ .reg .b16 \t%tmp_hi;\n\t" + " mov.b32 \t{$dst, %tmp_hi}, $src; }}", + [(set Float16Regs:$dst, + (extractelt (v2f16 Float16x2Regs:$src), 0))]>; + def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst), + (ins Float16x2Regs:$src), + "{{ .reg .b16 \t%tmp_lo;\n\t" + " mov.b32 \t{%tmp_lo, $dst}, $src; }}", + [(set Float16Regs:$dst, + (extractelt (v2f16 Float16x2Regs:$src), 1))]>; + + // Coalesce two f16 registers into f16x2 + def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + "mov.b32 \t$dst, {{$a, $b}};", + [(set Float16x2Regs:$dst, + (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>; + + // Directly initializing the underlying b32 register is one less SASS + // instruction than a vector-packing move. + def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src), + "mov.b32 \t$dst, $src;", + []>; + + // Split f16x2 into two f16 registers. 
+ def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), + (ins Float16x2Regs:$src), + "mov.b32 \t{{$lo, $hi}}, $src;", + []>; + // Split an i32 into two f16 + def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), + (ins Int32Regs:$src), + "mov.b32 \t{{$lo, $hi}}, $src;", + []>; } // Count leading zeros diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 509a253d24a..39779e8dfa9 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1606,6 +1606,10 @@ defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64 \t$result, [$src];", Int64Regs>; +defm INT_PTX_LDG_GLOBAL_f16 + : LDG_G<"b16 \t$result, [$src];", Float16Regs>; +defm INT_PTX_LDG_GLOBAL_f16x2 + : LDG_G<"b32 \t$result, [$src];", Float16x2Regs>; defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32 \t$result, [$src];", Float32Regs>; defm INT_PTX_LDG_GLOBAL_f64 @@ -1661,6 +1665,8 @@ defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>; +defm INT_PTX_LDG_G_v4f16_ELE + : VLDG_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>; defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>; defm INT_PTX_LDG_G_v2i64_ELE @@ -1673,6 +1679,8 @@ defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>; +defm INT_PTX_LDG_G_v8f16_ELE + : VLDG_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16x2Regs>; defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 
9caedfb0fef..8d46694fbe5 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -35,6 +35,8 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) { // accepted for all supported fp16 instructions on all GPU // variants, so we can use them instead. return ".b16"; + if (RC == &NVPTX::Float16x2RegsRegClass) + return ".b32"; if (RC == &NVPTX::Float64RegsRegClass) return ".f64"; if (RC == &NVPTX::Int64RegsRegClass) @@ -73,6 +75,8 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%f"; if (RC == &NVPTX::Float16RegsRegClass) return "%h"; + if (RC == &NVPTX::Float16x2RegsRegClass) + return "%hh"; if (RC == &NVPTX::Float64RegsRegClass) return "%fd"; if (RC == &NVPTX::Int64RegsRegClass) diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index fd255bdb6d2..f04764a9e9a 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -37,6 +37,7 @@ foreach i = 0-4 in { def R#i : NVPTXReg<"%r"#i>; // 32-bit def RL#i : NVPTXReg<"%rd"#i>; // 64-bit def H#i : NVPTXReg<"%h"#i>; // 16-bit float + def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float def F#i : NVPTXReg<"%f"#i>; // 32-bit float def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float @@ -59,6 +60,7 @@ def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>; def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>; def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>; def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>; +def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>; def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>; def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>; diff --git a/test/CodeGen/NVPTX/LoadStoreVectorizer.ll 
b/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index 1a4b0bad36e..e84030f385c 100644 --- a/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -15,3 +15,37 @@ define i32 @f(i32* %p) { %sum = add i32 %v0, %v1 ret i32 %sum } + +define half @fh(half* %p) { + %p.1 = getelementptr half, half* %p, i32 1 + %p.2 = getelementptr half, half* %p, i32 2 + %p.3 = getelementptr half, half* %p, i32 3 + %p.4 = getelementptr half, half* %p, i32 4 + %v0 = load half, half* %p, align 64 + %v1 = load half, half* %p.1, align 4 + %v2 = load half, half* %p.2, align 4 + %v3 = load half, half* %p.3, align 4 + %v4 = load half, half* %p.4, align 4 + %sum1 = fadd half %v0, %v1 + %sum2 = fadd half %v2, %v3 + %sum3 = fadd half %sum1, %sum2 + %sum = fadd half %sum3, %v4 + ret half %sum +} + +define float @ff(float* %p) { + %p.1 = getelementptr float, float* %p, i32 1 + %p.2 = getelementptr float, float* %p, i32 2 + %p.3 = getelementptr float, float* %p, i32 3 + %p.4 = getelementptr float, float* %p, i32 4 + %v0 = load float, float* %p, align 64 + %v1 = load float, float* %p.1, align 4 + %v2 = load float, float* %p.2, align 4 + %v3 = load float, float* %p.3, align 4 + %v4 = load float, float* %p.4, align 4 + %sum1 = fadd float %v0, %v1 + %sum2 = fadd float %v2, %v3 + %sum3 = fadd float %sum1, %sum2 + %sum = fadd float %sum3, %v4 + ret float %sum +} diff --git a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll index 5fce2961127..3d39104579d 100644 --- a/test/CodeGen/NVPTX/f16-instructions.ll +++ b/test/CodeGen/NVPTX/f16-instructions.ll @@ -127,13 +127,13 @@ define half @test_fdiv(half %a, half %b) #0 { ; CHECK-LABEL: test_frem( ; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0]; ; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1]; -; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[F2:%f[0-9]+]], [[F0]], [[F1]]; -; CHECK-NEXT: 
cvt.rmi.f32.f32 [[F3:%f[0-9]+]], [[F2]]; -; CHECK-NEXT: mul.f32 [[F4:%f[0-9]+]], [[F3]], [[F1]]; -; CHECK-NEXT: sub.f32 [[F5:%f[0-9]+]], [[F0]], [[F4]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[F5]]; +; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]]; +; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]]; +; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]]; +; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]]; +; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]]; +; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]]; +; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; ; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; define half @test_frem(half %a, half %b) #0 { @@ -509,7 +509,7 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 { ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; ; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: @%p1 bra [[LABEL:LBB.*]]; +; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]]; ; CHECK: st.u32 [%[[C]]], ; CHECK: [[LABEL]]: ; CHECK: st.u32 [%[[D]]], diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll new file mode 100644 index 00000000000..2c1d5fe003a --- /dev/null +++ b/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -0,0 +1,1433 @@ +; ## Full FP16 support enabled by default. +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -disable-fp-elim \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s +; ## FP16 support explicitly disabled. +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s +; ## FP16 is not supported by hardware. 
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: -disable-post-ra -disable-fp-elim \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: test_ret_const( +; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184; +; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_ret_const() #0 { + ret <2 x half> +} + +; CHECK-LABEL: test_extract_0( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0]; +; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_extract_0(<2 x half> %a) #0 { + %e = extractelement <2 x half> %a, i32 0 + ret half %e +} + +; CHECK-LABEL: test_extract_1( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0]; +; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_extract_1(<2 x half> %a) #0 { + %e = extractelement <2 x half> %a, i32 1 + ret half %e +} + +; CHECK-LABEL: test_extract_i( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0]; +; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; +; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; +; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]]; +; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { + %e = extractelement <2 x half> %a, i64 %idx + ret half %e +} + +; CHECK-LABEL: test_fadd( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1]; +; +; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; 
CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { + %r = fadd <2 x half> %a, %b + ret <2 x half> %r +} + +; Check that we can lower fadd with immediate arguments. +; CHECK-LABEL: test_fadd_imm_0( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0]; +; +; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184; +; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]]; +; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { + %r = fadd <2 x half> <half 1.0, half 2.0>, %a + ret <2 x half> %r +} + +; CHECK-LABEL: test_fadd_imm_1( +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0]; +; +; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184; +; CHECK-F16: mov.b32
[[IHH:%hh[0-9]+]], [[I]]; +; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { + %r = fadd <2 x half> %a, <half 1.0, half 2.0> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fsub( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0]; +; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1]; +; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { + %r = fsub <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fneg( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0]; +; +; CHECK-F16: mov.u32
[[I0:%r[0-9]+]], 0; +; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]]; +; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; +; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]]; +; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fneg(<2 x half> %a) #0 { + %r = fsub <2 x half> <half 0.0, half 0.0>, %a + ret <2 x half> %r +} + +; CHECK-LABEL: test_fmul( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1]; +; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { + %r = fmul <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fdiv( +;
CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]; +; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { + %r = fdiv <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_frem( +; -- Load two 16x2 inputs and split them into f16 elements +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1]; +; -- Split into elements +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; -- promote to f32. +; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; +; -- frem(a[0],b[0]). +; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]]; +; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]]; +; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]]; +; -- frem(a[1],b[1]). 
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]]; +; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]]; +; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]]; +; -- convert back to f16. +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; -- merge into f16x2 and return it. +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { + %r = frem <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: .func test_ldst_v2f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1]; +; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]] +; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]]; +; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]}; +; CHECK: ret; +define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) { + %t1 = load <2 x half>, <2 x half>* %a + store <2 x half> %t1, <2 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v3f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1]; +; -- v3 is inconvenient to capture as it's lowered as ld.b64 plus a fair +; number of bit-shifting instructions that may change at LLVM's whim. +; So we only verify that we issue the correct number of writes at the +; correct offsets, but not the values we write.
+; CHECK-DAG: ld.u64 +; CHECK-DAG: st.u32 [%[[B]]], +; CHECK-DAG: st.b16 [%[[B]]+4], +; CHECK: ret; +define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { + %t1 = load <3 x half>, <3 x half>* %a + store <3 x half> %t1, <3 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v4f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; +; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { + %t1 = load <4 x half>, <4 x half>* %a + store <4 x half> %t1, <4 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v8f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; +; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { + %t1 = load <8 x half>, <8 x half>* %a + store <8 x half> %t1, <8 x half>* %b, align 16 + ret void +} + +declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 + +; CHECK-LABEL: test_call( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[A]]; +; CHECK-DAG: st.param.b32 [param1+0], [[B]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], 
[[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_call_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_tailcall_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_select( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; +; CHECK-DAG: 
ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { + %r = select i1 %c, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc_f32_f16( +; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], 
[[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; +; CHECK-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; +; TODO: Currently DAG combiner scalarizes setcc before we can lower it to setp.f16x2. +; We'd like to see this instruction: +; CHECK-F16-NOTYET: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; But we end up with a pair of scalar instances of it instead: +; CHECK-F16-DAG: setp.neu.f16 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-F16-DAG: setp.neu.f16 [[P1:%p[0-9]+]], [[C1]], [[D1]] + + +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, + <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +; CHECK-LABEL: test_select_cc_f16_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; +; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; +; CHECK-DAG: ld.param.v2.f32 
{[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; +; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, + <2 x float> %c, <2 x float> %d) #0 { + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fcmp_une( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp une <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ueq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; +; CHECK-DAG: ld.param.b32 
[[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; +; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ueq <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ugt( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; +; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ugt <2 x half> %a, %b + ret <2 x 
i1> %r +} + +; CHECK-LABEL: test_fcmp_uge( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; +; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uge <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ult( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; +; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], 
[[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ult <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ule( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; +; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ule <2 x half> %a, %b + ret <2 x i1> %r +} + + +; CHECK-LABEL: test_fcmp_uno( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; +; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 
[[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uno <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_one( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; +; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp one <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_oeq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; +; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: 
setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp oeq <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ogt( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1]; +; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ogt <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_oge( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1]; +; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; 
CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp oge <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_olt( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1]; +; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp olt <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ole( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; +; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]],
[[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ole <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ord( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; +; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ord <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fptosi_i32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; +; 
CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptosi_i64( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; 
CHECK: ret; +define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { + %r = uitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { + %r = uitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { + %r = sitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { + %r = sitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; +; 
CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; + +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = uitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; +; +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: 
cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = sitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xfloat( +; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { + %r = fptrunc <2 x float> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; +; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { + %r = fptrunc <2 x double> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fpext_2xfloat( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x float> + ret <2 x float> %r +} + +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], 
[[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x double> + ret <2 x double> %r +} + + +; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] +; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 +; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] +; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { + %r = bitcast <2 x half> %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( +; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; +; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; +; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; +; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { + %r = bitcast <2 x i16> %a to <2 x half> + ret <2 x half> %r +} + + +declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 +declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 
+declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 +declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 + +; CHECK-LABEL: test_sqrt( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sqrt(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_powi( +;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { +; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) +; ret <2 x half> %r +;} + +; CHECK-LABEL: test_sin( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sin(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_cos( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_cos(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { +; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_exp( +;define <2 x half> @test_exp(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x half> @test_exp2(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log( +;define <2 x half> @test_log(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x half> @test_log10(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log2( +;define <2 x half> @test_log2(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +; CHECK-LABEL: test_fma( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2]; +; +; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32
[[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} + +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret +define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { + %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + ret <2 x half> %r +} + +; CHECK-LABEL: test_fabs( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fabs(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_minnum( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; +; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) 
#0 { + %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_maxnum( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; +; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; 
CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; +; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; +; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; +; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; +; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { + %tb = fptrunc <2 x float> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f64( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; +; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; +; 
CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; +; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; +; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; +; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; +; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { + %tb = fptrunc <2 x double> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_extended( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 
[[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] +; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; +; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; +; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; +; CHECK: ret; +define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + %xr = fpext <2 x half> %r to <2 x float> + ret <2 x float> %xr +} + +; CHECK-LABEL: test_floor( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_floor(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_ceil( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_ceil(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_trunc( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; 
+; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_trunc(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_rint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_rint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_nearbyint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_nearbyint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_round( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_round(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.round.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_fmuladd( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], 
[test_fmuladd_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; +; +; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { + %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + ret <2 x half> %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll index a6defdcb478..8a67567acc9 100644 --- a/test/CodeGen/NVPTX/param-load-store.ll +++ b/test/CodeGen/NVPTX/param-load-store.ll @@ -4,9 +4,9 @@ %s_i1 = type { i1 } %s_i8 = type { i8 } %s_i16 = type { i16 } -%s_half = type { half } +%s_f16 = type { half } %s_i32 = type { i32 } -%s_float = type { float } +%s_f32 = type { float } %s_i64 = type { i64 } %s_f64 = type { double } @@ -322,22 +322,148 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) { } ; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_half( -;
CHECK-NEXT: .param .b32 test_half_param_0 -; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_half_param_0]; +; CHECK-LABEL: test_f16( +; CHECK-NEXT: .param .b32 test_f16_param_0 +; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b16 [param0+0], [[E]]; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), -; CHECK-NEXT: test_half, +; CHECK-NEXT: test_f16, ; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; ; CHECK: st.param.b16 [func_retval0+0], [[R]] ; CHECK-NEXT: ret; -define half @test_half(half %a) { - %r = tail call half @test_half(half %a); +define half @test_f16(half %a) { + %r = tail call half @test_f16(half %a); ret half %r; } +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v2f16( +; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] +; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0]; +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v2f16, +; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]] +; CHECK-NEXT: ret; +define <2 x half> @test_v2f16(<2 x half> %a) { + %r = tail call <2 x half> @test_v2f16(<2 x half> %a); + ret <2 x half> %r; +} + +; CHECK:.func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v3f16( +; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] +; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0]; +; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; +; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK: test_v3f16, +; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0]; +; 
CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; +; CHECK: ret; +define <3 x half> @test_v3f16(<3 x half> %a) { + %r = tail call <3 x half> @test_v3f16(<3 x half> %a); + ret <3 x half> %r; +} + +; CHECK:.func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v4f16( +; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] +; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; +; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; +; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]}; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK: test_v4f16, +; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]}; +; CHECK: ret; +define <4 x half> @test_v4f16(<4 x half> %a) { + %r = tail call <4 x half> @test_v4f16(<4 x half> %a); + ret <4 x half> %r; +} + +; CHECK:.func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v5f16( +; CHECK: .param .align 16 .b8 test_v5f16_param_0[16] +; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0]; +; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; +; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v4.b16 [param0+0], +; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK: test_v5f16, +; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], 
[[R3]]}; +; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; +; CHECK: ret; +define <5 x half> @test_v5f16(<5 x half> %a) { + %r = tail call <5 x half> @test_v5f16(<5 x half> %a); + ret <5 x half> %r; +} + +; CHECK:.func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v8f16( +; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] +; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; +; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; +; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; +; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]]; +; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]}; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK: test_v8f16, +; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; +; CHECK: ret; +define <8 x half> @test_v8f16(<8 x half> %a) { + %r = tail call <8 x half> @test_v8f16(<8 x half> %a); + ret <8 x half> %r; +} + +; CHECK:.func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v9f16( +; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] +; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0]; +; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8]; +; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK-DAG: st.param.v4.b16 [param0+0], +; CHECK-DAG: st.param.v4.b16 [param0+8], +; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK: test_v9f16, +; CHECK-DAG: ld.param.v4.b16 
{[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8]; +; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; +; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; +; CHECK: ret; +define <9 x half> @test_v9f16(<9 x half> %a) { + %r = tail call <9 x half> @test_v9f16(<9 x half> %a); + ret <9 x half> %r; +} + ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i32( ; CHECK-NEXT: .param .b32 test_i32_param_0 @@ -415,19 +541,19 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) { } ; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_float( -; CHECK-NEXT: .param .b32 test_float_param_0 -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_float_param_0]; +; CHECK-LABEL: test_f32( +; CHECK-NEXT: .param .b32 test_f32_param_0 +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.f32 [param0+0], [[E]]; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), -; CHECK-NEXT: test_float, +; CHECK-NEXT: test_f32, ; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; ; CHECK: st.param.f32 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; -define float @test_float(float %a) { - %r = tail call float @test_float(float %a); +define float @test_f32(float %a) { + %r = tail call float @test_f32(float %a); ret float %r; } @@ -547,20 +673,20 @@ define %s_i16 @test_s_i16(%s_i16 %a) { } ; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_half( -; CHECK-NEXT: .param .align 2 .b8 test_s_half_param_0[2] -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_half_param_0]; +; CHECK-LABEL: test_s_f16( +; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0]; ; CHECK: 
.param .align 2 .b8 param0[2]; ; CHECK: st.param.b16 [param0+0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; ; CHECK: call.uni -; CHECK-NEXT: test_s_half, +; CHECK-NEXT: test_s_f16, ; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; ; CHECK: st.param.b16 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; -define %s_half @test_s_half(%s_half %a) { - %r = tail call %s_half @test_s_half(%s_half %a); - ret %s_half %r; +define %s_f16 @test_s_f16(%s_f16 %a) { + %r = tail call %s_f16 @test_s_f16(%s_f16 %a); + ret %s_f16 %r; } ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) @@ -581,20 +707,20 @@ define %s_i32 @test_s_i32(%s_i32 %a) { } ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_float( -; CHECK-NEXT: .param .align 4 .b8 test_s_float_param_0[4] -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_float_param_0]; +; CHECK-LABEL: test_s_f32( +; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] ; CHECK: st.param.f32 [param0+0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_float, +; CHECK-NEXT: test_s_f32, ; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; ; CHECK: st.param.f32 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; -define %s_float @test_s_float(%s_float %a) { - %r = tail call %s_float @test_s_float(%s_float %a); - ret %s_float %r; +define %s_f32 @test_s_f32(%s_f32 %a) { + %r = tail call %s_f32 @test_s_f32(%s_f32 %a); + ret %s_f32 %r; } ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -- 2.11.0