case MVT::v2i32:
case MVT::v4i32:
case MVT::v2i64:
+ case MVT::v2f16:
+ case MVT::v4f16:
+ case MVT::v8f16: // <4 x f16x2>
case MVT::v2f32:
case MVT::v4f32:
case MVT::v2f64:
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
EVT VT = TempVTs[i];
uint64_t Off = TempOffsets[i];
- if (VT.isVector())
- for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
- ValueVTs.push_back(VT.getVectorElementType());
+ // Split vectors into individual elements, except for v2f16, which
+ // we will pass as a single scalar.
+ if (VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ // Vectors with an even number of f16 elements will be passed to
+ // us as an array of v2f16 elements. We must match this so we
+ // stay in sync with Ins/Outs.
+ if (EltVT == MVT::f16 && NumElts % 2 == 0) {
+ EltVT = MVT::v2f16;
+ NumElts /= 2;
+ }
+ for (unsigned j = 0; j != NumElts; ++j) {
+ ValueVTs.push_back(EltVT);
if (Offsets)
- Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+ Offsets->push_back(Off + j * EltVT.getStoreSize());
}
- else {
+ } else {
ValueVTs.push_back(VT);
if (Offsets)
Offsets->push_back(Off);
else
setSchedulingPreference(Sched::Source);
+ auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
+ LegalizeAction NoF16Action) {
+ setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
+ };
+
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
+ addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
+
+ // Conversion to/from FP16/FP16x2 is always legal.
+ setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::SETCC, MVT::f16,
- STI.allowFP16Math() ? Legal : Promote);
+ setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
+ setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
// Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::f16,
- STI.allowFP16Math() ? Expand : Promote);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f16,
- STI.allowFP16Math() ? Expand : Promote);
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);
- if (!STI.allowFP16Math()) {
- // Promote fp16 arithmetic if fp16 hardware isn't available or the
- // user passed --nvptx-no-fp16-math. The flag is useful because,
- // although sm_53+ GPUs have some sort of FP16 support in
- // hardware, only sm_53 and sm_60 have full implementation. Others
- // only have token amount of hardware and are likely to run faster
- // by using fp32 units instead.
- setOperationAction(ISD::FADD, MVT::f16, Promote);
- setOperationAction(ISD::FMUL, MVT::f16, Promote);
- setOperationAction(ISD::FSUB, MVT::f16, Promote);
- setOperationAction(ISD::FMA, MVT::f16, Promote);
+ // setcc for f16x2 needs special handling to prevent legalizer's
+ // attempt to scalarize it due to v2i1 not being legal.
+ if (STI.allowFP16Math())
+ setTargetDAGCombine(ISD::SETCC);
+
+ // Promote fp16 arithmetic if fp16 hardware isn't available or the
+ // user passed --nvptx-no-fp16-math. The flag is useful because,
+ // although sm_53+ GPUs have some sort of FP16 support in
+ // hardware, only sm_53 and sm_60 have full implementation. Others
+ // only have token amount of hardware and are likely to run faster
+ // by using fp32 units instead.
+ for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+ setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+ setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
}
- // There's no neg.f16 instruction.
+
+ // There's no neg.f16 instruction. Expand to (0-x).
setOperationAction(ISD::FNEG, MVT::f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
+
+ // (would be) Library functions.
- // Library functions. These default to Expand, but we have instructions
- // for them.
- setOperationAction(ISD::FCEIL, MVT::f16, Legal);
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f16, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FROUND, MVT::f16, Legal);
- setOperationAction(ISD::FROUND, MVT::f32, Legal);
- setOperationAction(ISD::FROUND, MVT::f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ // These map to conversion instructions for scalar FP types.
+ for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+ ISD::FROUND, ISD::FTRUNC}) {
+ setOperationAction(Op, MVT::f16, Legal);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
// 'Expand' implements FCOPYSIGN without calling an external library.
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- // FP16 does not support these nodes in hardware, but we can perform
- // these ops using single-precision hardware.
- setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::FSQRT, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FABS, MVT::f16, Promote);
+ // These map to corresponding instructions for f32/f64. f16 must be
+ // promoted to f32. v2f16 is expanded to f16, which is then promoted
+ // to f32.
+ for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
+ ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+ setOperationAction(Op, MVT::f16, Promote);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
return "NVPTXISD::FUN_SHFR_CLAMP";
case NVPTXISD::IMAD:
return "NVPTXISD::IMAD";
+ case NVPTXISD::SETP_F16X2:
+ return "NVPTXISD::SETP_F16X2";
case NVPTXISD::Dummy:
return "NVPTXISD::Dummy";
case NVPTXISD::MUL_WIDE_SIGNED:
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
return TypeSplitVector;
-
+ if (VT == MVT::v2f16)
+ return TypeLegal;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
bool ExtendIntegerRetVal =
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
bool needTruncate = false;
EVT TheLoadType = VTs[i];
EVT EltType = Ins[i].VT;
llvm_unreachable("Invalid vector info.");
}
- SDValue VectorOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offsets[VecIdx], dl, MVT::i32),
- InFlag};
+ SDValue LoadOperands[] = {
+ Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), VectorOps, TheLoadType,
+ Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
MachinePointerInfo(), EltAlign);
for (unsigned j = 0; j < NumElts; ++j) {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
+// We can init constant f16x2 with a single .b32 move. Normally it
+// would get lowered as two constant loads and vector-packing move.
+// mov.b16 %h1, 0x4000;
+// mov.b16 %h2, 0x3C00;
+// mov.b32 %hh2, {%h2, %h1};
+// Instead we want just a constant move:
+// mov.b32 %hh2, 0x40003C00
+//
+// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+// generates good SASS in both cases.
+SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ //return Op;
+ if (!(Op->getValueType(0) == MVT::v2f16 &&
+ isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+ isa<ConstantFPSDNode>(Op->getOperand(1))))
+ return Op;
+
+ APInt E0 =
+ cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+ APInt E1 =
+ cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+ SDValue Const =
+ DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+}
+
+SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Index = Op->getOperand(1);
+ // Constant index will be matched by tablegen.
+ if (isa<ConstantSDNode>(Index.getNode()))
+ return Op;
+
+ // Extract individual elements and select one of them.
+ SDValue Vector = Op->getOperand(0);
+ EVT VectorVT = Vector.getValueType();
+ assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+ EVT EltVT = VectorVT.getVectorElementType();
+
+ SDLoc dl(Op.getNode());
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+ DAG.getIntPtrConstant(1, dl));
+ return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
+ ISD::CondCode::SETEQ);
+}
+
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
/// amount, or
case ISD::INTRINSIC_W_CHAIN:
return Op;
case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return Op;
+ case ISD::EXTRACT_VECTOR_ELT:
+ return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
case ISD::STORE:
case MVT::v2i16:
case MVT::v2i32:
case MVT::v2i64:
+ case MVT::v2f16:
case MVT::v2f32:
case MVT::v2f64:
case MVT::v4i8:
case MVT::v4i16:
case MVT::v4i32:
+ case MVT::v4f16:
case MVT::v4f32:
+ case MVT::v8f16: // <4 x f16x2>
// This is a "native" vector type
break;
}
if (EltVT.getSizeInBits() < 16)
NeedExt = true;
+ bool StoreF16x2 = false;
switch (NumElts) {
default:
return SDValue();
case 4:
Opcode = NVPTXISD::StoreV4;
break;
+ case 8:
+ // v8f16 is a special case. PTX doesn't have st.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // store them with st.v4.b32.
+ assert(EltVT == MVT::f16 && "Wrong type for the vector.");
+ Opcode = NVPTXISD::StoreV4;
+ StoreF16x2 = true;
+ break;
}
SmallVector<SDValue, 8> Ops;
// First is the chain
Ops.push_back(N->getOperand(0));
- // Then the split values
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
- if (NeedExt)
- ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
- Ops.push_back(ExtVal);
+ if (StoreF16x2) {
+ // Combine f16,f16 -> v2f16
+ NumElts /= 2;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+ DAG.getIntPtrConstant(i * 2, DL));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+ DAG.getIntPtrConstant(i * 2 + 1, DL));
+ SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
+ Ops.push_back(V2);
+ }
+ } else {
+ // Then the split values
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+ DAG.getIntPtrConstant(i, DL));
+ if (NeedExt)
+ ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+ Ops.push_back(ExtVal);
+ }
}
// Then any remaining arguments
Ops.append(N->op_begin() + 2, N->op_end());
- SDValue NewSt = DAG.getMemIntrinsicNode(
- Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
+ SDValue NewSt =
+ DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
- //return DCI.CombineTo(N, NewSt, true);
+ // return DCI.CombineTo(N, NewSt, true);
return NewSt;
}
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
- assert(VTs.size() > 0 && "empty aggregate type not expected");
+ assert(VTs.size() > 0 && "Unexpected empty type.");
auto VectorInfo =
VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
unsigned NumElts = parti - VecIdx + 1;
EVT EltVT = VTs[parti];
// i1 is loaded/stored as i8.
- EVT LoadVT = EltVT == MVT::i1 ? MVT::i8 : EltVT;
+ EVT LoadVT = EltVT;
+ if (EltVT == MVT::i1)
+ LoadVT = MVT::i8;
+ else if (EltVT == MVT::v2f16)
+ // getLoad needs a vector type, but it can't handle
+ // vectors which contain v2f16 elements. So we must load
+ // using i32 here and then bitcast back.
+ LoadVT = MVT::i32;
+
EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
SDValue VecAddr =
DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
// We've loaded i1 as an i8 and now must truncate it back to i1
if (EltVT == MVT::i1)
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
- // Extend the element if necesary (e.g an i8 is loaded
+ // v2f16 was loaded as an i32. Now we must bitcast it back.
+ else if (EltVT == MVT::v2f16)
+ Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+ // Extend the element if necesary (e.g. an i8 is loaded
// into an i16 register)
- if (Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
+ if (Ins[InsIdx].VT.isInteger() &&
+ Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
}
InVals.push_back(Elt);
}
+
// Reset vector tracking state.
VecIdx = -1;
}
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
SmallVector<SDValue, 6> StoreOperands;
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
// New load/store. Record chain and offset operands.
if (VectorInfo[i] & PVF_FIRST) {
assert(StoreOperands.empty() && "Orphaned operand list.");
return SDValue();
}
+static SDValue PerformSETCCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT CCType = N->getValueType(0);
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+
+ if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ SDLoc DL(N);
+ // setp.f16x2 returns two scalar predicates, which we need to
+ // convert back to v2i1. The returned result will be scalarized by
+ // the legalizer, but the comparison will remain a single vector
+ // instruction.
+ SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
+ DCI.DAG.getVTList(MVT::i1, MVT::i1),
+ {A, B, N->getOperand(2)});
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
+ CCNode.getValue(1));
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
case ISD::UREM:
case ISD::SREM:
return PerformREMCombine(N, DCI, OptLevel);
+ case ISD::SETCC:
+ return PerformSETCCCombine(N, DCI);
}
return SDValue();
}
case MVT::v2i16:
case MVT::v2i32:
case MVT::v2i64:
+ case MVT::v2f16:
case MVT::v2f32:
case MVT::v2f64:
case MVT::v4i8:
case MVT::v4i16:
case MVT::v4i32:
+ case MVT::v4f16:
case MVT::v4f32:
+ case MVT::v8f16: // <4 x f16x2>
// This is a "native" vector type
break;
}
unsigned Opcode = 0;
SDVTList LdResVTs;
+ bool LoadF16x2 = false;
switch (NumElts) {
default:
LdResVTs = DAG.getVTList(ListVTs);
break;
}
+ case 8: {
+ // v8f16 is a special case. PTX doesn't have ld.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // load them with ld.v4.b32.
+ assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+ LoadF16x2 = true;
+ Opcode = NVPTXISD::LoadV4;
+ EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
+ MVT::Other};
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
}
// Copy regular operands
LD->getMemoryVT(),
LD->getMemOperand());
- SmallVector<SDValue, 4> ScalarRes;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
+ SmallVector<SDValue, 8> ScalarRes;
+ if (LoadF16x2) {
+ // Split v2f16 subvectors back into individual elements.
+ NumElts /= 2;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue SubVector = NewLD.getValue(i);
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+ DAG.getIntPtrConstant(1, DL));
+ ScalarRes.push_back(E0);
+ ScalarRes.push_back(E1);
+ }
+ } else {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
}
SDValue LoadChain = NewLD.getValue(NumElts);
--- /dev/null
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184;
+; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_ret_const() #0 {
+ ret <2 x half> <half 1.0, half 2.0>
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0];
+; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_0(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 0
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0];
+; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_1(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 1
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_i(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
+; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+ %e = extractelement <2 x half> %a, i64 %idx
+ ret half %e
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1];
+;
+; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fadd <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+ %r = fadd <2 x half> <half 1.0, half 2.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+ %r = fadd <2 x half> %a, <half 1.0, half 2.0>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
+;
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fsub <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
+;
+; CHECK-F16: mov.u32 [[I0:%r[0-9+]]], 0;
+; CHECK-F16: mov.b32 [[IHH0:%hh[0-9+]]], [[I0]];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fneg(<2 x half> %a) #0 {
+ %r = fsub <2 x half> <half 0.0, half 0.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fmul <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fdiv <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_frem(
+; -- Load two 16x2 inputs and split them into f16 elements
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
+; -- Split into elements
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; -- promote to f32.
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; -- frem(a[0],b[0]).
+; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
+; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
+; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
+; -- frem(a[1],b[1]).
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
+; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
+; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
+; -- convert back to f16.
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; -- merge into f16x2 and return it.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+ %r = frem <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: .func test_ldst_v2f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
+; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]]
+; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]};
+; CHECK: ret;
+define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
+ %t1 = load <2 x half>, <2 x half>* %a
+ store <2 x half> %t1, <2 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
+; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair
+; number of bitshifting instructions that may change at llvm's whim.
+; So we only verify that we only issue correct number of writes using
+; correct offset, but not the values we write.
+; CHECK-DAG: ld.u64
+; CHECK-DAG: st.u32 [%[[B]]],
+; CHECK-DAG: st.b16 [%[[B]]+4],
+; CHECK: ret;
+define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) {
+ %t1 = load <3 x half>, <3 x half>* %a
+ store <3 x half> %t1, <3 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v4f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
+; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]];
+; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: ret;
+define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) {
+ %t1 = load <4 x half>, <4 x half>* %a
+ store <4 x half> %t1, <4 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v8f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1];
+; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
+; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: ret;
+define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) {
+ %t1 = load <8 x half>, <8 x half>* %a
+ store <8 x half> %t1, <8 x half>* %b, align 16
+ ret void
+}
+
+declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[A]];
+; CHECK-DAG: st.param.b32 [param1+0], [[B]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[B]];
+; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[B]];
+; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
+ %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1];
+; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2]
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3];
+;
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+;
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
+ %cc = fcmp une <2 x half> %c, %d
+ %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+;
+; TODO: Currently DAG combiner scalarizes setcc before we can lower it to setp.f16x2.
+; We'd like to see this instruction:
+; CHECK-F16-NOTYET: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+; But we end up with a pair of scalar instances of it instead:
+; CHECK-F16-DAG: setp.neu.f16 [[P0:%p[0-9]+]], [[C0]], [[D0]]
+; CHECK-F16-DAG: setp.neu.f16 [[P1:%p[0-9]+]], [[C1]], [[D1]]
+
+
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
+ <2 x half> %c, <2 x half> %d) #0 {
+ %cc = fcmp une <2 x half> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]]
+; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]]
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
+ <2 x float> %c, <2 x float> %d) #0 {
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp une <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ueq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ugt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp uge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ult <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ule <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp uno <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp one <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oeq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ogt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp olt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; XCHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ole <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ord <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
+ %r = fptosi <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
+ %r = fptosi <2 x half> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
+ %r = fptoui <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
+ %r = fptoui <2 x half> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+ %r = uitofp <2 x i32> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
+; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+ %r = uitofp <2 x i64> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+ %r = sitofp <2 x i32> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
+; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+ %r = sitofp <2 x i64> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
+; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]];
+
+; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+ %c = uitofp <2 x i32> %a to <2 x half>
+ %r = fadd <2 x half> %b, %c
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32_fadd(
+; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
+; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]];
+;
+; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+ %c = sitofp <2 x i32> %a to <2 x half>
+ %r = fadd <2 x half> %b, %c
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+ %r = fptrunc <2 x float> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
+; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+ %r = fptrunc <2 x double> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK: ret;
+define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
+ %r = fpext <2 x half> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK: ret;
+define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
+ %r = fpext <2 x half> %a to <2 x double>
+ ret <2 x double> %r
+}
+
+
+; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
+; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]]
+; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16
+; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]]
+; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
+ %r = bitcast <2 x half> %a to <2 x i16>
+ ret <2 x i16> %r
+}
+
+; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
+; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0];
+; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]];
+; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]];
+; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16;
+; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+ %r = bitcast <2 x i16> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+
+declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0
+declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.round.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sqrt(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 {
+; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+ %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+ %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 {
+; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define <2 x half> @test_exp(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define <2 x half> @test_exp2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define <2 x half> @test_log(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define <2 x half> @test_log10(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define <2 x half> @test_log2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret
+define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fabs(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
+; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
+; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
+ %tb = fptrunc <2 x float> %b to <2 x half>
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
+; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
+ %tb = fptrunc <2 x double> %b to <2 x half>
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]]
+; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]];
+; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]];
+; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]};
+; CHECK: ret;
+define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+ %xr = fpext <2 x half> %r to <2 x float>
+ ret <2 x float> %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_floor(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.floor.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_ceil(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_trunc(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_rint(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.rint.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_round(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }