FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR,
/// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
/// values.
- /// In the case where a single input is NaN, the non-NaN input is returned.
+ //
+ /// In the case where a single input is a NaN (either signaling or quiet),
+ /// the non-NaN input is returned.
///
/// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
FMINNUM, FMAXNUM,
+
+ /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on
+ /// two values, following the IEEE-754 2008 definition. This differs from
+ /// FMINNUM/FMAXNUM in the handling of signaling NaNs. If one input is a
+ /// signaling NaN, returns a quiet NaN.
+ FMINNUM_IEEE, FMAXNUM_IEEE,
+
/// FMINNAN/FMAXNAN - NaN-propagating minimum/maximum that also treat -0.0
/// as less than 0.0. While FMINNUM/FMAXNUM follow IEEE 754-2008 semantics,
/// FMINNAN/FMAXNAN follow IEEE 754-2018 draft semantics.
/// \returns True, if the expansion was successful, false otherwise
bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+ /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
+ SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+
/// Turn load of vector type into a load of the individual elements.
/// \param LD load to expand
/// \returns MERGE_VALUEs of the scalar loads with their chains.
[SDNPCommutative, SDNPAssociative]>;
def fmaxnum : SDNode<"ISD::FMAXNUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
+def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
+ [SDNPCommutative]>;
+def fmaxnum_ieee : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
+ [SDNPCommutative]>;
+
def fminnan : SDNode<"ISD::FMINNAN" , SDTFPBinOp>;
def fmaxnan : SDNode<"ISD::FMAXNAN" , SDTFPBinOp>;
def fgetsign : SDNode<"ISD::FGETSIGN" , SDTFPToIntOp>;
case ISD::SETLE:
case ISD::SETULT:
case ISD::SETULE: {
+ // Since it's known never nan to get here already, either fminnum or
+ // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is
+ // expanded in terms of it.
+ unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+ return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
case ISD::SETGE:
case ISD::SETUGT:
case ISD::SETUGE: {
+ unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+ if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+ return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
Results.push_back(Tmp1);
break;
}
-
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: {
+ if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG))
+ Results.push_back(Expanded);
+ break;
+ }
case ISD::FSIN:
case ISD::FCOS: {
EVT VT = Node->getValueType(0);
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);
+ SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);
/// Implements vector promotion.
case ISD::FABS:
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
case ISD::FMINNAN:
case ISD::FMAXNAN:
case ISD::FCOPYSIGN:
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTTZ(Op);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ return ExpandFMINNUM_FMAXNUM(Op);
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:
return DAG.UnrollVectorOp(Op.getNode());
}
+SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
+ if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
+ return Expanded;
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
EVT VT = Op.getValueType();
EVT EltVT = VT.getVectorElementType();
case ISD::FMUL:
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
case ISD::FMINNAN:
case ISD::FMAXNAN:
case ISD::SMIN:
// TODO: Refine on operand
return false;
}
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: {
+ // Only one needs to be known not-nan, since it will be returned if the
+ // other ends up being one.
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) ||
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+ }
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE: {
+ if (SNaN)
+ return true;
+ // This can return a NaN if either operand is an sNaN, or if both operands
+ // are NaN.
+ return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) &&
+ isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) ||
+ (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
+ isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
+ }
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN: {
+ // TODO: Does this quiet or return the origina NaN as-is?
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
- // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on
- // what they should do.
+ }
case ISD::EXTRACT_VECTOR_ELT: {
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
case ISD::FABS: return "fabs";
case ISD::FMINNUM: return "fminnum";
case ISD::FMAXNUM: return "fmaxnum";
+ case ISD::FMINNUM_IEEE: return "fminnum_ieee";
+ case ISD::FMAXNUM_IEEE: return "fmaxnum_ieee";
+
case ISD::FMINNAN: return "fminnan";
case ISD::FMAXNAN: return "fmaxnan";
case ISD::FNEG: return "fneg";
return true;
}
+SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ?
+ ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ EVT VT = Node->getValueType(0);
+ if (isOperationLegalOrCustom(NewOp, VT)) {
+ SDValue Quiet0 = Node->getOperand(0);
+ SDValue Quiet1 = Node->getOperand(1);
+
+ if (!Node->getFlags().hasNoNaNs()) {
+ // Insert canonicalizes if it's possible we need to quiet to get correct
+ // sNaN behavior.
+ if (!DAG.isKnownNeverSNaN(Quiet0)) {
+ Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0,
+ Node->getFlags());
+ }
+ if (!DAG.isKnownNeverSNaN(Quiet1)) {
+ Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1,
+ Node->getFlags());
+ }
+ }
+
+ return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags());
+ }
+
+ return SDValue();
+}
+
SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
SelectionDAG &DAG) const {
SDLoc SL(LD);
setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
setOperationAction(ISD::FMINNUM, VT, Expand);
setOperationAction(ISD::FMAXNUM, VT, Expand);
+ setOperationAction(ISD::FMINNUM_IEEE, VT, Expand);
+ setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand);
setOperationAction(ISD::FMINNAN, VT, Expand);
setOperationAction(ISD::FMAXNAN, VT, Expand);
setOperationAction(ISD::FMAD, VT, Expand);
case ISD::FMAD:
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
case ISD::FSIN:
case ISD::FTRUNC:
case ISD::FRINT:
return ISD::FMINNUM;
case ISD::FMINNUM:
return ISD::FMAXNUM;
+ case ISD::FMAXNUM_IEEE:
+ return ISD::FMINNUM_IEEE;
+ case ISD::FMINNUM_IEEE:
+ return ISD::FMAXNUM_IEEE;
case AMDGPUISD::FMAX_LEGACY:
return AMDGPUISD::FMIN_LEGACY;
case AMDGPUISD::FMIN_LEGACY:
}
case ISD::FMAXNUM:
case ISD::FMINNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMIN_LEGACY: {
// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
SIN_HW,
FMAX_LEGACY,
FMIN_LEGACY,
+
FMAX3,
SMAX3,
UMAX3,
def smin_oneuse : HasOneUseBinOp<smin>;
def umax_oneuse : HasOneUseBinOp<umax>;
def umin_oneuse : HasOneUseBinOp<umin>;
+
def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+
+def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
+def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
+
+
def and_oneuse : HasOneUseBinOp<and>;
def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
+
+// Instructions which select to the same v_min_f*
+def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
+ [(fminnum_ieee node:$src0, node:$src1),
+ (fminnum node:$src0, node:$src1)]
+>;
+
+// Instructions which select to the same v_max_f*
+def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
+ [(fmaxnum_ieee node:$src0, node:$src1),
+ (fmaxnum node:$src0, node:$src1)]
+>;
+
+def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+ [(fminnum_ieee_oneuse node:$src0, node:$src1),
+ (fminnum_oneuse node:$src0, node:$src1)]
+>;
+
+def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+ [(fmaxnum_ieee_oneuse node:$src0, node:$src1),
+ (fmaxnum_oneuse node:$src0, node:$src1)]
+>;
if (Subtarget->hasBFE())
setHasExtractBitsInsn(true);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+
+
+ // These are really only legal for ieee_mode functions. We should be avoiding
+ // them for functions that don't have ieee_mode enabled, so just say they are
+ // legal.
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
// F16 - VOP2 Actions.
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
setOperationAction(ISD::FDIV, MVT::f16, Custom);
// F16 - VOP3 Actions.
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
}
if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
- setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+
setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+
setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMINNUM_IEEE);
+ setTargetDAGCombine(ISD::FMAXNUM_IEEE);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
case ISD::FNEG:
case ISD::FCANONICALIZE:
return splitUnaryVectorOp(Op, DAG);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ return lowerFMINNUM_FMAXNUM(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
case ISD::FADD:
case ISD::FMUL:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
+SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+ // FIXME: Assert during eslection that this is only selected for
+ // ieee_mode. Currently a combine can produce the ieee version for non-ieee
+ // mode functions, but this happens to be OK since it's only done in cases
+ // where there is known no sNaN.
+ if (IsIEEEMode)
+ return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+
+ if (VT == MVT::v4f16)
+ return splitBinaryVectorOp(Op, DAG);
+ return Op;
+}
+
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
case AMDGPUISD::FMIN3: {
// FIXME: Shouldn't treat the generic operations different based these.
- bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
- if (IsIEEEMode) {
- // snans will be quieted, so we only need to worry about denormals.
- if (Subtarget->supportsMinMaxDenormModes() ||
- denormalsEnabledForType(Op.getValueType()))
- return true;
-
- // Flushing may be required.
- // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
- // targets need to check their input recursively.
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
- isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
- }
+ // However, we aren't really required to flush the result from
+ // minnum/maxnum..
+ // snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
- denormalsEnabledForType(Op.getValueType())) {
- // Only quieting may be necessary.
- return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
- DAG.isKnownNeverSNaN(Op.getOperand(1));
+ denormalsEnabledForType(Op.getValueType()))
+ return true;
+
+ // Flushing may be required.
+ // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
+ // targets need to check their input recursively.
+
+ // FIXME: Does this apply with clamp? It's implemented with max.
+ for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+ if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+ return false;
}
- // Flushing and quieting may be necessary
- // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
- // needs to be quieted.
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
- isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+ return true;
}
case ISD::SELECT: {
return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
// Could be anything.
return false;
+ case ISD::BITCAST: {
+ // Hack round the mess we make when legalizing extract_vector_elt
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType() == MVT::i16 &&
+ Src.getOpcode() == ISD::TRUNCATE) {
+ SDValue TruncSrc = Src.getOperand(0);
+ if (TruncSrc.getValueType() == MVT::i32 &&
+ TruncSrc.getOpcode() == ISD::BITCAST &&
+ TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
+ return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+ }
+ }
+
+ return false;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntrinsicID
= cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
}
// Constant fold canonicalize.
-
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
// Flush denormals to 0 if not enabled.
}
}
+ unsigned SrcOpc = N0.getOpcode();
+
+ // If it's free to do so, push canonicalizes further up the source, which may
+ // find a canonical source.
+ //
+ // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for
+ // sNaNs.
+ if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
+ auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+ if (CRHS && N0.hasOneUse()) {
+ SDLoc SL(N);
+ SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
+ N0.getOperand(0));
+ SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
+ DCI.AddToWorklist(Canon0.getNode());
+
+ return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
+ }
+ }
+
return isCanonicalized(DAG, N0) ? N0 : SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
+ case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
return AMDGPUISD::UMAX3;
case ISD::FMINNUM:
+ case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
// fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+ (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
case ISD::SMIN:
case ISD::SMAX:
case ISD::FMAXNUM:
- case ISD::FMINNUM: {
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:
case ISD::FMINNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY: {
- if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+ if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMinMaxCombine(N, DCI);
break;
return false;
}
}
+
+bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN,
+ unsigned Depth) const {
+ if (Op.getOpcode() == AMDGPUISD::CLAMP) {
+ if (Subtarget->enableDX10Clamp())
+ return true; // Clamped to 0.
+ return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+
+ return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
+ SNaN, Depth);
+}
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(EVT VT) const;
+
+ bool isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN = false,
+ unsigned Depth = 0) const override;
};
} // End namespace llvm
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
+ //SDPatternOperator max, SDPatternOperator min,
Instruction med3Inst> : GCNPat<
- (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
- (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
class FP16Med3Pat<ValueType vt,
Instruction med3Inst> : GCNPat<
- (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
- (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
- (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
- (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
>;
defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
-defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
-defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
+; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]]
+; GCN-NOT: [[MAX]]
+; GCN-NOT: [[MED]]
+
+; SI: buffer_store_dword [[MED]]
+; SI: buffer_store_dword [[MAX]]
+
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
}
; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
-; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-; VI-DENORM-NOT: v_max_f32
-; VI-DENORM-NOT: v_mul_f32
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
}
; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-
-; VI-DENORM-NOT: v_max
-; VI-DENORM-NOT: v_mul
; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
+
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
}
; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GFX9: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
-; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
+
+; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
}
; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
-; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
+
+; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
+; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; Need to quiet the nan with a separate instruction since it will be
; passed through the minnum.
+; FIXME: canonicalize doens't work correctly without ieee_mode
; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
; GFX9: v_min_f32_e32 v0, v0, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: ; return to shader
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
-; VI-DENORM: v_max_f32_e32 v0, v0, v0
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: ; return
+
+; VI-DENORM-NOT: v0
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+; VI-DENORM-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+
+; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
+; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+
; VI-NEXT: s_setpc_b64
define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
declare double @llvm.maxnum.f64(double, double) nounwind readnone
; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
+; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
+; SI: v_max_f64 [[QUIET_B:v\[[0-9]+:[0-9]+\]]], [[REGB]], [[REGB]]
+; SI: v_max_f64 [[MAX0:v\[[0-9]+:[0-9]+\]]], [[QUIET_A]], [[QUIET_B]]
+; SI: v_max_f64 [[QUIET_C:v\[[0-9]+:[0-9]+\]]], [[REGC]], [[REGC]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[MAX0]], [[QUIET_C]]
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]]
; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GCN: buffer_store_short [[RESULT]],
; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]]
; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
; GCN: buffer_store_short [[RESULT]],
; SI-NEXT: v_max3_f32
; SI-NEXT: v_max3_f32
-; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v1
-; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_max_f16_e32 v0, v2, v0
-; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
-
-; GFX9: v_pk_max_f16
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16
; GFX9-NEXT: v_pk_max_f16
; GFX9-NEXT: v_pk_max_f16
-define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
entry:
- %max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
- %max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
- %res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
+ %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
+ %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
ret <2 x half> %res
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5
; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #1
; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
}
; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]]
; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]]
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
; EG: MAX
}
; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
}
; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
}
; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
}
; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
}
; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32:
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE-NOT: v_cmp
+; VI-SAFE-NOT: v_cndmask
+
; GCN-NONAN: v_max_f32_e32
; GCN-NONAN: v_max_f32_e32
; GCN-NONAN: v_max_f32_e32
+
+; GCN-NOT: v_max
define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
}
; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: v_max_
; GCN: v_cmp_gt_f32
; GCN-NEXT: v_cndmask_b32
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; GCN-LABEL: {{^}}test_fmax_f32:
-; GCN: v_max_f32_e32
-define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 {
- %val = call float @llvm.maxnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmax_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+ %val = call float @llvm.maxnum.f32(float %a, float %b) #1
store float %val, float addrspace(1)* %out, align 4
ret void
}
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_off:
+; GCN: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmax_f32_ieee_mode_off(float %a, float %b) #0 {
+ %val = call float @llvm.maxnum.f32(float %a, float %b) #1
+ ret float %val
+}
+
; GCN-LABEL: {{^}}test_fmax_v2f32:
; GCN: v_max_f32_e32
; GCN: v_max_f32_e32
ret void
}
-; GCN-LABEL: {{^}}fmax_var_immediate_f32:
+; GCN-LABEL: {{^}}fmax_var_immediate_f32_no_ieee:
; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.maxnum.f32(float %a, float 2.0)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmax_var_immediate_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+ ret float %val
}
-; GCN-LABEL: {{^}}fmax_immediate_var_f32:
+; GCN-LABEL: {{^}}fmax_immediate_var_f32_no_ieee:
; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.maxnum.f32(float 2.0, float %a)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmax_immediate_var_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+ ret float %val
}
-; GCN-LABEL: {{^}}fmax_var_literal_f32:
+; GCN-LABEL: {{^}}fmax_var_literal_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.maxnum.f32(float %a, float 99.0)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmax_var_literal_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+ ret float %val
}
-; GCN-LABEL: {{^}}fmax_literal_var_f32:
+; GCN-LABEL: {{^}}fmax_literal_var_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.maxnum.f32(float 99.0, float %a)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmax_literal_var_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+ ret float %val
}
; GCN-LABEL: {{^}}test_func_fmax_v3f32:
; SI-NEXT: v_min3_f32
; SI-NEXT: v_min3_f32
-; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v1
-; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_min_f16_e32 v0, v2, v0
-; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
-
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+; VI: s_waitcnt
+; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
+; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: v_pk_min_f16 v0, v2, v0
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
entry:
- %min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
- %min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
- %res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
+ %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+ %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
+ %res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
ret <2 x half> %res
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,SI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN,SI %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,VI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN,VI %s
; GCN-LABEL: {{^}}min_fneg_select_regression_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
-; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ult float %a, 1.0
}
; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
}
; GCN-LABEL: {{^}}max_fneg_select_regression_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
-define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ugt float %a, 1.0
%min.a = select i1 %cmp.a, float %fneg.a, float -1.0
}
; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
-define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ugt float %a, -1.0
%min.a = select i1 %cmp.a, float %fneg.a, float 1.0
; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5
; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #1
; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
; EG: MIN *
-; GCN-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
}
; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
-; GCN-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
-; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
+; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
+
+; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]]
+; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%cmp = fcmp ule float %a, %b
%val = select i1 %cmp, float %a, float %b
ret void
}
+; Nsz also needed
+; FIXME: Should separate tests
; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
-; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0
; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc
+
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 {
%a.nnan = fadd nnan float %a, 1.0
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE v_cmp_le_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
}
; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32:
-; GCN: buffer_load_dwordx2
-; GCN: buffer_load_dwordx2
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+
+; VI-SAFE v_cmp_lt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE v_cmp_lt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
; GCN-NONAN: v_min_f32_e32
; GCN-NONAN: v_min_f32_e32
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32:
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE-NOT: v_min_
+
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-NOT: v_cmp
+; VI-NOT: v_cndmask
; GCN-NONAN: v_min_f32_e32
; GCN-NONAN: v_min_f32_e32
; GCN-NONAN: v_min_f32_e32
+; GCN-NONAN-NOT: v_min_
define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
}
; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: v_min
; GCN: v_cmp_le_f32
; GCN-NEXT: v_cndmask_b32
declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
-; FUNC-LABEL: @test_fmin_f64
-; SI: v_min_f64
-define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+; FUNC-LABEL: {{^}}test_fmin_f64_ieee:
+; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
+; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
+; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
+define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind {
+ %val = call double @llvm.minnum.f64(double %a, double %b) #0
+ store double %val, double addrspace(1)* undef, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee:
+; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]]
+; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]]
+; SI-NOT: [[VAL0]]
+; SI-NOT: [[VAL1]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]]
+; SI-NOT: [[RESULT]]
+; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
+define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
+ %a = load volatile double, double addrspace(3)* undef
+ %b = load volatile double, double addrspace(3)* undef
%val = call double @llvm.minnum.f64(double %a, double %b) #0
- store double %val, double addrspace(1)* %out, align 8
+ store volatile double %val, double addrspace(3)* undef
ret void
}
-; FUNC-LABEL: @test_fmin_v2f64
+; FUNC-LABEL: {{^}}test_fmin_v2f64:
; SI: v_min_f64
; SI: v_min_f64
define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
ret void
}
-; FUNC-LABEL: @test_fmin_v4f64
+; FUNC-LABEL: {{^}}test_fmin_v4f64:
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
ret void
}
-; FUNC-LABEL: @test_fmin_v8f64
+; FUNC-LABEL: {{^}}test_fmin_v8f64:
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
ret void
}
-; FUNC-LABEL: @test_fmin_v16f64
+; FUNC-LABEL: {{^}}test_fmin_v16f64:
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; GCN-LABEL: {{^}}test_fmin_f32:
-; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 {
- %val = call float @llvm.minnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+ %val = call float @llvm.minnum.f32(float %a, float %b) #1
store float %val, float addrspace(1)* %out, align 4
ret void
}
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_on:
+; GCN: s_waitcnt
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64
+define float @test_fmin_nnan_f32_ieee_mode_on(float %a, float %b) #0 {
+ %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+ ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_off:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_nnan_f32_ieee_mode_off(float %a, float %b) #0 {
+ %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+ ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_off:
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 {
+ %val = call float @llvm.minnum.f32(float %a, float %b) #1
+ ret float %val
+}
+
; GCN-LABEL: {{^}}test_fmin_v2f32:
; GCN: v_min_f32_e32
; GCN: v_min_f32_e32
ret void
}
-; GCN-LABEL: {{^}}fmin_var_immediate_f32:
-; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.minnum.f32(float %a, float 2.0)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+; GCN-LABEL: {{^}}fmin_var_immediate_f32_no_ieee:
+; GCN: v_min_f32_e32 v0, 2.0, v0
+define amdgpu_ps float @fmin_var_immediate_f32_no_ieee(float %a) #0 {
+ %val = call float @llvm.minnum.f32(float %a, float 2.0) #1
+ ret float %val
}
-; GCN-LABEL: {{^}}fmin_immediate_var_f32:
+; GCN-LABEL: {{^}}fmin_immediate_var_f32_no_ieee:
; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.minnum.f32(float 2.0, float %a)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmin_immediate_var_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.minnum.f32(float 2.0, float %a) #1
+ ret float %val
}
-; GCN-LABEL: {{^}}fmin_var_literal_f32:
+; GCN-LABEL: {{^}}fmin_var_literal_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.minnum.f32(float %a, float 99.0)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmin_var_literal_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.minnum.f32(float %a, float 99.0) #1
+ ret float %val
}
-; GCN-LABEL: {{^}}fmin_literal_var_f32:
+; GCN-LABEL: {{^}}fmin_literal_var_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
- %val = call float @llvm.minnum.f32(float 99.0, float %a)
- store float %val, float addrspace(1)* %out, align 4
- ret void
+define amdgpu_ps float @fmin_literal_var_f32_no_ieee(float inreg %a) #0 {
+ %val = call float @llvm.minnum.f32(float 99.0, float %a) #1
+ ret float %val
}
; GCN-LABEL: {{^}}test_func_fmin_v3f32:
; fminnum tests
; --------------------------------------------------------------------------------
-; GCN-LABEL: {{^}}v_fneg_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
-; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %min
+ ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
-; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
+ %min = call float @llvm.minnum.f32(float %a, float %a)
+ %min.fneg = fsub float -0.0, %min
+ ret float %min.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
-; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
+ %min = call float @llvm.minnum.f32(float 4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
+ %min = call float @llvm.minnum.f32(float -4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ ret float %fneg
+}
+
; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
ret void
}
-; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
-; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]
-; VI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
-; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
+; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
-; VI: v_max_f16_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
-; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], 0.15915494
+; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
ret float %fneg
}
-; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
}
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
-; SI: v_max_f32_e64 [[MIN:v[0-9]+]], -[[A]], [[K]]
+; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]
-; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
ret void
}
-; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+ %min = call float @llvm.minnum.f32(float 0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ %mul = fmul float %fneg, %b
+ ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %min
+ %use1 = fmul float %min, 4.0
+ %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+ %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+ ret <2 x float> %ins1
+}
+
; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------
-; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
- %min = call float @llvm.maxnum.f32(float %a, float %b)
- %fneg = fsub float -0.000000e+00, %min
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %max
store float %fneg, float addrspace(1)* %out.gep
ret void
}
-; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %max
+ ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
- %min = call float @llvm.maxnum.f32(float %a, float %a)
- %min.fneg = fsub float -0.0, %min
- store float %min.fneg, float addrspace(1)* %out.gep
+ %max = call float @llvm.maxnum.f32(float %a, float %a)
+ %max.fneg = fsub float -0.0, %max
+ store float %max.fneg, float addrspace(1)* %out.gep
ret void
}
-; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
+ %max = call float @llvm.maxnum.f32(float %a, float %a)
+ %max.fneg = fsub float -0.0, %max
+ ret float %max.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
- %min = call float @llvm.maxnum.f32(float 4.0, float %a)
- %fneg = fsub float -0.000000e+00, %min
+ %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
store float %fneg, float addrspace(1)* %out.gep
ret void
}
-; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
+ %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
- %min = call float @llvm.maxnum.f32(float -4.0, float %a)
- %fneg = fsub float -0.000000e+00, %min
+ %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
store float %fneg, float addrspace(1)* %out.gep
ret void
}
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
+ %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ ret float %fneg
+}
+
; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
ret void
}
-; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
-; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
+ %max = call float @llvm.maxnum.f32(float -0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
ret void
}
-; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+ %max = call float @llvm.maxnum.f32(float 0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ %mul = fmul float %fneg, %b
+ ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
- %min = call float @llvm.maxnum.f32(float %a, float %b)
- %fneg = fsub float -0.000000e+00, %min
- %use1 = fmul float %min, 4.0
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %max
+ %use1 = fmul float %max, 4.0
store volatile float %fneg, float addrspace(1)* %out
store volatile float %use1, float addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %max
+ %use1 = fmul float %max, 4.0
+ %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+ %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+ ret <2 x float> %ins1
+}
+
; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%b.nnan.add = fadd nnan float %b, 1.0
ret float %med
}
+define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %b.nsnan = fadd float %b, 1.0
+ %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan)
+ %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+ ret float %med
+}
+
+define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %a.nsnan = fadd float %a, 1.0
+ %known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b)
+ %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+ ret float %med
+}
+
define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nnan.add = fadd nnan float %b, 1.0
%known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%b.nnan.add = fadd nnan float %b, 1.0
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nnan.add = fadd nnan float %b, 1.0
%known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nnan.add = fadd nnan float %b, 1.0
%cmp = icmp eq i32 %c, 0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%cmp = icmp eq i32 %c, 0
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_med3_f32 v0, v0, v1, v2
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min0 = call float @llvm.minnum.f32(float %a, float %b)
%known.not.snan = call float @llvm.minnum.f32(float %min0, float %c)
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
-; GCN-LABEL: {{^}}maxnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16(
+; SI-LABEL: maxnum_f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}maxnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16_imm_a(
+; SI-LABEL: maxnum_f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_a:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16_imm_b(
+; SI-LABEL: maxnum_f16_imm_b:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_b:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_b:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16(
+; SI-LABEL: maxnum_v2f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s6, s[6:7], 0x0
+; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s1, s6, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT: s_lshr_b32 s0, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_v2f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_load_dword s5, s[8:9], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s5, s5
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: v_max_f16_e32 v0, v1, v0
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @maxnum_v2f16_imm_a(
+; SI-LABEL: maxnum_v2f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_a:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x44004200
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @maxnum_v2f16_imm_b(
+; SI-LABEL: maxnum_v2f16_imm_b:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_b:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4200
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_b:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x42004400
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
}
; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}maxnum_v3f16:
-; GFX9: v_pk_max_f16
-; GFX9: v_pk_max_f16
define amdgpu_kernel void @maxnum_v3f16(
+; SI-LABEL: maxnum_v3f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s1, s6, 16
+; SI-NEXT: s_lshr_b32 s4, s8, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_max_f32_e32 v1, v1, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_v3f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshr_b32 s6, s6, 16
+; VI-NEXT: v_max_f16_e32 v0, v1, v0
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
+; VI-NEXT: v_max_f16_e32 v1, v2, v1
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_v3f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<3 x half> addrspace(1)* %r,
<3 x half> addrspace(1)* %a,
<3 x half> addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}maxnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
define amdgpu_kernel void @maxnum_v4f16(
+; SI-LABEL: maxnum_v4f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT: s_lshr_b32 s4, s5, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT: s_lshr_b32 s4, s7, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT: s_lshr_b32 s4, s6, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_max_f32_e32 v3, v3, v5
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_max_f32_e32 v1, v1, v5
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_max_f32_e32 v2, v2, v5
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_max_f32_e32 v0, v0, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: maxnum_v4f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: v_max_f16_e64 v0, s7, s7
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: s_lshr_b32 s7, s7, 16
+; VI-NEXT: v_max_f16_e32 v0, v1, v0
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshr_b32 s5, s6, 16
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: maxnum_v4f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<4 x half> addrspace(1)* %r,
<4 x half> addrspace(1)* %a,
<4 x half> addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}fmax_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
define amdgpu_kernel void @fmax_v4f16_imm_a(
+; SI-LABEL: fmax_v4f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT: s_lshr_b32 s5, s5, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fmax_v4f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0x4400
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: v_max_f16_e64 v3, s5, s5
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: fmax_v4f16_imm_a:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s8, 0x44004200
+; GFX9-NEXT: s_mov_b32 s9, 0x40004800
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
+; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<4 x half> addrspace(1)* %r,
<4 x half> addrspace(1)* %b) {
entry:
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
-; GCN-LABEL: {{^}}minnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
-define amdgpu_kernel void @minnum_f16(
+define amdgpu_kernel void @minnum_f16_ieee(
+; SI-LABEL: minnum_f16_ieee:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_f16_ieee:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_f16_ieee:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}minnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
+define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) {
+; SI-LABEL: minnum_f16_no_ieee:
+; SI: ; %bb.0:
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: minnum_f16_no_ieee:
+; VI: ; %bb.0:
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_f16_no_ieee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+ %r.val = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r.val
+}
+
define amdgpu_kernel void @minnum_f16_imm_a(
+; SI-LABEL: minnum_f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_a:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @minnum_f16_imm_b(
+; SI-LABEL: minnum_f16_imm_b:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_b:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_b:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-define amdgpu_kernel void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16_ieee(
+; SI-LABEL: minnum_v2f16_ieee:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s6, s[6:7], 0x0
+; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s1, s6, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT: s_lshr_b32 s0, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_v2f16_ieee:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_load_dword s5, s[8:9], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s5, s5
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: v_min_f16_e32 v0, v1, v0
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_ieee:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
+define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) {
+; SI-LABEL: minnum_v2f16_no_ieee:
+; SI: ; %bb.0:
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, v0, v2
+; SI-NEXT: v_min_f32_e32 v1, v1, v3
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: minnum_v2f16_no_ieee:
+; VI: ; %bb.0:
+; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_v2f16_no_ieee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+ %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r.val
+}
-; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @minnum_v2f16_imm_a(
+; SI-LABEL: minnum_v2f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_a:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x44004200
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @minnum_v2f16_imm_b(
+; SI-LABEL: minnum_v2f16_imm_b:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_b:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4200
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_b:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x42004400
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
}
; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}minnum_v3f16:
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
define amdgpu_kernel void @minnum_v3f16(
+; SI-LABEL: minnum_v3f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s1, s6, 16
+; SI-NEXT: s_lshr_b32 s4, s8, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_min_f32_e32 v2, v3, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_min_f32_e32 v1, v1, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_v3f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshr_b32 s6, s6, 16
+; VI-NEXT: v_min_f16_e32 v0, v1, v0
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
+; VI-NEXT: v_min_f16_e32 v1, v2, v1
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_v3f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<3 x half> addrspace(1)* %r,
<3 x half> addrspace(1)* %a,
<3 x half> addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}minnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
define amdgpu_kernel void @minnum_v4f16(
+; SI-LABEL: minnum_v4f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT: s_lshr_b32 s4, s5, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT: s_lshr_b32 s4, s7, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT: s_lshr_b32 s4, s6, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_min_f32_e32 v3, v3, v5
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_min_f32_e32 v1, v1, v5
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_min_f32_e32 v2, v2, v5
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_min_f32_e32 v0, v0, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: minnum_v4f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: v_max_f16_e64 v0, s7, s7
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: s_lshr_b32 s7, s7, 16
+; VI-NEXT: v_min_f16_e32 v0, v1, v0
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_lshr_b32 s5, s6, 16
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: minnum_v4f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v0
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<4 x half> addrspace(1)* %r,
<4 x half> addrspace(1)* %a,
<4 x half> addrspace(1)* %b) {
ret void
}
-; GCN-LABEL: {{^}}fmin_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
define amdgpu_kernel void @fmin_v4f16_imm_a(
+; SI-LABEL: fmin_v4f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT: s_lshr_b32 s5, s5, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fmin_v4f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0x4400
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: v_max_f16_e64 v3, s5, s5
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: fmin_v4f16_imm_a:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s8, 0x44004200
+; GFX9-NEXT: s_mov_b32 s9, 0x40004800
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
+; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
<4 x half> addrspace(1)* %r,
<4 x half> addrspace(1)* %b) {
entry:
}
; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
-; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-
-; VI: v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
}
; GCN-LABEL: {{^}}reduction_minnum_v4f16:
-; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
-; VI: v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_minnum_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
ret half %res
}
+; FIXME: Need to preserve fast math flags when fmaxnum matched
+; directly from the IR to avoid unnecessary quieting.
+
; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
-; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; XVI: s_waitcnt
+; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
-; VI: v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
ret half %res
}
+; FIXME: Need to preserve fast math flags when fmaxnum matched
+; directly from the IR to avoid unnecessary quieting.
+
; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
-; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; XVI: s_waitcnt
+; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
+
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
-; VI: v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>