From 1abb66152a205f979ddb757f7d9c0e9a17371384 Mon Sep 17 00:00:00 2001
From: Ayman Musa
Date: Thu, 15 Jun 2017 13:02:37 +0000
Subject: [PATCH] [X86][AVX512] Improve lowering of AVX512 compare intrinsics
 (remove redundant shift left+right instructions).

AVX512 compare instructions return v*i1 types. When the number of elements
in the returned value is less than 8, clang pads the result with zeroes to
get a mask of v8i1 type. Later on, this padding is replaced with
CONCAT_VECTORS, which is then lowered to many DAG nodes, including
insert/extract-element and shift-right/left nodes.

AVX512 compare instructions put their result in a k register and zero all
of its upper bits, which allows us to remove those extra nodes by simply
copying the result to the required register class.

During lowering, we identify these cases and transform them into an
INSERT_SUBVECTOR node (marked legal); during instruction selection, this
pattern is caught and replaced with a single AVX512 compare instruction.

Differential Revision: https://reviews.llvm.org/D33188

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp              |    95 +
 lib/Target/X86/X86InstrAVX512.td                |   645 +-
 test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll |    32 -
 test/CodeGen/X86/avx512vl-vec-masked-cmp.ll     | 13484 ++++++++++++++++++++++
 test/CodeGen/X86/compress_expand.ll             |     8 +-
 test/CodeGen/X86/masked_memop.ll                |    16 +-
 6 files changed, 14205 insertions(+), 75 deletions(-)
 create mode 100644 test/CodeGen/X86/avx512vl-vec-masked-cmp.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 29b438e9bff..cfdbbc3ee32 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5059,6 +5059,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 }
 
+// Return true if the instruction zeroes the unused upper part of the
+// destination and accepts a mask.
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86ISD::PCMPEQM:
+  case X86ISD::PCMPGTM:
+  case X86ISD::CMPM:
+  case X86ISD::CMPMU:
+    return true;
+  }
+}
+
 /// Insert i1-subvector to i1-vector.
 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
@@ -5091,6 +5105,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   // 3. Subvector should be inserted in the middle (for example v2i1
   //    to v16i1, index 2)
 
+  // If this node widens - by concatenating zeroes - the result of a node
+  // whose instruction already zeroes all upper (irrelevant) bits of the
+  // output register, mark this node as legal so that instruction selection
+  // can replace the whole pattern with the v8i1 version of the compare.
+  // For example, VPCMPEQDZ128rr stores its v4i1 result in a k-register
+  // while zeroing the remaining upper 60 bits of that register. If the
+  // result of such an instruction is inserted into an all-zeros vector, we
+  // can safely drop the INSERT_SUBVECTOR (in instruction selection), as the
+  // compare instruction has already zeroed the rest of the register.
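+  // A sketch of the DAG shape this makes selectable in one step (node
+  // order illustrative):
+  //   (v8i1 (insert_subvector (v8i1 immAllZerosV),
+  //                           (v2i1 (X86ISD::PCMPEQM a, b)), 0))
+  // becomes a single VPCMPEQQZ128rr whose k-register result is simply
+  // copied to the wider mask register class.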
+  if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
+      (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
+       (SubVec.getOpcode() == ISD::AND &&
+        (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
+         isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
+    return Op;
+
   // extend to natively supported kshift
   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
   MVT WideOpVT = OpVT;
@@ -7913,6 +7943,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
 }
 
+// Return true if all the operands of the given CONCAT_VECTORS node are zeros
+// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
+static bool isExpandWithZeros(const SDValue &Op) {
+  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
+         "Expand with zeros only possible in CONCAT_VECTORS nodes!");
+
+  for (unsigned i = 1; i < Op.getNumOperands(); i++)
+    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
+      return false;
+
+  return true;
+}
+
+// If the given node is a type promotion (by concatenating i1 zeros) of the
+// result of a node that already zeroes all upper bits of a k-register,
+// return that node; otherwise return an empty SDValue.
+static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
+  unsigned Opc = Op.getOpcode();
+
+  assert(Opc == ISD::CONCAT_VECTORS &&
+         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+         "Unexpected node to check for type promotion!");
+
+  // As long as we are concatenating zeros to the upper part of a previous
+  // node's result, climb up the tree until a node with a different opcode
+  // is encountered.
+  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
+    if (Opc == ISD::INSERT_SUBVECTOR) {
+      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
+          Op.getConstantOperandVal(2) == 0)
+        Op = Op.getOperand(1);
+      else
+        return SDValue();
+    } else { // Opc == ISD::CONCAT_VECTORS
+      if (isExpandWithZeros(Op))
+        Op = Op.getOperand(0);
+      else
+        return SDValue();
+    }
+    Opc = Op.getOpcode();
+  }
+
+  // Check whether the node we reached zeroes the upper bits itself, or is an
+  // 'and' with such a node (i.e. its masked version).
+  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
+      (Op.getOpcode() == ISD::AND &&
+       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
+        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
+    return Op;
+  }
+
+  return SDValue();
+}
+
 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG & DAG) {
@@ -7923,6 +8007,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   assert(isPowerOf2_32(NumOfOperands) &&
          "Unexpected number of operands in CONCAT_VECTORS");
 
+  // If this node promotes - by concatenating zeroes - the result of a node
+  // whose instruction already zeroes all upper (irrelevant) bits of its
+  // output register, mark it as legal and catch the pattern in instruction
+  // selection to avoid emitting extra instructions (for zeroing upper bits).
+  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
+    SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
+    SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
+                       ZeroC);
+  }
+
   SDValue Undef = DAG.getUNDEF(ResVT);
   if (NumOfOperands > 2) {
     // Specialize the cases when all, or all but one, of the operands are undef.
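For reference, the source-level pattern this patch improves is a compare
intrinsic whose narrow mask result is widened to a larger integer type. A
minimal sketch, assuming an AVX512VL target (e.g. clang -O2 -mavx512vl);
the function name is made up:

    #include <immintrin.h>

    // VPCMPEQD on xmm operands writes a v4i1 mask into a k-register and
    // zeroes that register's upper bits, so widening the mask to 8 bits
    // needs no kshiftlw/kshiftrw pair once this change is in place.
    unsigned char cmp_lanes(__m128i a, __m128i b) {
      __mmask8 m = _mm_cmpeq_epi32_mask(a, b);
      return (unsigned char)m;
    }

The kshiftlw/kshiftrw pairs this change removes are visible in the
avx512vl-intrinsics-upgrade.ll hunks below.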
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 2620679df25..6fa9fdc73f8 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -185,6 +185,20 @@ def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
                                              v4f32x_info>;
 
+class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
+                       ValueType _vt> {
+  RegisterClass KRC = _krc;
+  RegisterClass KRCWM = _krcwm;
+  ValueType KVT = _vt;
+}
+
+def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
+def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
+def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
+def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
+def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
+def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+
 // This multiclass generates the masking variants from the non-masking
 // variant. It only provides the assembly pieces for the masking variants.
 // It assumes custom ISel patterns for masking which can be provided as
@@ -1735,17 +1749,217 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
                                           avx512vl_i64_info, HasAVX512>,
                                           T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
-          (COPY_TO_REGCLASS (VPCMPGTDZrr
-            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
-            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
-          (COPY_TO_REGCLASS (VPCMPEQDZrr
-            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
-            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-}
+multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+                                       SDNode OpNode, string InstrStr,
+                                       list<Predicate> Preds> {
+let Predicates = Preds in {
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (OpNode (_.VT _.RC:$src1),
+                                             (_.VT (bitconvert (_.LdFrag addr:$src2))))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask,
+                                          (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask,
+                                                  _.RC:$src1, _.RC:$src2),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and (_.KVT _.KRCWM:$mask),
+                                          (_.KVT (OpNode (_.VT _.RC:$src1),
+                                                         (_.VT (bitconvert
+                                                                (_.LdFrag addr:$src2))))))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask,
+                                                  _.RC:$src1, addr:$src2),
+                              NewInf.KRC)>;
+}
+}
+
+multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+                                           SDNode OpNode, string InstrStr,
+                                           list<Predicate> Preds>
+    : avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
+let Predicates = Preds in {
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (OpNode (_.VT _.RC:$src1),
+                                             (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and (_.KVT _.KRCWM:$mask),
+                                          (_.KVT (OpNode (_.VT _.RC:$src1),
+                                                         (X86VBroadcast
+                                                          (_.ScalarLdFrag addr:$src2)))))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask,
+                                                  _.RC:$src1, addr:$src2),
+                              NewInf.KRC)>;
+}
+}
+
+// VPCMPEQB - i8
+defm : avx512_icmp_packed_lowering;
+defm : avx512_icmp_packed_lowering;
+
+defm : avx512_icmp_packed_lowering;
+
+// VPCMPEQW - i16
+defm : avx512_icmp_packed_lowering;
+defm : avx512_icmp_packed_lowering;
+defm : avx512_icmp_packed_lowering;
+
+defm
: avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPEQD - i32 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +// VPCMPEQQ - i64 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +// VPCMPGTB - i8 +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPGTW - i16 +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPGTD - i32 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +// VPCMPGTQ - i64 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { @@ -1908,6 +2122,237 @@ defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_cc_packed_lowering Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rrik) _.KRCWM:$mask, + _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT 
_.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_cc_packed_rmb_lowering Preds> + : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmib) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmibk) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +// VPCMPB - i8 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPW - i16 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +// VPCMPQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +// VPCMPUB - i8 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPUW - i16 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPUD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +// VPCMPUQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : 
avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + multiclass avx512_vcmp_common { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -1998,21 +2443,108 @@ defm VCMPPD : avx512_vcmp, defm VCMPPS : avx512_vcmp, AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VCMPPSZrri - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; -def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; -def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPUDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; +multiclass avx512_fcmp_cc_packed_lowering Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmbi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_fcmp_cc_packed_sae_lowering Preds> + : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { + +let Predicates = Preds in + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; +} + + +// VCMPPS - f32 +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_sae_lowering; +defm : avx512_fcmp_cc_packed_sae_lowering; + +// VCMPPD - f64 +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_sae_lowering; +defm : avx512_fcmp_cc_packed_sae_lowering; +defm : avx512_fcmp_cc_packed_sae_lowering; // ---------------------------------------------------------------- // FPClass @@ -2498,6 +3030,69 @@ multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string 
OpcodeStr, defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; +multiclass axv512_icmp_packed_no_vlx_lowering { +def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (!cast(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrk) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; +} + +multiclass axv512_icmp_packed_cc_no_vlx_lowering { +def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (!cast(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrik) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; +} + +let Predicates = [HasAVX512, NoVLX] in { + defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; + + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; +} + // Mask setting all 0s or 1s multiclass avx512_mask_setop { let Predicates = [HasAVX512] in diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index c2d8df6476b..9e0090c26e4 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -1004,8 +1004,6 @@ define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_pcmpeq_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1018,8 +1016,6 @@ define i8 
@test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1058,8 +1054,6 @@ define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_pcmpgt_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1072,8 +1066,6 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1087,8 +1079,6 @@ define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_pcmpeq_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1101,8 +1091,6 @@ define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1116,10 +1104,6 @@ define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_pcmpeq_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1132,10 +1116,6 @@ define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; CHECK: ## 
BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1149,8 +1129,6 @@ define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_pcmpgt_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1163,8 +1141,6 @@ define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1178,10 +1154,6 @@ define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_pcmpgt_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1194,10 +1166,6 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll new file mode 100644 index 00000000000..6779c0753c4 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -0,0 +1,13484 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | 
FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX + +define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp eq 
<16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp eq <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp eq <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp eq <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp eq <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to 
i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 
x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp eq <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) 
local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq 
+entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, 
<2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + 
%1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> 
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## 
%entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: +; 
CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; 
CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x 
i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define 
zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + +define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; 
CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: 
kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; 
CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; 
CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x 
i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast 
<64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 
to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x 
i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { 
+; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext 
i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 
x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp eq <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp eq <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp eq <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp eq <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> zeroinitializer
+  %2 = icmp eq <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: 
test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = 
bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x 
i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; 
CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> 
zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax 
+; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 
@test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> 
zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp sgt <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp sgt <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp sgt <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp sgt <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x 
i64> %__b to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x
i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> 
%__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; 
CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast 
<2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x 
i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; 
CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret 
i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry 
+; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = 
bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> 
%load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + 
%2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + +define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> 
%__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = 
icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, 
<16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; 
CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; 
CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: 
vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpsgtq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: 
vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: 
vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), 
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sgt <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sgt <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sgt <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sgt <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp sge <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %1 = bitcast <4 x i64> %__b to <32 x i8>
+ %2 = icmp sge <32 x i8> %0, %1
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltb (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <32 x i8>
+ %2 = icmp sge <32 x i8> %0, %1
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %1 = bitcast <4 x i64> %__b to <32 x i8>
+ %2 = icmp sge <32 x i8> %0, %1
+ %3 = bitcast i32 %__u to <32 x i1>
+ %4 = and <32 x i1> %2, %3
+ %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0:
## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sge <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 
@test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x 
i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) 
local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp 
sge <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: 
## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> 
%__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: 
vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, 
<4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp sge <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %1 = bitcast <8 x i64> %__b to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %1 = bitcast <8 x i64> %__b to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %3, %2
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %1 = bitcast <8 x i64> %__b to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %1 = bitcast <8 x i64> %__b to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+ %2 = icmp sge <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %3, %2
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+
+define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %extract.i, %2
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %extract.i, %2
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %extract.i, %2
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %extract.i, %2
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp sge <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %extract.i, %2
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %1 = bitcast <4 x i64> %__b to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x i64>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+ %2 = icmp sge <4 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp sge <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %1 = bitcast <2 x i64> %__b to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <16 x i8>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <16 x i8>
+ %2 = icmp ult <16 x i8> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %1 = bitcast <4 x i64> %__b to <32 x i8>
+ %2 = icmp ult <32 x i8> %0, %1
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <32 x i8>
+ %2 = icmp ult <32 x i8> %0, %1
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %1 = bitcast <4 x i64> %__b to <32 x i8>
+ %2 = icmp ult <32 x i8> %0, %1
+ %3 = bitcast i32 %__u to <32 x i1>
+ %4 = and <32 x i1> %2, %3
+ %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <32 x i8>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <32 x i8>
+ %2 = icmp ult <32 x i8> %0, %1
+ %3 = bitcast i32 %__u to <32 x i1>
+ %4 = and <32 x i1> %2, %3
+ %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %1 = bitcast <2 x i64> %__b to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %1 = bitcast <2 x i64> %__b to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %1 = bitcast <2 x i64> %__b to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %1 = bitcast <2 x i64> %__b to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %1 = bitcast <2 x i64> %__b to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %1 = bitcast <2 x i64> %__b to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <8 x i16>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <8 x i16>
+ %2 = icmp ult <8 x i16> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %1 = bitcast <4 x i64> %__b to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %1 = bitcast <4 x i64> %__b to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %1 = bitcast <4 x i64> %__b to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %1 = bitcast <4 x i64> %__b to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <16 x i16>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <16 x i16>
+ %2 = icmp ult <16 x i16> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <32 x i16>
+ %1 = bitcast <8 x i64> %__b to <32 x i16>
+ %2 = icmp ult <32 x i16> %0, %1
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuw (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <32 x i16>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <32 x i16>
+ %2 = icmp ult <32 x i16> %0, %1
+ %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <32 x i16>
+ %1 = bitcast <8 x i64> %__b to <32 x i16>
+ %2 = icmp ult <32 x i16> %0, %1
+ %3 = bitcast i32 %__u to <32 x i1>
+ %4 = and <32 x i1> %2, %3
+ %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <32 x i16>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <32 x i16>
+ %2 = icmp ult <32 x i16> %0, %1
+ %3 = bitcast i32 %__u to <32 x i1>
+ %4 = and <32 x i1> %2, %3
+ %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+ %2 = icmp ult <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+ %2 = icmp ult <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %1 = bitcast <8 x i64> %__b to <16 x i32>
+ %2 = icmp ult <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x i32>
+ %2 = icmp ult <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %1 = bitcast <8 x i64> %__b to <16 x i32>
+ %2 = icmp ult <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x i32>
+ %2 = icmp ult <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16
x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x 
i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+ %2 = icmp ult <16 x i32> %0, %1
+ %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+ %2 = icmp ult <16 x i32> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %3, %2
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %2, %extract.i
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+
+define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+ %4 = bitcast <4 x i1> %3 to i4
+ ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+ %4 = and <2 x i1> %extract.i, %2
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+
+define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %1 = bitcast <2 x i64> %__b to <2 x i64>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x i64>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x i64>
+ %2 = icmp ult <2 x i64> %0, %1
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8
x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 
+} + +define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x 
i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## 
%entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; 
CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x 
i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + 
%2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast 
<4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask: 
+; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 
{%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* 
%__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = 
shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* 
%__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp ult <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp ult <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp ult <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
+define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+
+define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+;
NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; 
CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load float, float* %__b + %vec = insertelement <16 x float> undef, float %load, i32 0 + %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast 
<32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8) + %3 = zext i16 %2 to i32 + ret i32 %3 +} + + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load float, float* %__b + %vec = insertelement <16 x float> undef, float %load, i32 0 + %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8) + %3 = zext i16 %2 to i64 + ret i64 %3 +} + + +declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) +define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> 
%__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + + +define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + + +define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry 
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vcmpoeqpd_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + + +define zeroext 
i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x 
double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %1 = bitcast <8 x i64> %__b to <8 x double> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x double> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: 
retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %load = load double, double* %__b + %vec = insertelement <8 x double> undef, double %load, i32 0 + %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %1 = bitcast <8 x i64> %__b to <8 x double> + %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8) + %3 = zext i8 %2 to i16 + ret i16 %3 +} + + +define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %1 = bitcast <8 x i64> %__b to <8 x double> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x double> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %load = load double, double* %__b + %vec = insertelement <8 x double> undef, double %load, i32 0 + %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> + %2 = fcmp oeq <8 x double> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x double> + %1 = bitcast <8 x i64> %__b to <8 x double> + %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8) + %3 = zext i8 %2 to i32 + ret i32 %3 +} + + +define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: +; 
CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+ %3 = zext i8 %2 to i64
+ ret i64 %3
+}
+
+
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index e09fcf2a336..f62e18869a9 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -265,9 +265,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
 ; SKX: # BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
 ; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -295,9 +293,7 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
 ; SKX: # BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
 ; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
 ; SKX-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 3c616e8a9f4..7a2e41e10a3 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -462,9 +462,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -550,9 +548,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -601,9 +597,7 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -645,9 +639,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-- 
2.11.0
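
Illustrative addendum (not part of the patch): a minimal standalone test, written in the same style as the new avx512vl-vec-masked-cmp.ll functions above, that exercises the pattern this change targets. The function name, the explicit shuffle mask, and the RUN line are chosen here for illustration only and assume an AVX512VL+AVX512DQ target; with this patch applied the compare should lower to a single vpcmpeqq into a k-register followed by a kmovb, with no kshift left/right pair, matching the deltas shown in masked_memop.ll above.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s

; The v2i1 compare result is widened to v8i1 by concatenating zeros and then
; read back as an i8 mask. The compare already zeroes the unused upper bits
; of the k-register, so no extra shift instructions should be emitted.
define zeroext i8 @example_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; CHECK-LABEL: example_vpcmpeqq_v2i1_v8i1_mask:
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovb %k0, %eax
; CHECK-NEXT: retq
entry:
  %0 = icmp eq <2 x i64> %__a, %__b
  ; Indices 0-1 take the compare result; indices 2-3 select zeros from the
  ; second operand (an explicit mask is spelled out here since only indices
  ; 0-3 are valid for two <2 x i1> operands).
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}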