From 37e671e894a4a3f4fcb330f2f62e3ad2df94a4c6 Mon Sep 17 00:00:00 2001
From: Robert Khasanov
Date: Fri, 29 Aug 2014 08:46:04 +0000
Subject: [PATCH] [SKX] Enable lowering of integer CMP operations.

Added new types to the Legalizer.
Fixed the getSetCCResultType function.
Added lowering tests.

Reviewed by Elena Demikhovsky.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216717 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp     |  84 +++++++-
 test/CodeGen/X86/avx512-vec-cmp.ll     | 148 +++++++++++++
 test/CodeGen/X86/avx512bw-vec-cmp.ll   | 135 ++++++++++++
 test/CodeGen/X86/avx512bwvl-vec-cmp.ll | 269 +++++++++++++++++++++++
 test/CodeGen/X86/avx512vl-vec-cmp.ll   | 381 +++++++++++++++++++++++++++++++++
 5 files changed, 1008 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/X86/avx512bw-vec-cmp.ll
 create mode 100644 test/CodeGen/X86/avx512bwvl-vec-cmp.ll
 create mode 100644 test/CodeGen/X86/avx512vl-vec-cmp.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b3a02726ebf..7c0bfeef947 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1526,8 +1526,39 @@ void X86TargetLowering::resetOperationActions() {
   }// has AVX-512

   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
     addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
     addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+
+    setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
+    setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
+    setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
+    setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
+
+    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
+      const MVT VT = (MVT::SimpleValueType)i;
+
+      const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+      // Do not attempt to promote non-512-bit vectors.
+      if (!VT.is512BitVector())
+        continue;
+
+      if (EltSize < 32) {
+        setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+        setOperationAction(ISD::VSELECT, VT, Legal);
+      }
+    }
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
+    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+
+    setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
   }

   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
@@ -1665,10 +1696,40 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector())
     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
-  if (Subtarget->hasAVX512())
-    switch(VT.getVectorNumElements()) {
-    case 8: return MVT::v8i1;
-    case 16: return MVT::v16i1;
+  const unsigned NumElts = VT.getVectorNumElements();
+  const EVT EltVT = VT.getVectorElementType();
+  if (VT.is512BitVector()) {
+    if (Subtarget->hasAVX512())
+      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+          EltVT == MVT::f32 || EltVT == MVT::f64)
+        switch(NumElts) {
+        case 8: return MVT::v8i1;
+        case 16: return MVT::v16i1;
+        }
+    if (Subtarget->hasBWI())
+      if (EltVT == MVT::i8 || EltVT == MVT::i16)
+        switch(NumElts) {
+        case 32: return MVT::v32i1;
+        case 64: return MVT::v64i1;
+        }
+  }
+
+  if (VT.is256BitVector() || VT.is128BitVector()) {
+    if (Subtarget->hasVLX())
+      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+          EltVT == MVT::f32 || EltVT == MVT::f64)
+        switch(NumElts) {
+        case 2: return MVT::v2i1;
+        case 4: return MVT::v4i1;
+        case 8: return MVT::v8i1;
+        }
+    if (Subtarget->hasBWI() && Subtarget->hasVLX())
+      if (EltVT == MVT::i8 || EltVT == MVT::i16)
+        switch(NumElts) {
+        case 8: return MVT::v8i1;
+        case 16: return MVT::v16i1;
+        case 32: return MVT::v32i1;
+        }
   }

   return VT.changeVectorElementTypeToInteger();
@@ -10435,6 +10496,8 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     break;
   case MVT::v8i16:
   case MVT::v16i16:
+    if (Subtarget->hasBWI() && Subtarget->hasVLX())
+      break;
     return SDValue();
   }
@@ -12829,7 +12892,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);

-  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
+  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
          Op.getValueType().getScalarType() == MVT::i1 &&
          "Cannot set masked compare for this operation");
@@ -12943,11 +13006,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   EVT OpVT = Op1.getValueType();
   if (Subtarget->hasAVX512()) {
     if (Op1.getValueType().is512BitVector() ||
+        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

     // In AVX-512 architecture setcc returns mask with i1 elements,
-    // But there is no compare instruction for i8 and i16 elements.
+    // but there is no compare instruction for i8 and i16 elements in KNL.
     // We are not talking about 512-bit operands in this case, these
     // types are illegal.
     if (MaskResult &&
@@ -20218,13 +20282,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
       CondVT.getVectorElementType() == MVT::i1) {
     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
-    // lowering on AVX-512. In this case we convert it to
+    // lowering on KNL. In this case we convert it to
     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
-    // The same situation for all 128 and 256-bit vectors of i8 and i16
+    // The same situation applies to all 128- and 256-bit vectors of i8 and i16.
+    // Starting with SKX, these selects have a proper lowering.
EVT OpVT = LHS.getValueType(); if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || - OpVT.getVectorElementType() == MVT::i16)) { + OpVT.getVectorElementType() == MVT::i16) && + !(Subtarget->hasBWI() && Subtarget->hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index 950e43fea67..d9acc1d325f 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -162,3 +162,151 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) { ret <8 x i64>%res } +; CHECK-LABEL: @test16 +; CHECK: vpcmpled +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y) nounwind { + %mask = icmp sge <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y + ret <16 x i32> %max +} + +; CHECK-LABEL: @test17 +; CHECK: vpcmpgtd (%rdi) +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp sgt <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +; CHECK-LABEL: @test18 +; CHECK: vpcmpled (%rdi) +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp sle <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +; CHECK-LABEL: @test19 +; CHECK: vpcmpleud (%rdi) +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp ule <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +; CHECK-LABEL: @test20 +; CHECK: vpcmpeqd %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind { + %mask1 = icmp eq <16 x i32> %x1, %y1 + %mask0 = icmp eq <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y + ret <16 x i32> %max +} + +; CHECK-LABEL: @test21 +; CHECK: vpcmpleq %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind { + %mask1 = icmp sge <8 x i64> %x1, %y1 + %mask0 = icmp sle <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +; CHECK-LABEL: @test22 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { + %mask1 = icmp sgt <8 x i64> %x1, %y1 + %y = load <8 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +; CHECK-LABEL: @test23 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> 
%y1) nounwind { + %mask1 = icmp sge <16 x i32> %x1, %y1 + %y = load <16 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +; CHECK-LABEL: test24 +; CHECK: vpcmpeqq (%rdi){1to8} +; CHECK: vmovdqa64 +; CHECK: ret +define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer + %mask = icmp eq <8 x i64> %x, %y + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +; CHECK-LABEL: test25 +; CHECK: vpcmpled (%rdi){1to16} +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer + %mask = icmp sle <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +; CHECK-LABEL: test26 +; CHECK: vpcmpgtd (%rdi){1to16}{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind { + %mask1 = icmp sge <16 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer + %mask0 = icmp sgt <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +; CHECK-LABEL: test27 +; CHECK: vpcmpleq (%rdi){1to8}{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { + %mask1 = icmp sge <8 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer + %mask0 = icmp sle <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll new file mode 100644 index 00000000000..d2b1724ebf9 --- /dev/null +++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp eq <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: test2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp sgt <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: @test3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind { + %mask = icmp sge <32 x i16> %x, %y + %max = select <32 x 
i1> %mask, <32 x i16> %x1, <32 x i16> %y + ret <32 x i16> %max +} + +; CHECK-LABEL: test4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp ugt <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: test5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind { + %y = load <32 x i16>* %yp, align 4 + %mask = icmp eq <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp sgt <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp sle <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp ule <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test9 +; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind { + %mask1 = icmp eq <32 x i16> %x1, %y1 + %mask0 = icmp eq <32 x i16> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %y + ret <32 x i16> %max +} + +; CHECK-LABEL: @test10 +; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind { + %mask1 = icmp sge <64 x i8> %x1, %y1 + %mask0 = icmp sle <64 x i8> %x, %y + %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1 + ret <64 x i8> %max +} + +; CHECK-LABEL: @test11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind { + %mask1 = icmp sgt <64 x i8> %x1, %y1 + %y = load <64 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <64 x i8> %x, %y + %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1 + ret <64 x i8> %max +} + +; CHECK-LABEL: @test12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind { + %mask1 = icmp sge <32 x i16> %x1, %y1 + %y = load <32 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <32 x i16> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i16> %x, 
<32 x i16> %x1 + ret <32 x i16> %max +} diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll new file mode 100644 index 00000000000..2d13a166a72 --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll @@ -0,0 +1,269 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test256_1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind { + %mask = icmp eq <32 x i8> %x, %y + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y + ret <32 x i8> %max +} + +; CHECK-LABEL: test256_2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind { + %mask = icmp sgt <32 x i8> %x, %y + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: @test256_3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind { + %mask = icmp sge <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y + ret <16 x i16> %max +} + +; CHECK-LABEL: test256_4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind { + %mask = icmp ugt <32 x i8> %x, %y + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: test256_5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind { + %y = load <16 x i16>* %yp, align 4 + %mask = icmp eq <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind { + %y = load <16 x i16>* %y.ptr, align 4 + %mask = icmp sgt <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind { + %y = load <16 x i16>* %y.ptr, align 4 + %mask = icmp sle <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind { + %y = load <16 x i16>* %y.ptr, align 4 + %mask = icmp ule <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_9 +; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind { + %mask1 = icmp eq <16 x i16> %x1, %y1 + %mask0 = icmp eq <16 x i16> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_10 +; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <32 x i8> 
@test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind { + %mask1 = icmp sge <32 x i8> %x1, %y1 + %mask0 = icmp sle <32 x i8> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: @test256_11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind { + %mask1 = icmp sgt <32 x i8> %x1, %y1 + %y = load <32 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <32 x i8> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: @test256_12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind { + %mask1 = icmp sge <16 x i16> %x1, %y1 + %y = load <16 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <16 x i16> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: test128_1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind { + %mask = icmp eq <16 x i8> %x, %y + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y + ret <16 x i8> %max +} + +; CHECK-LABEL: test128_2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind { + %mask = icmp sgt <16 x i8> %x, %y + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: @test128_3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind { + %mask = icmp sge <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y + ret <8 x i16> %max +} + +; CHECK-LABEL: test128_4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind { + %mask = icmp ugt <16 x i8> %x, %y + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: test128_5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind { + %y = load <8 x i16>* %yp, align 4 + %mask = icmp eq <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind { + %y = load <8 x i16>* %y.ptr, align 4 + %mask = icmp sgt <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind { + %y = load <8 x i16>* %y.ptr, align 4 + %mask = icmp sle <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} 
+ +; CHECK-LABEL: @test128_8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind { + %y = load <8 x i16>* %y.ptr, align 4 + %mask = icmp ule <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_9 +; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind { + %mask1 = icmp eq <8 x i16> %x1, %y1 + %mask0 = icmp eq <8 x i16> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_10 +; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind { + %mask1 = icmp sge <16 x i8> %x1, %y1 + %mask0 = icmp sle <16 x i8> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: @test128_11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind { + %mask1 = icmp sgt <16 x i8> %x1, %y1 + %y = load <16 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <16 x i8> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: @test128_12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind { + %mask1 = icmp sge <8 x i16> %x1, %y1 + %y = load <8 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <8 x i16> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll new file mode 100644 index 00000000000..9c64c0341e3 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -0,0 +1,381 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test256_1 +; CHECK: vpcmpeqq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp eq <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_2 +; CHECK: vpcmpgtq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp sgt <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_3 +; CHECK: vpcmpled {{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind { + %mask = icmp sge <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_4 +; CHECK: vpcmpnleuq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp 
ugt <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_5 +; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { + %y = load <8 x i32>* %yp, align 4 + %mask = icmp eq <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_6 +; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp sgt <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_7 +; CHECK: vpcmpled (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp sle <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_8 +; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp ule <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_9 +; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp eq <8 x i32> %x1, %y1 + %mask0 = icmp eq <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_10 +; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sge <4 x i64> %x1, %y1 + %mask0 = icmp sle <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_11 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sgt <4 x i64> %x1, %y1 + %y = load <4 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_12 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp sge <8 x i32> %x1, %y1 + %y = load <8 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_13 +; CHECK: vpcmpeqq (%rdi){1to4}, %ymm +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + 
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer + %mask = icmp eq <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_14 +; CHECK: vpcmpled (%rdi){1to8}, %ymm +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer + %mask = icmp sle <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_15 +; CHECK: vpcmpgtd (%rdi){1to8}, %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp sge <8 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer + %mask0 = icmp sgt <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_16 +; CHECK: vpcmpgtq (%rdi){1to4}, %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sge <4 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer + %mask0 = icmp sgt <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: test128_1 +; CHECK: vpcmpeqq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp eq <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_2 +; CHECK: vpcmpgtq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp sgt <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_3 +; CHECK: vpcmpled {{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind { + %mask = icmp sge <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_4 +; CHECK: vpcmpnleuq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp ugt <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_5 +; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { + %y = load <4 x i32>* %yp, align 4 + %mask = icmp eq <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_6 +; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}} 
+; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp sgt <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_7 +; CHECK: vpcmpled (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp sle <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_8 +; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp ule <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_9 +; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp eq <4 x i32> %x1, %y1 + %mask0 = icmp eq <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_10 +; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sge <2 x i64> %x1, %y1 + %mask0 = icmp sle <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_11 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sgt <2 x i64> %x1, %y1 + %y = load <2 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_12 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp sge <4 x i32> %x1, %y1 + %y = load <4 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_13 +; CHECK: vpcmpeqq (%rdi){1to2}, %xmm +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 + %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 + %mask = icmp eq <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_14 +; CHECK: vpcmpled (%rdi){1to4}, %xmm +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 + %y = 
shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer + %mask = icmp sle <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_15 +; CHECK: vpcmpgtd (%rdi){1to4}, %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp sge <4 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer + %mask0 = icmp sgt <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_16 +; CHECK: vpcmpgtq (%rdi){1to2}, %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sge <2 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 + %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 + %mask0 = icmp sgt <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} -- 2.11.0
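A minimal standalone reproducer in the spirit of the tests above (the file and function names are illustrative, not part of this patch): with BWI and VLX available, a byte-element signed max should compile to a k-register compare plus a masked move, rather than the sign-extend-the-condition + vpblendvb fallback that the pre-SKX path in PerformSELECTCombine uses.

; smax.ll -- try: llc smax.ll -mtriple=x86_64-apple-darwin -mcpu=skx
; Expected output, per the CHECK patterns in avx512bwvl-vec-cmp.ll:
;   vpcmpgtb ... %k1       ; the compare writes an i1 mask register
;   vmovdqu8 ... {%k1}     ; the select becomes a masked move
define <32 x i8> @smax_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
  %mask = icmp sgt <32 x i8> %x, %y
  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
  ret <32 x i8> %max
}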