From: Craig Topper
Date: Thu, 21 Jun 2018 05:00:56 +0000 (+0000)
Subject: [X86] Remove masking from 512-bit floating max/min intrinsics. Use select instruction...
X-Git-Tag: android-x86-8.1-r1~1849
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=cb5e58dbd93b62afb455c2ca8ee47c57eaf2a5c1;p=android-x86%2Fexternal-llvm.git

[X86] Remove masking from 512-bit floating max/min intrinsics. Use select instruction instead.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@335199 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index d9701c2a9e8..399d327f9da 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3752,18 +3752,18 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
+ def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">,
 Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">,
 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">,
 Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">,
 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ llvm_i32_ty], [IntrNoMem]>;
 def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round_mask">,
 Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 7aa2685a14e..4a79275feea 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -225,14 +225,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
 Name.startswith("avx512.cvtw2mask.") || // Added in 7.0
 Name.startswith("avx512.cvtd2mask.") || // Added in 7.0
 Name.startswith("avx512.cvtq2mask.") || // Added in 7.0
- Name == "avx512.mask.max.pd.128" || // Added in 5.0
- Name == "avx512.mask.max.pd.256" || // Added in 5.0
- Name == "avx512.mask.max.ps.128" || // Added in 5.0
- Name == "avx512.mask.max.ps.256" || // Added in 5.0
- Name == "avx512.mask.min.pd.128" || // Added in 5.0
- Name == "avx512.mask.min.pd.256" || // Added in 5.0
- Name == "avx512.mask.min.ps.128" || // Added in 5.0
- Name == "avx512.mask.min.ps.256" || // Added in 5.0
 Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
 Name.startswith("avx512.mask.psll.d") || // Added in 4.0
 Name.startswith("avx512.mask.psll.q") || // Added in 4.0
@@ -274,10 +266,12 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
 Name.startswith("avx512.mask.dbpsadbw.") || // Added in 7.0
 Name.startswith("avx512.mask.vpshld.") || // Added in 7.0
 Name.startswith("avx512.mask.vpshrd.") || // Added in
7.0 - Name.startswith("avx512.mask.add.p") || // Added in 7.0 - Name.startswith("avx512.mask.sub.p") || // Added in 7.0 - Name.startswith("avx512.mask.mul.p") || // Added in 7.0 - Name.startswith("avx512.mask.div.p") || // Added in 7.0 + Name.startswith("avx512.mask.add.p") || // Added in 7.0. 128/256 in 4.0 + Name.startswith("avx512.mask.sub.p") || // Added in 7.0. 128/256 in 4.0 + Name.startswith("avx512.mask.mul.p") || // Added in 7.0. 128/256 in 4.0 + Name.startswith("avx512.mask.div.p") || // Added in 7.0. 128/256 in 4.0 + Name.startswith("avx512.mask.max.p") || // Added in 7.0. 128/256 in 5.0 + Name.startswith("avx512.mask.min.p") || // Added in 7.0. 128/256 in 5.0 Name == "sse.cvtsi2ss" || // Added in 7.0 Name == "sse.cvtsi642ss" || // Added in 7.0 Name == "sse2.cvtsi2sd" || // Added in 7.0 @@ -2383,6 +2377,32 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { } Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (IsX86 && Name.startswith("avx512.mask.max.p") && + Name.drop_front(18) == ".512") { + Intrinsic::ID IID; + if (Name[17] == 's') + IID = Intrinsic::x86_avx512_max_ps_512; + else + IID = Intrinsic::x86_avx512_max_pd_512; + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + { CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4) }); + Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } else if (IsX86 && Name.startswith("avx512.mask.min.p") && + Name.drop_front(18) == ".512") { + Intrinsic::ID IID; + if (Name[17] == 's') + IID = Intrinsic::x86_avx512_min_ps_512; + else + IID = Intrinsic::x86_avx512_min_pd_512; + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + { CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4) }); + Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 65954f01b81..61fac9fcb0b 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -668,18 +668,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK, X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), - X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, - X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, - X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK, X86ISD::FMAXS, X86ISD::FMAXS_RND), X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK, X86ISD::FMAXS, X86ISD::FMAXS_RND), - X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, - X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, - X86ISD::FMIN_RND), X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK, X86ISD::FMINS, X86ISD::FMINS_RND), X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK, @@ -1085,6 +1077,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_max_pd_512, 
INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND), X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index eb49656d403..4f17a84d6d9 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -8697,6 +8697,366 @@ entry: ret float %vecext.i } +define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_mask_max_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_max_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W + ret <8 x double> %2 +} + +define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_maskz_max_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_max_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer + ret <8 x double> %2 +} + +define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_mask_max_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_max_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W + ret <16 x float> %2 +} + +define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_mask_max_round_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_max_round_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x 
double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W + ret <8 x double> %2 +} + +declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) + +define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_maskz_max_round_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_max_round_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer + ret <8 x double> %2 +} + +define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) { +; CHECK-LABEL: test_mm512_max_round_pd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + ret <8 x double> %0 +} + +define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_maskz_max_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_max_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} + +define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_mask_max_round_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_max_round_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W + ret <16 x float> %2 +} + +declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_maskz_max_round_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_max_round_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} + +define <16 x 
float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) { +; CHECK-LABEL: test_mm512_max_round_ps: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + ret <16 x float> %0 +} + +define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_mask_min_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_min_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W + ret <8 x double> %2 +} + +define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_maskz_min_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_min_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer + ret <8 x double> %2 +} + +define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_mask_min_round_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_min_round_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W + ret <8 x double> %2 +} + +declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) + +define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { +; X86-LABEL: test_mm512_maskz_min_round_pd: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_min_round_pd: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer + ret <8 x double> %2 +} + +define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) { +; CHECK-LABEL: test_mm512_min_round_pd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} 
+entry: + %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) + ret <8 x double> %0 +} + +define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_mask_min_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_min_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W + ret <16 x float> %2 +} + +define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_maskz_min_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_min_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} + +define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_mask_min_round_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_min_round_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W + ret <16 x float> %2 +} + +declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { +; X86-LABEL: test_mm512_maskz_min_round_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_min_round_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} + +define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) { +; CHECK-LABEL: test_mm512_min_round_ps: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) + ret <16 x float> %0 +} + declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9 
declare float @llvm.fma.f32(float, float, float) #9 diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index cb1706f5710..61555d4cb7e 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -8198,3 +8198,177 @@ define <16 x i32> @test_expand_load_d_512(i8* %addr, <16 x i32> %data) { %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(i8* %addr, <16 x i32> %data, i16 -1) ret <16 x i32> %res } + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; X86-LABEL: test_mm512_maskz_min_round_ps_sae: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x5d,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_min_round_ps_sae: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x5d,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; X86-LABEL: test_mm512_maskz_min_round_ps_current: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x5d,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_min_round_ps_current: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x5d,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; X86-LABEL: test_mm512_mask_min_round_ps_sae: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5d,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_min_round_ps_sae: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5d,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; X86-LABEL: test_mm512_mask_min_round_ps_current: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: 
[0x62,0xf1,0x7c,0x49,0x5d,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_min_round_ps_current: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5d,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_min_round_ps_sae: +; CHECK: ## %bb.0: +; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_min_round_ps_current: +; CHECK: ## %bb.0: +; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; X86-LABEL: test_mm512_maskz_max_round_ps_sae: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x5f,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_max_round_ps_sae: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x5f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; X86-LABEL: test_mm512_maskz_max_round_ps_current: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x5f,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_max_round_ps_current: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x5f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; X86-LABEL: 
test_mm512_mask_max_round_ps_sae: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5f,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_max_round_ps_sae: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5f,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { +; X86-LABEL: test_mm512_mask_max_round_ps_current: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5f,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_max_round_ps_current: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5f,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_max_round_ps_sae: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_max_round_ps_current: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 546f29d7fc0..2023c238b7c 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -930,24 +930,20 @@ define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1, - <8 x double>zeroinitializer, i8 -1, i32 4) - ret <8 x double> %res + %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> 
%a1, i32 4) + ret <8 x double> %1 } -declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>, - <8 x double>, i8, i32) +declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: test_vminpd: ; CHECK: ## %bb.0: ; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1, - <8 x double>zeroinitializer, i8 -1, i32 4) - ret <8 x double> %res + %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) + ret <8 x double> %1 } -declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, - <8 x double>, i8, i32) +declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) { ; CHECK-LABEL: test_mask_store_ss: @@ -1646,8 +1642,10 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1656,8 +1654,10 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1667,8 +1667,10 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1678,8 +1680,10 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> 
@llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1687,8 +1691,8 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK: ## %bb.0: ; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 } define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1696,10 +1700,10 @@ define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK: ## %bb.0: ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 } -declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae: @@ -1707,8 +1711,10 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1717,8 +1723,10 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1728,8 +1736,10 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x 
float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1739,8 +1749,10 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1748,8 +1760,8 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK: ## %bb.0: ; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 } define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1757,10 +1769,10 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK: ## %bb.0: ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 } -declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone diff --git a/test/CodeGen/X86/stack-folding-fp-avx512.ll b/test/CodeGen/X86/stack-folding-fp-avx512.ll index 4b5179ad506..4aab6a27472 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -242,16 +242,16 @@ define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 ;CHECK-LABEL: stack_fold_maxpd_zmm ;CHECK: vmaxpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> zeroinitializer, i8 -1, i32 4) + %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) ret <8 x double> %2 } -declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +declare <8 x double> 
@llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 { ;CHECK-LABEL: stack_fold_maxpd_zmm_commutable ;CHECK: vmaxpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> zeroinitializer, i8 -1, i32 4) + %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) ret <8 x double> %2 } @@ -259,24 +259,26 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x d ;CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz ;CHECK: vmaxpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a1, <8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4) - ret <8 x double> %2 + %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer + ret <8 x double> %4 } define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { ;CHECK-LABEL: stack_fold_maxps_zmm ;CHECK: vmaxps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) ret <16 x float> %2 } -declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 { ;CHECK-LABEL: stack_fold_maxps_zmm_commutable ;CHECK: vmaxps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x float> 
@llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) ret <16 x float> %2 } @@ -284,24 +286,26 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x ;CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz ;CHECK: vmaxps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a1, <16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %2 + %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer + ret <16 x float> %4 } define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { ;CHECK-LABEL: stack_fold_minpd_zmm ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> zeroinitializer, i8 -1, i32 4) + %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) ret <8 x double> %2 } -declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 { ;CHECK-LABEL: stack_fold_minpd_zmm_commutable ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> zeroinitializer, i8 -1, i32 4) + %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) ret <8 x double> %2 } @@ -309,24 +313,26 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x d ;CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a1, <8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4) - ret <8 x double> %2 + %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer + ret <8 x double> %4 } define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { ;CHECK-LABEL: stack_fold_minps_zmm ;CHECK: vminps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) ret <16 x float> %2 } -declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 { ;CHECK-LABEL: stack_fold_minps_zmm_commutable ;CHECK: vminps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) ret <16 x float> %2 } @@ -334,8 +340,10 @@ define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x ;CHECK-LABEL: stack_fold_minps_zmm_commutable_kz ;CHECK: vminps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a1, <16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %2 + %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer + ret <16 x float> %4 } define double 
@stack_fold_mulsd(double %a0, double %a1) {
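
For reference, the auto-upgrade path added in AutoUpgrade.cpp rewrites a call to one of the removed masked 512-bit max/min intrinsics into the new unmasked intrinsic followed by a select on the mask, which is the same shape the fast-isel tests above check. A minimal self-contained IR sketch of that before/after pattern (the function and value names here are illustrative and not taken from the patch; the rounding argument 4 selects the current rounding mode):

; Sketch only: illustrative names, not part of the commit.
define <16 x float> @masked_max_upgrade_sketch(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
  ; Old form, now handled by the auto-upgrader:
  ;   %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a, <16 x float> %b,
  ;                                                             <16 x float> %passthru, i16 %mask, i32 4)
  ; Equivalent new form: unmasked intrinsic plus an explicit mask select.
  %r = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
  %m = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %m, <16 x float> %r, <16 x float> %passthru
  ret <16 x float> %res
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)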