From f071b970e463948c160677d37e8c17b7776663cc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 5 Dec 2017 12:14:36 +0000
Subject: [PATCH] [X86][AVX512] Tag VPCMP/VPCMPU instruction scheduler classes

Move the hardcoded itinerary out to the instruction declarations. Not sure
that IIC_SSE_ALU_F32P is the best schedule for integer comparisons, but I'm
not going to change it right now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319760 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td    | 102 +++++++++++++++++++++---------
 test/CodeGen/X86/avx512-schedule.ll |  86 +++++++++++++++---------
 2 files changed, 103 insertions(+), 85 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 29c455d353c..6d0b2f4d3a2 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2092,7 +2092,7 @@ def CommutePCMPCC : SDNodeXForm<imm, [{
 }]>;
 
 multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
-                          X86VectorVTInfo _> {
+                          OpndItins itins, X86VectorVTInfo _> {
   let isCommutable = 1 in
   def rri : AVX512AIi8<opc, MRMSrcReg,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
@@ -2101,7 +2101,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
                                        imm:$cc))],
-             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+             itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
   def rmi : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
@@ -2109,7 +2109,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
                                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
                                        imm:$cc))],
-             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+             itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   let isCommutable = 1 in
   def rrik : AVX512AIi8<opc, MRMSrcReg,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@@ -2119,7 +2119,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
               [(set _.KRC:$dst, (and _.KRCWM:$mask,
                                      (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
                                              imm:$cc)))],
-              IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+              itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
   def rmik : AVX512AIi8<opc, MRMSrcMem,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
                                       AVX512ICC:$cc),
@@ -2129,7 +2129,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
               (OpNode (_.VT _.RC:$src1),
                       (_.VT (bitconvert (_.LdFrag addr:$src2))),
                       imm:$cc)))],
-              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+              itins.rm>, EVEX_4V, EVEX_K,
+              Sched<[itins.Sched.Folded, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
@@ -2138,20 +2139,20 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
     def rri_alt : AVX512AIi8<opc, MRMSrcReg,
                (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
                !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
                           "$dst, $src1, $src2, $cc}"),
-               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+               [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
     let mayLoad = 1 in
     def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
                (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
                !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
                           "$dst, $src1, $src2, $cc}"),
-               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+               [], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
     def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
                (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
                                        u8imm:$cc),
                !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|",
                           "$dst {${mask}}, $src1, $src2, $cc}"),
-               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+               [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
     let mayLoad = 1 in
     def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
                (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
@@ -2161,7 +2162,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                                        u8imm:$cc),
                !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|",
                           "$dst {${mask}}, $src1, $src2, $cc}"),
-               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+               [], itins.rm>, EVEX_4V, EVEX_K,
+               Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 
   def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
@@ -2175,8 +2177,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
 }
 
 multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
-                              X86VectorVTInfo _> :
-           avx512_icmp_cc<opc, Suffix, OpNode, _> {
+                              OpndItins itins, X86VectorVTInfo _> :
+           avx512_icmp_cc<opc, Suffix, OpNode, itins, _> {
   def rmib : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
                                      AVX512ICC:$cc),
@@ -2186,7 +2188,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
                                (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
                                imm:$cc))],
-             IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+             itins.rm>, EVEX_4V, EVEX_B,
+             Sched<[itins.Sched.Folded, ReadAfterLd]>;
   def rmibk : AVX512AIi8<opc, MRMSrcMem,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
                                       _.ScalarMemOp:$src2, AVX512ICC:$cc),
@@ -2196,7 +2199,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
               (OpNode (_.VT _.RC:$src1),
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
                       imm:$cc)))],
-              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+              itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
+              Sched<[itins.Sched.Folded, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
@@ -2207,14 +2211,16 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                !strconcat("vpcmp", Suffix,
                           "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
                           "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
-               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+               [], itins.rm>, EVEX_4V, EVEX_B,
+               Sched<[itins.Sched.Folded, ReadAfterLd]>;
     def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
                (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
                                        _.ScalarMemOp:$src2, u8imm:$cc),
                !strconcat("vpcmp", Suffix,
                           "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
                           "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
-               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+               [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
+               Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 
   def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
@@ -2231,49 +2237,61 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
 }
 
 multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
-                             AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+                             OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+                             Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
+  defm Z : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info512>,
+           EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
-    defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
+    defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info256>,
+                EVEX_V256;
+    defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info128>,
+                EVEX_V128;
   }
 }
 
 multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
-                                 AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+                                 OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+                                 Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
+  defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info512>,
           EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
+    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info256>,
                 EVEX_V256;
-    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
+    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info128>,
                 EVEX_V128;
   }
 }
 
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
-                                HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
-                                 HasBWI>, EVEX_CD8<8, CD8VF>;
-
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
-                                HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
-                                 HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
-
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
-                                    HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
-                                     HasAVX512>, EVEX_CD8<32, CD8VF>;
-
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
-                                    HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
-                                     HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P,
+                                avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P,
+                                 avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P,
+                                avx512vl_i16_info, HasBWI>,
+                                VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P,
+                                 avx512vl_i16_info, HasBWI>,
+                                 VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P,
+                                    avx512vl_i32_info, HasAVX512>,
+                                    EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P,
+                                     avx512vl_i32_info, HasAVX512>,
+                                     EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P,
+                                    avx512vl_i64_info, HasAVX512>,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P,
+                                     avx512vl_i64_info, HasAVX512>,
+                                     VEX_W, EVEX_CD8<64, CD8VF>;
 
 multiclass avx512_vcmp_common<X86VectorVTInfo _> {
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 6c879231447..0c9e22db426 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -401,7 +401,7 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma
 ; GENERIC-LABEL: vpaddd_mask_test:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -421,7 +421,7 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m
 ; GENERIC-LABEL: vpaddd_maskz_test:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -441,7 +441,7 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
 ; GENERIC-LABEL: vpaddd_mask_fold_test:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -462,7 +462,7 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
 ; GENERIC-LABEL: vpaddd_mask_broadcast_test:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -482,7 +482,7 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
 ; GENERIC-LABEL: vpaddd_maskz_fold_test:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -503,7 +503,7 @@ define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
 ; GENERIC-LABEL: vpaddd_maskz_broadcast_test:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -718,7 +718,7 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
 ; GENERIC-LABEL: test_mask_vaddps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -740,7 +740,7 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x
 ; GENERIC-LABEL: test_mask_vmulps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -760,7 +760,7 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x
 ; GENERIC-LABEL: test_mask_vminps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -781,7 +781,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x d
 ; GENERIC-LABEL: test_mask_vminpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -802,7 +802,7 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x
 ; GENERIC-LABEL: test_mask_vmaxps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -823,7 +823,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x d
 ; GENERIC-LABEL: test_mask_vmaxpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -844,7 +844,7 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x
 ; GENERIC-LABEL: test_mask_vsubps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -864,7 +864,7 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x
 ; GENERIC-LABEL: test_mask_vdivps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -884,7 +884,7 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x d
 ; GENERIC-LABEL: test_mask_vaddpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -904,7 +904,7 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i6
 ; GENERIC-LABEL: test_maskz_vaddpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -924,7 +924,7 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <
 ; GENERIC-LABEL: test_mask_fold_vaddpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -945,7 +945,7 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <
 ; GENERIC-LABEL: test_maskz_fold_vaddpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -984,7 +984,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
 ; GENERIC-LABEL: test_mask_broadcast_vaddpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1010,7 +1010,7 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
 ; GENERIC-LABEL: test_maskz_broadcast_vaddpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2965,7 +2965,7 @@ define <2 x float> @ubto2f32(<2 x i32> %a) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
-; GENERIC-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
 ; GENERIC-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2988,7 +2988,7 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
-; GENERIC-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4423,7 +4423,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; GENERIC-LABEL: sext_8i1_8i32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpcmpled %ymm0, %ymm1, %k0
+; GENERIC-NEXT:    vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpmovm2d %k0, %ymm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6402,7 +6402,7 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test32:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6423,7 +6423,7 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test33:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6444,7 +6444,7 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test34:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6465,7 +6465,7 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test35:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6486,7 +6486,7 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test36:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6507,7 +6507,7 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test37:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6528,7 +6528,7 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test38:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6549,7 +6549,7 @@ define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test39:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6947,7 +6947,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
 define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-LABEL: zext_test1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; GENERIC-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    kshiftlw $10, %k0, %k0
 ; GENERIC-NEXT:    kshiftrw $15, %k0, %k0
 ; GENERIC-NEXT:    kmovd %k0, %eax
@@ -6973,7 +6973,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-LABEL: zext_test2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; GENERIC-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    kshiftlw $10, %k0, %k0
 ; GENERIC-NEXT:    kshiftrw $15, %k0, %k0
 ; GENERIC-NEXT:    kmovd %k0, %eax
@@ -7001,7 +7001,7 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-LABEL: zext_test3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; GENERIC-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    kshiftlw $10, %k0, %k0
 ; GENERIC-NEXT:    kshiftrw $15, %k0, %k0
 ; GENERIC-NEXT:    kmovd %k0, %eax
@@ -7150,7 +7150,7 @@ define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ; GENERIC-NEXT:    cmpl %esi, %edi # sched: [1:0.33]
 ; GENERIC-NEXT:    jg .LBB386_1 # sched: [1:1.00]
 ; GENERIC-NEXT:  # %bb.2:
-; GENERIC-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
+; GENERIC-NEXT:    vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpmovm2b %k0, %xmm0
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -8339,7 +8339,7 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m
 ; GENERIC-LABEL: _ss16xfloat_mask:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -8362,7 +8362,7 @@ define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
 ; GENERIC-LABEL: _ss16xfloat_maskz:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8399,7 +8399,7 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16
 ; GENERIC-LABEL: _ss16xfloat_mask_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8421,7 +8421,7 @@ define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1)
 ; GENERIC-LABEL: _ss16xfloat_maskz_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8458,7 +8458,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
 ; GENERIC-LABEL: _sd8xdouble_mask:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -8481,7 +8481,7 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
 ; GENERIC-LABEL: _sd8xdouble_maskz:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8518,7 +8518,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
 ; GENERIC-LABEL: _sd8xdouble_mask_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8540,7 +8540,7 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1)
 ; GENERIC-LABEL: _sd8xdouble_maskz_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-- 
2.11.0
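The sketch below is not part of the patch; it is a minimal, hypothetical illustration of the TableGen pattern the patch applies: threading an OpndItins bundle through a multiclass instead of hardcoding IIC_* itineraries at each def. It assumes the OpndItins class of this era (an rr/rm itinerary pair plus a foldable SchedWrite, per X86InstrInfo.td); the names example_icmp and EXPCMPD are invented for illustration.

    // One OpndItins bundle supplies both itineraries and the SchedWrite;
    // SSE_ALU_F32P is the bundle this patch passes down for VPCMP/VPCMPU.
    multiclass example_icmp<bits<8> opc, string Name, OpndItins itins,
                            X86VectorVTInfo _> {
      // Register-register form: the rr itinerary plus the plain SchedWrite.
      def rri : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst),
                  (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
                  Name#"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                  [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
      // Folded-load form: the rm itinerary, the .Folded SchedWrite variant,
      // plus ReadAfterLd so the register source is read after the load.
      def rmi : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst),
                  (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
                  Name#"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                  [], itins.rm>, EVEX_4V,
                  Sched<[itins.Sched.Folded, ReadAfterLd]>;
    }

    // An instantiation then picks the scheduling data exactly once:
    defm EXPCMPD : example_icmp<0x1F, "vpcmpd", SSE_ALU_F32P, v16i32_info>;

The point of the design is that register and folded-load forms stay in sync automatically: swapping SSE_ALU_F32P for a better-suited bundle later (the FIXME in the patch) would update both the itineraries and the SchedWrite for every VPCMP/VPCMPU variant in one place.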