; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
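; A minimal sketch of how these CHECK lines are regenerated after editing the RUN
; line above (the test path is a placeholder, not the actual file name):
;   llvm/utils/update_llc_test_checks.py <path-to-this-test>.ll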
declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
; CHECK-NEXT: vpbroadcastd %edi, %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastd %edi, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
; CHECK-NEXT: vpbroadcastq %rdi, %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1, i8 -1)
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovups %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovupd %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovaps %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovapd %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
ret i16 %res
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret i16 %res
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
ret i8 %res
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret i8 %res
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
ret i16 %res
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret i16 %res
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
ret i8 %res
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret i8 %res
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
; CHECK-LABEL: test_storent_q_512:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
ret void
; CHECK-LABEL: test_storent_pd_512:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
ret void
; CHECK-LABEL: test_storent_ps_512:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
ret void
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsrad $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpslld $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
ret <4 x float> %res
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
; CHECK-NEXT: andb %al, %dil
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
%res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
; CHECK-NEXT: andl %eax, %edi
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
%res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
; CHECK-NEXT: andl %eax, %edi
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
; CHECK-NEXT: andb %al, %dil
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kortestw %k1, %k0
; CHECK-NEXT: sete %al
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x i64> %A to <16 x i32>
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kortestw %k1, %k0
; CHECK-NEXT: sete %al
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x i64> %A to <16 x i32>
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
ret i16 %res
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
ret i8 %res
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_512:
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
%1 = bitcast <16 x i1> %res to i16
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
%1 = bitcast <8 x i1> %res to i8
; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovqb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovsqb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovusqb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovqw %zmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpmovqd %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpmovsqd %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpmovusqd %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovdb %zmm0, (%rdi)
; CHECK-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovsdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsdb %zmm0, (%rdi)
; CHECK-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovusdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusdb %zmm0, (%rdi)
; CHECK-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpmovdw %zmm0, %ymm0
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovdw %zmm0, (%rdi)
; CHECK-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpmovsdw %zmm0, %ymm0
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsdw %zmm0, (%rdi)
; CHECK-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpmovusdw %zmm0, %ymm0
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusdw %zmm0, (%rdi)
; CHECK-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
; CHECK-NEXT: kmovw %k0, %esi
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: orb %cl, %dl
; CHECK-NEXT: orb %sil, %al
; CHECK-NEXT: orb %dl, %al
+; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
; CHECK-NEXT: kmovw %k0, %esi
; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andb %cl, %dl
; CHECK-NEXT: andb %sil, %al
; CHECK-NEXT: andb %dl, %al
+; CHECK-NEXT: andb %cl, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
define <16 x i32> @test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
define <8 x i64> @test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vprold $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
%2 = bitcast i8 %x3 to <8 x i1>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
%2 = bitcast i16 %x3 to <16 x i1>