1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
8 define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
9 ; X86-LABEL: test_mm512_kunpackb:
10 ; X86: # %bb.0: # %entry
11 ; X86-NEXT: pushl %ebp
12 ; X86-NEXT: .cfi_def_cfa_offset 8
13 ; X86-NEXT: .cfi_offset %ebp, -8
14 ; X86-NEXT: movl %esp, %ebp
15 ; X86-NEXT: .cfi_def_cfa_register %ebp
16 ; X86-NEXT: andl $-64, %esp
17 ; X86-NEXT: subl $64, %esp
18 ; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
19 ; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
20 ; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
21 ; X86-NEXT: kunpckbw %k0, %k1, %k1
22 ; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
23 ; X86-NEXT: kmovw %k0, %eax
24 ; X86-NEXT: movzwl %ax, %eax
25 ; X86-NEXT: movl %ebp, %esp
27 ; X86-NEXT: .cfi_def_cfa %esp, 4
28 ; X86-NEXT: vzeroupper
31 ; X64-LABEL: test_mm512_kunpackb:
32 ; X64: # %bb.0: # %entry
33 ; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
34 ; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
35 ; X64-NEXT: kunpckbw %k0, %k1, %k1
36 ; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
37 ; X64-NEXT: kmovw %k0, %eax
38 ; X64-NEXT: movzwl %ax, %eax
39 ; X64-NEXT: vzeroupper
42 %0 = bitcast <8 x i64> %__E to <16 x i32>
43 %1 = bitcast <8 x i64> %__F to <16 x i32>
44 %2 = bitcast <8 x i64> %__A to <16 x i32>
45 %3 = bitcast <8 x i64> %__B to <16 x i32>
46 %4 = icmp ne <16 x i32> %2, %3
47 %5 = bitcast <8 x i64> %__C to <16 x i32>
48 %6 = bitcast <8 x i64> %__D to <16 x i32>
49 %7 = icmp ne <16 x i32> %5, %6
50 %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
51 %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
52 %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
53 %11 = icmp ne <16 x i32> %0, %1
54 %12 = and <16 x i1> %11, %10
55 %13 = bitcast <16 x i1> %12 to i16
59 define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
60 ; X86-LABEL: test_mm512_kortestc:
61 ; X86: # %bb.0: # %entry
62 ; X86-NEXT: pushl %ebp
63 ; X86-NEXT: .cfi_def_cfa_offset 8
64 ; X86-NEXT: .cfi_offset %ebp, -8
65 ; X86-NEXT: movl %esp, %ebp
66 ; X86-NEXT: .cfi_def_cfa_register %ebp
67 ; X86-NEXT: andl $-64, %esp
68 ; X86-NEXT: subl $64, %esp
69 ; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
70 ; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
71 ; X86-NEXT: korw %k0, %k1, %k0
72 ; X86-NEXT: kmovw %k0, %eax
73 ; X86-NEXT: cmpw $-1, %ax
75 ; X86-NEXT: andb $1, %al
76 ; X86-NEXT: movzbl %al, %eax
77 ; X86-NEXT: movl %ebp, %esp
79 ; X86-NEXT: .cfi_def_cfa %esp, 4
80 ; X86-NEXT: vzeroupper
83 ; X64-LABEL: test_mm512_kortestc:
84 ; X64: # %bb.0: # %entry
85 ; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
86 ; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
87 ; X64-NEXT: korw %k0, %k1, %k0
88 ; X64-NEXT: kmovw %k0, %eax
89 ; X64-NEXT: cmpw $-1, %ax
91 ; X64-NEXT: andb $1, %al
92 ; X64-NEXT: movzbl %al, %eax
93 ; X64-NEXT: vzeroupper
96 %0 = bitcast <8 x i64> %__A to <16 x i32>
97 %1 = bitcast <8 x i64> %__B to <16 x i32>
98 %2 = icmp ne <16 x i32> %0, %1
99 %3 = bitcast <8 x i64> %__C to <16 x i32>
100 %4 = bitcast <8 x i64> %__D to <16 x i32>
101 %5 = icmp ne <16 x i32> %3, %4
102 %6 = or <16 x i1> %5, %2
 %7 = bitcast <16 x i1> %6 to i16
103 %8 = icmp eq i16 %7, -1
104 %9 = zext i1 %8 to i32
108 define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
109 ; X86-LABEL: test_mm512_kortestz:
110 ; X86: # %bb.0: # %entry
111 ; X86-NEXT: pushl %ebp
112 ; X86-NEXT: .cfi_def_cfa_offset 8
113 ; X86-NEXT: .cfi_offset %ebp, -8
114 ; X86-NEXT: movl %esp, %ebp
115 ; X86-NEXT: .cfi_def_cfa_register %ebp
116 ; X86-NEXT: andl $-64, %esp
117 ; X86-NEXT: subl $64, %esp
118 ; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
119 ; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
120 ; X86-NEXT: korw %k0, %k1, %k0
121 ; X86-NEXT: kmovw %k0, %eax
122 ; X86-NEXT: cmpw $0, %ax
124 ; X86-NEXT: andb $1, %al
125 ; X86-NEXT: movzbl %al, %eax
126 ; X86-NEXT: movl %ebp, %esp
127 ; X86-NEXT: popl %ebp
128 ; X86-NEXT: .cfi_def_cfa %esp, 4
129 ; X86-NEXT: vzeroupper
132 ; X64-LABEL: test_mm512_kortestz:
133 ; X64: # %bb.0: # %entry
134 ; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
135 ; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
136 ; X64-NEXT: korw %k0, %k1, %k0
137 ; X64-NEXT: kmovw %k0, %eax
138 ; X64-NEXT: cmpw $0, %ax
140 ; X64-NEXT: andb $1, %al
141 ; X64-NEXT: movzbl %al, %eax
142 ; X64-NEXT: vzeroupper
145 %0 = bitcast <8 x i64> %__A to <16 x i32>
146 %1 = bitcast <8 x i64> %__B to <16 x i32>
147 %2 = icmp ne <16 x i32> %0, %1
148 %3 = bitcast <8 x i64> %__C to <16 x i32>
149 %4 = bitcast <8 x i64> %__D to <16 x i32>
150 %5 = icmp ne <16 x i32> %3, %4
151 %6 = or <16 x i1> %5, %2
152 %7 = bitcast <16 x i1> %6 to i16
153 %8 = icmp eq i16 %7, 0
154 %9 = zext i1 %8 to i32
158 define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
159 ; CHECK-LABEL: test_mm512_shuffle_f32x4:
160 ; CHECK: # %bb.0: # %entry
161 ; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
162 ; CHECK-NEXT: ret{{[l|q]}}
164 %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
165 ret <16 x float> %shuffle
169 define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
170 ; X86-LABEL: test_mm512_mask_shuffle_f32x4:
171 ; X86: # %bb.0: # %entry
172 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
173 ; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
176 ; X64-LABEL: test_mm512_mask_shuffle_f32x4:
177 ; X64: # %bb.0: # %entry
178 ; X64-NEXT: kmovw %edi, %k1
179 ; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
182 %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
183 %0 = bitcast i16 %__U to <16 x i1>
184 %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
188 define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
189 ; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
190 ; X86: # %bb.0: # %entry
191 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
192 ; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
195 ; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
196 ; X64: # %bb.0: # %entry
197 ; X64-NEXT: kmovw %edi, %k1
198 ; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
201 %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
202 %0 = bitcast i16 %__U to <16 x i1>
203 %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
207 define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
208 ; CHECK-LABEL: test_mm512_shuffle_f64x2:
209 ; CHECK: # %bb.0: # %entry
210 ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
211 ; CHECK-NEXT: ret{{[l|q]}}
213 %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
214 ret <8 x double> %shuffle
217 define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
218 ; X86-LABEL: test_mm512_mask_shuffle_f64x2:
219 ; X86: # %bb.0: # %entry
220 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
221 ; X86-NEXT: kmovw %eax, %k1
222 ; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
225 ; X64-LABEL: test_mm512_mask_shuffle_f64x2:
226 ; X64: # %bb.0: # %entry
227 ; X64-NEXT: kmovw %edi, %k1
228 ; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
231 %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
232 %0 = bitcast i8 %__U to <8 x i1>
233 %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
237 define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
238 ; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
239 ; X86: # %bb.0: # %entry
240 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
241 ; X86-NEXT: kmovw %eax, %k1
242 ; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
245 ; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
246 ; X64: # %bb.0: # %entry
247 ; X64-NEXT: kmovw %edi, %k1
248 ; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
251 %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
252 %0 = bitcast i8 %__U to <8 x i1>
253 %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
257 define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
258 ; CHECK-LABEL: test_mm512_shuffle_i32x4:
259 ; CHECK: # %bb.0: # %entry
260 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
261 ; CHECK-NEXT: ret{{[l|q]}}
263 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
264 ret <8 x i64> %shuffle
267 define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
268 ; X86-LABEL: test_mm512_mask_shuffle_i32x4:
269 ; X86: # %bb.0: # %entry
270 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
271 ; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
274 ; X64-LABEL: test_mm512_mask_shuffle_i32x4:
275 ; X64: # %bb.0: # %entry
276 ; X64-NEXT: kmovw %edi, %k1
277 ; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
280 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
281 %0 = bitcast <8 x i64> %shuffle to <16 x i32>
282 %1 = bitcast <8 x i64> %__W to <16 x i32>
283 %2 = bitcast i16 %__U to <16 x i1>
284 %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
285 %4 = bitcast <16 x i32> %3 to <8 x i64>
289 define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
290 ; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
291 ; X86: # %bb.0: # %entry
292 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
293 ; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
296 ; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
297 ; X64: # %bb.0: # %entry
298 ; X64-NEXT: kmovw %edi, %k1
299 ; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
302 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
303 %0 = bitcast <8 x i64> %shuffle to <16 x i32>
304 %1 = bitcast i16 %__U to <16 x i1>
305 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
306 %3 = bitcast <16 x i32> %2 to <8 x i64>
310 define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
311 ; CHECK-LABEL: test_mm512_shuffle_i64x2:
312 ; CHECK: # %bb.0: # %entry
313 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
314 ; CHECK-NEXT: ret{{[l|q]}}
316 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
317 ret <8 x i64> %shuffle
320 define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
321 ; X86-LABEL: test_mm512_mask_shuffle_i64x2:
322 ; X86: # %bb.0: # %entry
323 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
324 ; X86-NEXT: kmovw %eax, %k1
325 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
328 ; X64-LABEL: test_mm512_mask_shuffle_i64x2:
329 ; X64: # %bb.0: # %entry
330 ; X64-NEXT: kmovw %edi, %k1
331 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
334 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
335 %0 = bitcast i8 %__U to <8 x i1>
336 %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
340 define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
341 ; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
342 ; X86: # %bb.0: # %entry
343 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
344 ; X86-NEXT: kmovw %eax, %k1
345 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
348 ; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
349 ; X64: # %bb.0: # %entry
350 ; X64-NEXT: kmovw %edi, %k1
351 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
354 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
355 %0 = bitcast i8 %__U to <8 x i1>
356 %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
361 define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
362 ; CHECK-LABEL: test_mm512_testn_epi32_mask:
363 ; CHECK: # %bb.0: # %entry
364 ; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
365 ; CHECK-NEXT: kmovw %k0, %eax
366 ; CHECK-NEXT: movzwl %ax, %eax
367 ; CHECK-NEXT: vzeroupper
368 ; CHECK-NEXT: ret{{[l|q]}}
370 %and1.i.i = and <8 x i64> %__B, %__A
371 %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
372 %1 = icmp eq <16 x i32> %0, zeroinitializer
373 %2 = bitcast <16 x i1> %1 to i16
377 define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
378 ; X86-LABEL: test_mm512_mask_testn_epi32_mask:
379 ; X86: # %bb.0: # %entry
380 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
381 ; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
382 ; X86-NEXT: kmovw %k0, %eax
383 ; X86-NEXT: movzwl %ax, %eax
384 ; X86-NEXT: vzeroupper
387 ; X64-LABEL: test_mm512_mask_testn_epi32_mask:
388 ; X64: # %bb.0: # %entry
389 ; X64-NEXT: kmovw %edi, %k1
390 ; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
391 ; X64-NEXT: kmovw %k0, %eax
392 ; X64-NEXT: movzwl %ax, %eax
393 ; X64-NEXT: vzeroupper
396 %and1.i.i = and <8 x i64> %__B, %__A
397 %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
398 %1 = icmp eq <16 x i32> %0, zeroinitializer
399 %2 = bitcast i16 %__U to <16 x i1>
400 %3 = and <16 x i1> %1, %2
401 %4 = bitcast <16 x i1> %3 to i16
405 define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
406 ; CHECK-LABEL: test_mm512_testn_epi64_mask:
407 ; CHECK: # %bb.0: # %entry
408 ; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
409 ; CHECK-NEXT: kmovw %k0, %eax
410 ; CHECK-NEXT: movzbl %al, %eax
411 ; CHECK-NEXT: vzeroupper
412 ; CHECK-NEXT: ret{{[l|q]}}
414 %and1.i.i = and <8 x i64> %__B, %__A
415 %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
416 %1 = bitcast <8 x i1> %0 to i8
420 define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
421 ; X86-LABEL: test_mm512_mask_testn_epi64_mask:
422 ; X86: # %bb.0: # %entry
423 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
424 ; X86-NEXT: kmovw %eax, %k1
425 ; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
426 ; X86-NEXT: kmovw %k0, %eax
427 ; X86-NEXT: movzbl %al, %eax
428 ; X86-NEXT: vzeroupper
431 ; X64-LABEL: test_mm512_mask_testn_epi64_mask:
432 ; X64: # %bb.0: # %entry
433 ; X64-NEXT: kmovw %edi, %k1
434 ; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
435 ; X64-NEXT: kmovw %k0, %eax
436 ; X64-NEXT: movzbl %al, %eax
437 ; X64-NEXT: vzeroupper
440 %and1.i.i = and <8 x i64> %__B, %__A
441 %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
442 %1 = bitcast i8 %__U to <8 x i1>
443 %2 = and <8 x i1> %0, %1
444 %3 = bitcast <8 x i1> %2 to i8
448 define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
449 ; X86-LABEL: test_mm512_mask_test_epi32_mask:
450 ; X86: # %bb.0: # %entry
451 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
452 ; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
453 ; X86-NEXT: kmovw %k0, %eax
454 ; X86-NEXT: movzwl %ax, %eax
455 ; X86-NEXT: vzeroupper
458 ; X64-LABEL: test_mm512_mask_test_epi32_mask:
459 ; X64: # %bb.0: # %entry
460 ; X64-NEXT: kmovw %edi, %k1
461 ; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
462 ; X64-NEXT: kmovw %k0, %eax
463 ; X64-NEXT: movzwl %ax, %eax
464 ; X64-NEXT: vzeroupper
467 %and1.i.i = and <8 x i64> %__B, %__A
468 %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
469 %1 = icmp ne <16 x i32> %0, zeroinitializer
470 %2 = bitcast i16 %__U to <16 x i1>
471 %3 = and <16 x i1> %1, %2
472 %4 = bitcast <16 x i1> %3 to i16
476 define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
477 ; X86-LABEL: test_mm512_mask_test_epi64_mask:
478 ; X86: # %bb.0: # %entry
479 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
480 ; X86-NEXT: kmovw %eax, %k1
481 ; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
482 ; X86-NEXT: kmovw %k0, %eax
483 ; X86-NEXT: movzbl %al, %eax
484 ; X86-NEXT: vzeroupper
487 ; X64-LABEL: test_mm512_mask_test_epi64_mask:
488 ; X64: # %bb.0: # %entry
489 ; X64-NEXT: kmovw %edi, %k1
490 ; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
491 ; X64-NEXT: kmovw %k0, %eax
492 ; X64-NEXT: movzbl %al, %eax
493 ; X64-NEXT: vzeroupper
496 %and1.i.i = and <8 x i64> %__B, %__A
497 %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
498 %1 = bitcast i8 %__U to <8 x i1>
499 %2 = and <8 x i1> %0, %1
500 %3 = bitcast <8 x i1> %2 to i8
504 define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
505 ; X86-LABEL: test_mm512_mask_set1_epi32:
506 ; X86: # %bb.0: # %entry
507 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
508 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
509 ; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
512 ; X64-LABEL: test_mm512_mask_set1_epi32:
513 ; X64: # %bb.0: # %entry
514 ; X64-NEXT: kmovw %edi, %k1
515 ; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
518 %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
519 %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
520 %0 = bitcast <8 x i64> %__O to <16 x i32>
521 %1 = bitcast i16 %__M to <16 x i1>
522 %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
523 %3 = bitcast <16 x i32> %2 to <8 x i64>
527 define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
528 ; X86-LABEL: test_mm512_maskz_set1_epi32:
529 ; X86: # %bb.0: # %entry
530 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
531 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
532 ; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
535 ; X64-LABEL: test_mm512_maskz_set1_epi32:
536 ; X64: # %bb.0: # %entry
537 ; X64-NEXT: kmovw %edi, %k1
538 ; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
541 %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
542 %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
543 %0 = bitcast i16 %__M to <16 x i1>
544 %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
545 %2 = bitcast <16 x i32> %1 to <8 x i64>
549 define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
550 ; X86-LABEL: test_mm512_mask_set1_epi64:
551 ; X86: # %bb.0: # %entry
552 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
553 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
554 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
555 ; X86-NEXT: kmovw %eax, %k1
556 ; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
559 ; X64-LABEL: test_mm512_mask_set1_epi64:
560 ; X64: # %bb.0: # %entry
561 ; X64-NEXT: kmovw %edi, %k1
562 ; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
565 %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
566 %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
567 %0 = bitcast i8 %__M to <8 x i1>
568 %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
572 define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
573 ; X86-LABEL: test_mm512_maskz_set1_epi64:
574 ; X86: # %bb.0: # %entry
575 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
576 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
577 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
578 ; X86-NEXT: kmovw %eax, %k1
579 ; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
582 ; X64-LABEL: test_mm512_maskz_set1_epi64:
583 ; X64: # %bb.0: # %entry
584 ; X64-NEXT: kmovw %edi, %k1
585 ; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
588 %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
589 %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
590 %0 = bitcast i8 %__M to <8 x i1>
591 %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
596 define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
597 ; CHECK-LABEL: test_mm512_broadcastd_epi32:
599 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
600 ; CHECK-NEXT: ret{{[l|q]}}
601 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
602 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
603 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
607 define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
608 ; X86-LABEL: test_mm512_mask_broadcastd_epi32:
610 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
611 ; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
614 ; X64-LABEL: test_mm512_mask_broadcastd_epi32:
616 ; X64-NEXT: kmovw %edi, %k1
617 ; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
619 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
620 %arg1 = bitcast i16 %a1 to <16 x i1>
621 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
622 %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
623 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
624 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
628 define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
629 ; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
631 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
632 ; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
635 ; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
637 ; X64-NEXT: kmovw %edi, %k1
638 ; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
640 %arg0 = bitcast i16 %a0 to <16 x i1>
641 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
642 %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
643 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
644 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
648 define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
649 ; CHECK-LABEL: test_mm512_broadcastq_epi64:
651 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
652 ; CHECK-NEXT: ret{{[l|q]}}
653 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
657 define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
658 ; X86-LABEL: test_mm512_mask_broadcastq_epi64:
660 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
661 ; X86-NEXT: kmovw %eax, %k1
662 ; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
665 ; X64-LABEL: test_mm512_mask_broadcastq_epi64:
667 ; X64-NEXT: kmovw %edi, %k1
668 ; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
670 %arg1 = bitcast i8 %a1 to <8 x i1>
671 %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
672 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
676 define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
677 ; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
679 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
680 ; X86-NEXT: kmovw %eax, %k1
681 ; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
684 ; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
686 ; X64-NEXT: kmovw %edi, %k1
687 ; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
689 %arg0 = bitcast i8 %a0 to <8 x i1>
690 %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
691 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
695 define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
696 ; CHECK-LABEL: test_mm512_broadcastsd_pd:
698 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
699 ; CHECK-NEXT: ret{{[l|q]}}
700 %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
701 ret <8 x double> %res
704 define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
705 ; X86-LABEL: test_mm512_mask_broadcastsd_pd:
707 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
708 ; X86-NEXT: kmovw %eax, %k1
709 ; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
712 ; X64-LABEL: test_mm512_mask_broadcastsd_pd:
714 ; X64-NEXT: kmovw %edi, %k1
715 ; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
717 %arg1 = bitcast i8 %a1 to <8 x i1>
718 %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
719 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
720 ret <8 x double> %res1
723 define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
724 ; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
726 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
727 ; X86-NEXT: kmovw %eax, %k1
728 ; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
731 ; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
733 ; X64-NEXT: kmovw %edi, %k1
734 ; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
736 %arg0 = bitcast i8 %a0 to <8 x i1>
737 %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
738 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
739 ret <8 x double> %res1
742 define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
743 ; CHECK-LABEL: test_mm512_broadcastss_ps:
745 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
746 ; CHECK-NEXT: ret{{[l|q]}}
747 %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
748 ret <16 x float> %res
751 define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
752 ; X86-LABEL: test_mm512_mask_broadcastss_ps:
754 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
755 ; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
758 ; X64-LABEL: test_mm512_mask_broadcastss_ps:
760 ; X64-NEXT: kmovw %edi, %k1
761 ; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
763 %arg1 = bitcast i16 %a1 to <16 x i1>
764 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
765 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
766 ret <16 x float> %res1
769 define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
770 ; X86-LABEL: test_mm512_maskz_broadcastss_ps:
772 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
773 ; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
776 ; X64-LABEL: test_mm512_maskz_broadcastss_ps:
778 ; X64-NEXT: kmovw %edi, %k1
779 ; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
781 %arg0 = bitcast i16 %a0 to <16 x i1>
782 %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
783 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
784 ret <16 x float> %res1
787 define <8 x double> @test_mm512_movddup_pd(<8 x double> %a0) {
788 ; CHECK-LABEL: test_mm512_movddup_pd:
790 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
791 ; CHECK-NEXT: ret{{[l|q]}}
792 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
793 ret <8 x double> %res
796 define <8 x double> @test_mm512_mask_movddup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
797 ; X86-LABEL: test_mm512_mask_movddup_pd:
799 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
800 ; X86-NEXT: kmovw %eax, %k1
801 ; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
804 ; X64-LABEL: test_mm512_mask_movddup_pd:
806 ; X64-NEXT: kmovw %edi, %k1
807 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
809 %arg1 = bitcast i8 %a1 to <8 x i1>
810 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
811 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
812 ret <8 x double> %res1
815 define <8 x double> @test_mm512_maskz_movddup_pd(i8 %a0, <8 x double> %a1) {
816 ; X86-LABEL: test_mm512_maskz_movddup_pd:
818 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
819 ; X86-NEXT: kmovw %eax, %k1
820 ; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
823 ; X64-LABEL: test_mm512_maskz_movddup_pd:
825 ; X64-NEXT: kmovw %edi, %k1
826 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
828 %arg0 = bitcast i8 %a0 to <8 x i1>
829 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
830 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
831 ret <8 x double> %res1
834 define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
835 ; CHECK-LABEL: test_mm512_movehdup_ps:
837 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
838 ; CHECK-NEXT: ret{{[l|q]}}
839 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
840 ret <16 x float> %res
843 define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
844 ; X86-LABEL: test_mm512_mask_movehdup_ps:
846 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
847 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
850 ; X64-LABEL: test_mm512_mask_movehdup_ps:
852 ; X64-NEXT: kmovw %edi, %k1
853 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
855 %arg1 = bitcast i16 %a1 to <16 x i1>
856 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
857 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
858 ret <16 x float> %res1
861 define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
862 ; X86-LABEL: test_mm512_maskz_movehdup_ps:
864 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
865 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
868 ; X64-LABEL: test_mm512_maskz_movehdup_ps:
870 ; X64-NEXT: kmovw %edi, %k1
871 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
873 %arg0 = bitcast i16 %a0 to <16 x i1>
874 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
875 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
876 ret <16 x float> %res1
; Unmasked _mm512_moveldup_ps: duplicate-even-elements shuffle must fold to a single
; vmovsldup on both 32- and 64-bit targets.
879 define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
880 ; CHECK-LABEL: test_mm512_moveldup_ps:
882 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
883 ; CHECK-NEXT: ret{{[l|q]}}
884 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
885 ret <16 x float> %res
; Merge-masked _mm512_mask_moveldup_ps: expects vmovsldup {%k1} merging into %a0.
888 define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
889 ; X86-LABEL: test_mm512_mask_moveldup_ps:
891 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
892 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
895 ; X64-LABEL: test_mm512_mask_moveldup_ps:
897 ; X64-NEXT: kmovw %edi, %k1
898 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
900 %arg1 = bitcast i16 %a1 to <16 x i1>
901 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
902 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
903 ret <16 x float> %res1
; Zero-masked _mm512_maskz_moveldup_ps: expects vmovsldup {%k1} {z}.
906 define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
907 ; X86-LABEL: test_mm512_maskz_moveldup_ps:
909 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
910 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
913 ; X64-LABEL: test_mm512_maskz_moveldup_ps:
915 ; X64-NEXT: kmovw %edi, %k1
916 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
918 %arg0 = bitcast i16 %a0 to <16 x i1>
919 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
920 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
921 ret <16 x float> %res1
; Unmasked _mm512_permute_pd: in-lane f64 permute must fold to a single vpermilpd.
924 define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
925 ; CHECK-LABEL: test_mm512_permute_pd:
927 ; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
928 ; CHECK-NEXT: ret{{[l|q]}}
929 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
930 ret <8 x double> %res
; Merge-masked _mm512_mask_permute_pd: i8 mask reaches k1 via movb+kmovw on X86;
; expects vpermilpd {%k1}.
933 define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
934 ; X86-LABEL: test_mm512_mask_permute_pd:
936 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
937 ; X86-NEXT: kmovw %eax, %k1
938 ; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
941 ; X64-LABEL: test_mm512_mask_permute_pd:
943 ; X64-NEXT: kmovw %edi, %k1
944 ; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
946 %arg1 = bitcast i8 %a1 to <8 x i1>
947 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
948 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
949 ret <8 x double> %res1
; Zero-masked _mm512_maskz_permute_pd: expects vpermilpd {%k1} {z}.
952 define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
953 ; X86-LABEL: test_mm512_maskz_permute_pd:
955 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
956 ; X86-NEXT: kmovw %eax, %k1
957 ; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
960 ; X64-LABEL: test_mm512_maskz_permute_pd:
962 ; X64-NEXT: kmovw %edi, %k1
963 ; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
965 %arg0 = bitcast i8 %a0 to <8 x i1>
966 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
967 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
968 ret <8 x double> %res1
; Unmasked _mm512_permute_ps: in-lane f32 permute must fold to a single vpermilps.
971 define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
972 ; CHECK-LABEL: test_mm512_permute_ps:
974 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
975 ; CHECK-NEXT: ret{{[l|q]}}
976 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
977 ret <16 x float> %res
; Merge-masked _mm512_mask_permute_ps: expects vpermilps {%k1}.
980 define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
981 ; X86-LABEL: test_mm512_mask_permute_ps:
983 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
984 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
987 ; X64-LABEL: test_mm512_mask_permute_ps:
989 ; X64-NEXT: kmovw %edi, %k1
990 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
992 %arg1 = bitcast i16 %a1 to <16 x i1>
993 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
994 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
995 ret <16 x float> %res1
; Zero-masked _mm512_maskz_permute_ps: expects vpermilps {%k1} {z}.
998 define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
999 ; X86-LABEL: test_mm512_maskz_permute_ps:
1001 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1002 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1005 ; X64-LABEL: test_mm512_maskz_permute_ps:
1007 ; X64-NEXT: kmovw %edi, %k1
1008 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1010 %arg0 = bitcast i16 %a0 to <16 x i1>
1011 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
1012 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1013 ret <16 x float> %res1
; Unmasked _mm512_permutex_epi64: with no mask the integer permute may use the FP
; domain form (vpermpd). NOTE(review): ret/closing brace elided in this excerpt.
1016 define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
1017 ; CHECK-LABEL: test_mm512_permutex_epi64:
1019 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
1020 ; CHECK-NEXT: ret{{[l|q]}}
1021 %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
; Merge-masked _mm512_mask_permutex_epi64: masking forces the integer-domain
; vpermq {%k1}. NOTE(review): ret elided in this excerpt.
1025 define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
1026 ; X86-LABEL: test_mm512_mask_permutex_epi64:
1028 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1029 ; X86-NEXT: kmovw %eax, %k1
1030 ; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1033 ; X64-LABEL: test_mm512_mask_permutex_epi64:
1035 ; X64-NEXT: kmovw %edi, %k1
1036 ; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1038 %arg1 = bitcast i8 %a1 to <8 x i1>
1039 %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1040 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
; Zero-masked _mm512_maskz_permutex_epi64: expects vpermq {%k1} {z}.
; NOTE(review): ret elided in this excerpt.
1044 define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
1045 ; X86-LABEL: test_mm512_maskz_permutex_epi64:
1047 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1048 ; X86-NEXT: kmovw %eax, %k1
1049 ; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1052 ; X64-LABEL: test_mm512_maskz_permutex_epi64:
1054 ; X64-NEXT: kmovw %edi, %k1
1055 ; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1057 %arg0 = bitcast i8 %a0 to <8 x i1>
1058 %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1059 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
; Unmasked _mm512_permutex_pd: cross-lane f64 permute folds to a single vpermpd.
1063 define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
1064 ; CHECK-LABEL: test_mm512_permutex_pd:
1066 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
1067 ; CHECK-NEXT: ret{{[l|q]}}
1068 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1069 ret <8 x double> %res
; Merge-masked _mm512_mask_permutex_pd: expects vpermpd {%k1}.
1072 define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
1073 ; X86-LABEL: test_mm512_mask_permutex_pd:
1075 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1076 ; X86-NEXT: kmovw %eax, %k1
1077 ; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1080 ; X64-LABEL: test_mm512_mask_permutex_pd:
1082 ; X64-NEXT: kmovw %edi, %k1
1083 ; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1085 %arg1 = bitcast i8 %a1 to <8 x i1>
1086 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1087 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1088 ret <8 x double> %res1
; Zero-masked _mm512_maskz_permutex_pd: expects vpermpd {%k1} {z}.
1091 define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
1092 ; X86-LABEL: test_mm512_maskz_permutex_pd:
1094 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1095 ; X86-NEXT: kmovw %eax, %k1
1096 ; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1099 ; X64-LABEL: test_mm512_maskz_permutex_pd:
1101 ; X64-NEXT: kmovw %edi, %k1
1102 ; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1104 %arg0 = bitcast i8 %a0 to <8 x i1>
1105 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1106 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1107 ret <8 x double> %res1
; Unmasked _mm512_shuffle_epi32: with no mask the dword shuffle may use the FP
; domain form (vpermilps). NOTE(review): ret elided in this excerpt.
1110 define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
1111 ; CHECK-LABEL: test_mm512_shuffle_epi32:
1113 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1114 ; CHECK-NEXT: ret{{[l|q]}}
1115 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1116 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1117 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
; Merge-masked _mm512_mask_shuffle_epi32: masking forces integer-domain vpshufd {%k1}.
; NOTE(review): ret elided in this excerpt.
1121 define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
1122 ; X86-LABEL: test_mm512_mask_shuffle_epi32:
1124 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1125 ; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1128 ; X64-LABEL: test_mm512_mask_shuffle_epi32:
1130 ; X64-NEXT: kmovw %edi, %k1
1131 ; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1133 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1134 %arg1 = bitcast i16 %a1 to <16 x i1>
1135 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1136 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1137 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1138 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
; Zero-masked _mm512_maskz_shuffle_epi32: expects vpshufd {%k1} {z}.
; NOTE(review): ret elided in this excerpt.
1142 define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
1143 ; X86-LABEL: test_mm512_maskz_shuffle_epi32:
1145 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1146 ; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1149 ; X64-LABEL: test_mm512_maskz_shuffle_epi32:
1151 ; X64-NEXT: kmovw %edi, %k1
1152 ; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1154 %arg0 = bitcast i16 %a0 to <16 x i1>
1155 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1156 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1157 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1158 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
; Unmasked _mm512_shuffle_pd: two-source f64 shuffle folds to a single vshufpd.
1162 define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
1163 ; CHECK-LABEL: test_mm512_shuffle_pd:
1165 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1166 ; CHECK-NEXT: ret{{[l|q]}}
1167 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1168 ret <8 x double> %res
; Merge-masked _mm512_mask_shuffle_pd: expects vshufpd {%k1} merging into %a0.
1171 define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1172 ; X86-LABEL: test_mm512_mask_shuffle_pd:
1174 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1175 ; X86-NEXT: kmovw %eax, %k1
1176 ; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1179 ; X64-LABEL: test_mm512_mask_shuffle_pd:
1181 ; X64-NEXT: kmovw %edi, %k1
1182 ; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1184 %arg1 = bitcast i8 %a1 to <8 x i1>
1185 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1186 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1187 ret <8 x double> %res1
; Zero-masked _mm512_maskz_shuffle_pd: expects vshufpd {%k1} {z}.
1190 define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1191 ; X86-LABEL: test_mm512_maskz_shuffle_pd:
1193 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1194 ; X86-NEXT: kmovw %eax, %k1
1195 ; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1198 ; X64-LABEL: test_mm512_maskz_shuffle_pd:
1200 ; X64-NEXT: kmovw %edi, %k1
1201 ; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1203 %arg0 = bitcast i8 %a0 to <8 x i1>
1204 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1205 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1206 ret <8 x double> %res1
; Unmasked _mm512_unpackhi_epi32: without masking the dword unpack may use the FP
; domain form (vunpckhps). NOTE(review): ret elided in this excerpt.
1209 define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
1210 ; CHECK-LABEL: test_mm512_unpackhi_epi32:
1212 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1213 ; CHECK-NEXT: ret{{[l|q]}}
1214 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1215 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1216 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1217 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
; Merge-masked _mm512_mask_unpackhi_epi32: masking forces integer-domain
; vpunpckhdq {%k1}. NOTE(review): ret elided in this excerpt.
1221 define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1222 ; X86-LABEL: test_mm512_mask_unpackhi_epi32:
1224 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1225 ; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1228 ; X64-LABEL: test_mm512_mask_unpackhi_epi32:
1230 ; X64-NEXT: kmovw %edi, %k1
1231 ; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1233 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1234 %arg1 = bitcast i16 %a1 to <16 x i1>
1235 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1236 %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1237 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1238 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1239 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
; Zero-masked _mm512_maskz_unpackhi_epi32: expects vpunpckhdq {%k1} {z}.
; NOTE(review): ret elided in this excerpt.
1243 define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1244 ; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
1246 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1247 ; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1250 ; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
1252 ; X64-NEXT: kmovw %edi, %k1
1253 ; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1255 %arg0 = bitcast i16 %a0 to <16 x i1>
1256 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1257 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1258 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1259 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1260 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
; Unmasked _mm512_unpackhi_epi64: may use the FP-domain vunpckhpd when unmasked.
; NOTE(review): ret elided in this excerpt.
1264 define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1265 ; CHECK-LABEL: test_mm512_unpackhi_epi64:
1267 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1268 ; CHECK-NEXT: ret{{[l|q]}}
1269 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; Merge-masked _mm512_mask_unpackhi_epi64: expects vpunpckhqdq {%k1}.
; NOTE(review): ret elided in this excerpt.
1273 define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1274 ; X86-LABEL: test_mm512_mask_unpackhi_epi64:
1276 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1277 ; X86-NEXT: kmovw %eax, %k1
1278 ; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1281 ; X64-LABEL: test_mm512_mask_unpackhi_epi64:
1283 ; X64-NEXT: kmovw %edi, %k1
1284 ; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1286 %arg1 = bitcast i8 %a1 to <8 x i1>
1287 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1288 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
; Zero-masked _mm512_maskz_unpackhi_epi64: expects vpunpckhqdq {%k1} {z}.
; NOTE(review): ret elided in this excerpt.
1292 define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1293 ; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
1295 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1296 ; X86-NEXT: kmovw %eax, %k1
1297 ; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1300 ; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
1302 ; X64-NEXT: kmovw %edi, %k1
1303 ; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1305 %arg0 = bitcast i8 %a0 to <8 x i1>
1306 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1307 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
; Unmasked _mm512_unpackhi_pd: folds to a single vunpckhpd.
1311 define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
1312 ; CHECK-LABEL: test_mm512_unpackhi_pd:
1314 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1315 ; CHECK-NEXT: ret{{[l|q]}}
1316 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1317 ret <8 x double> %res
; Merge-masked _mm512_mask_unpackhi_pd: expects vunpckhpd {%k1}.
1320 define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1321 ; X86-LABEL: test_mm512_mask_unpackhi_pd:
1323 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1324 ; X86-NEXT: kmovw %eax, %k1
1325 ; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1328 ; X64-LABEL: test_mm512_mask_unpackhi_pd:
1330 ; X64-NEXT: kmovw %edi, %k1
1331 ; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1333 %arg1 = bitcast i8 %a1 to <8 x i1>
1334 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1335 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1336 ret <8 x double> %res1
; Zero-masked _mm512_maskz_unpackhi_pd: expects vunpckhpd {%k1} {z}.
1339 define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1340 ; X86-LABEL: test_mm512_maskz_unpackhi_pd:
1342 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1343 ; X86-NEXT: kmovw %eax, %k1
1344 ; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1347 ; X64-LABEL: test_mm512_maskz_unpackhi_pd:
1349 ; X64-NEXT: kmovw %edi, %k1
1350 ; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1352 %arg0 = bitcast i8 %a0 to <8 x i1>
1353 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1354 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1355 ret <8 x double> %res1
; Unmasked _mm512_unpackhi_ps: folds to a single vunpckhps.
1358 define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
1359 ; CHECK-LABEL: test_mm512_unpackhi_ps:
1361 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1362 ; CHECK-NEXT: ret{{[l|q]}}
1363 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1364 ret <16 x float> %res
; Merge-masked _mm512_mask_unpackhi_ps: expects vunpckhps {%k1}.
1367 define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
1368 ; X86-LABEL: test_mm512_mask_unpackhi_ps:
1370 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1371 ; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1374 ; X64-LABEL: test_mm512_mask_unpackhi_ps:
1376 ; X64-NEXT: kmovw %edi, %k1
1377 ; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1379 %arg1 = bitcast i16 %a1 to <16 x i1>
1380 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1381 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1382 ret <16 x float> %res1
; Zero-masked _mm512_maskz_unpackhi_ps: expects vunpckhps {%k1} {z}.
1385 define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
1386 ; X86-LABEL: test_mm512_maskz_unpackhi_ps:
1388 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1389 ; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1392 ; X64-LABEL: test_mm512_maskz_unpackhi_ps:
1394 ; X64-NEXT: kmovw %edi, %k1
1395 ; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1397 %arg0 = bitcast i16 %a0 to <16 x i1>
1398 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1399 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1400 ret <16 x float> %res1
; Unmasked _mm512_unpacklo_epi32: may use the FP-domain vunpcklps when unmasked.
; NOTE(review): ret elided in this excerpt.
1403 define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
1404 ; CHECK-LABEL: test_mm512_unpacklo_epi32:
1406 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1407 ; CHECK-NEXT: ret{{[l|q]}}
1408 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1409 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1410 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1411 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
; Merge-masked _mm512_mask_unpacklo_epi32: masking forces integer-domain
; vpunpckldq {%k1}. NOTE(review): ret elided in this excerpt.
1415 define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1416 ; X86-LABEL: test_mm512_mask_unpacklo_epi32:
1418 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1419 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1422 ; X64-LABEL: test_mm512_mask_unpacklo_epi32:
1424 ; X64-NEXT: kmovw %edi, %k1
1425 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1427 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1428 %arg1 = bitcast i16 %a1 to <16 x i1>
1429 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1430 %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1431 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1432 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1433 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
; Zero-masked _mm512_maskz_unpacklo_epi32: expects vpunpckldq {%k1} {z}.
; NOTE(review): ret elided in this excerpt.
1437 define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1438 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
1440 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1441 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1444 ; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
1446 ; X64-NEXT: kmovw %edi, %k1
1447 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1449 %arg0 = bitcast i16 %a0 to <16 x i1>
1450 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1451 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1452 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1453 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1454 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
; Unmasked _mm512_unpacklo_epi64: may use the FP-domain vunpcklpd when unmasked.
; NOTE(review): ret elided in this excerpt.
1458 define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1459 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
1461 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1462 ; CHECK-NEXT: ret{{[l|q]}}
1463 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; Merge-masked _mm512_mask_unpacklo_epi64: expects vpunpcklqdq {%k1}.
; NOTE(review): ret elided in this excerpt.
1467 define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1468 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
1470 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1471 ; X86-NEXT: kmovw %eax, %k1
1472 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1475 ; X64-LABEL: test_mm512_mask_unpacklo_epi64:
1477 ; X64-NEXT: kmovw %edi, %k1
1478 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1480 %arg1 = bitcast i8 %a1 to <8 x i1>
1481 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1482 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
; Zero-masked _mm512_maskz_unpacklo_epi64: expects vpunpcklqdq {%k1} {z}.
; NOTE(review): ret elided in this excerpt.
1486 define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1487 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
1489 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1490 ; X86-NEXT: kmovw %eax, %k1
1491 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1494 ; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
1496 ; X64-NEXT: kmovw %edi, %k1
1497 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1499 %arg0 = bitcast i8 %a0 to <8 x i1>
1500 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1501 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
; Unmasked _mm512_unpacklo_pd: folds to a single vunpcklpd.
1505 define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
1506 ; CHECK-LABEL: test_mm512_unpacklo_pd:
1508 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1509 ; CHECK-NEXT: ret{{[l|q]}}
1510 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1511 ret <8 x double> %res
; Merge-masked _mm512_mask_unpacklo_pd: expects vunpcklpd {%k1}.
1514 define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1515 ; X86-LABEL: test_mm512_mask_unpacklo_pd:
1517 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1518 ; X86-NEXT: kmovw %eax, %k1
1519 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1522 ; X64-LABEL: test_mm512_mask_unpacklo_pd:
1524 ; X64-NEXT: kmovw %edi, %k1
1525 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1527 %arg1 = bitcast i8 %a1 to <8 x i1>
1528 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1529 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1530 ret <8 x double> %res1
; Zero-masked _mm512_maskz_unpacklo_pd: expects vunpcklpd {%k1} {z}.
1533 define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1534 ; X86-LABEL: test_mm512_maskz_unpacklo_pd:
1536 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1537 ; X86-NEXT: kmovw %eax, %k1
1538 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1541 ; X64-LABEL: test_mm512_maskz_unpacklo_pd:
1543 ; X64-NEXT: kmovw %edi, %k1
1544 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1546 %arg0 = bitcast i8 %a0 to <8 x i1>
1547 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1548 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1549 ret <8 x double> %res1
; Unmasked _mm512_unpacklo_ps: folds to a single vunpcklps.
1552 define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
1553 ; CHECK-LABEL: test_mm512_unpacklo_ps:
1555 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1556 ; CHECK-NEXT: ret{{[l|q]}}
1557 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1558 ret <16 x float> %res
; Merge-masked _mm512_mask_unpacklo_ps: expects vunpcklps {%k1}.
1561 define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
1562 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
1564 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1565 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1568 ; X64-LABEL: test_mm512_mask_unpacklo_ps:
1570 ; X64-NEXT: kmovw %edi, %k1
1571 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1573 %arg1 = bitcast i16 %a1 to <16 x i1>
1574 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1575 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1576 ret <16 x float> %res1
; Zero-masked _mm512_maskz_unpacklo_ps: expects vunpcklps {%k1} {z}.
1579 define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
1580 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
1582 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1583 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1586 ; X64-LABEL: test_mm512_maskz_unpacklo_ps:
1588 ; X64-NEXT: kmovw %edi, %k1
1589 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1591 %arg0 = bitcast i16 %a0 to <16 x i1>
1592 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1593 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1594 ret <16 x float> %res1
1597 define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
1598 ; CHECK-LABEL: test_mm512_zextpd128_pd512:
1600 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1601 ; CHECK-NEXT: ret{{[l|q]}}
1602 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1603 ret <8 x double> %res
1606 define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
1607 ; CHECK-LABEL: test_mm512_zextpd256_pd512:
1609 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1610 ; CHECK-NEXT: ret{{[l|q]}}
1611 %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1612 ret <8 x double> %res
1615 define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
1616 ; CHECK-LABEL: test_mm512_zextps128_ps512:
1618 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1619 ; CHECK-NEXT: ret{{[l|q]}}
1620 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
1621 ret <16 x float> %res
1624 define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
1625 ; CHECK-LABEL: test_mm512_zextps256_ps512:
1627 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1628 ; CHECK-NEXT: ret{{[l|q]}}
1629 %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1630 ret <16 x float> %res
1633 define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
1634 ; CHECK-LABEL: test_mm512_zextsi128_si512:
1636 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1637 ; CHECK-NEXT: ret{{[l|q]}}
1638 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1642 define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
1643 ; CHECK-LABEL: test_mm512_zextsi256_si512:
1645 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1646 ; CHECK-NEXT: ret{{[l|q]}}
1647 %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1651 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1652 ; CHECK-LABEL: test_mm512_mul_epi32:
1654 ; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
1655 ; CHECK-NEXT: vpsraq $32, %zmm0, %zmm0
1656 ; CHECK-NEXT: vpsllq $32, %zmm1, %zmm1
1657 ; CHECK-NEXT: vpsraq $32, %zmm1, %zmm1
1658 ; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
1659 ; CHECK-NEXT: ret{{[l|q]}}
1660 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1661 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1662 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1663 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1664 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
1668 define <8 x i64> @test_mm512_maskz_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1669 ; X86-LABEL: test_mm512_maskz_mul_epi32:
1671 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1672 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1675 ; X64-LABEL: test_mm512_maskz_mul_epi32:
1677 ; X64-NEXT: kmovw %edi, %k1
1678 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1680 %conv = trunc i16 %__k to i8
1681 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1682 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1683 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1684 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1685 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
1686 %tmp5 = bitcast i8 %conv to <8 x i1>
1687 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> zeroinitializer
1691 define <8 x i64> @test_mm512_mask_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1692 ; X86-LABEL: test_mm512_mask_mul_epi32:
1694 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1695 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1696 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1699 ; X64-LABEL: test_mm512_mask_mul_epi32:
1701 ; X64-NEXT: kmovw %edi, %k1
1702 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1703 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1705 %conv = trunc i16 %__k to i8
1706 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1707 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1708 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1709 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1710 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
1711 %tmp5 = bitcast i8 %conv to <8 x i1>
1712 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> %__src
1716 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1717 ; CHECK-LABEL: test_mm512_mul_epu32:
1719 ; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA
1720 ; CHECK-NEXT: kmovw %eax, %k0
1721 ; CHECK-NEXT: knotw %k0, %k1
1722 ; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
1723 ; CHECK-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
1724 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
1725 ; CHECK-NEXT: ret{{[l|q]}}
1726 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1727 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1728 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
1732 define <8 x i64> @test_mm512_maskz_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1733 ; X86-LABEL: test_mm512_maskz_mul_epu32:
1735 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1736 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1739 ; X64-LABEL: test_mm512_maskz_mul_epu32:
1741 ; X64-NEXT: kmovw %edi, %k1
1742 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1744 %conv = trunc i16 %__k to i8
1745 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1746 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1747 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
1748 %tmp3 = bitcast i8 %conv to <8 x i1>
1749 %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> zeroinitializer
1753 define <8 x i64> @test_mm512_mask_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1754 ; X86-LABEL: test_mm512_mask_mul_epu32:
1756 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1757 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1758 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1761 ; X64-LABEL: test_mm512_mask_mul_epu32:
1763 ; X64-NEXT: kmovw %edi, %k1
1764 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1765 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1767 %conv = trunc i16 %__k to i8
1768 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1769 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1770 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
1771 %tmp3 = bitcast i8 %conv to <8 x i1>
1772 %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> %__src
1776 define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
1777 ; X86-LABEL: test_mm512_set1_epi8:
1778 ; X86: # %bb.0: # %entry
1779 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1780 ; X86-NEXT: vmovd %eax, %xmm0
1781 ; X86-NEXT: vpbroadcastb %xmm0, %ymm0
1782 ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1785 ; X64-LABEL: test_mm512_set1_epi8:
1786 ; X64: # %bb.0: # %entry
1787 ; X64-NEXT: vmovd %edi, %xmm0
1788 ; X64-NEXT: vpbroadcastb %xmm0, %ymm0
1789 ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1792 %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
1793 %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
1794 %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
1798 define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
1799 ; X86-LABEL: test_mm_cvtu32_sd:
1800 ; X86: # %bb.0: # %entry
1801 ; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
1804 ; X64-LABEL: test_mm_cvtu32_sd:
1805 ; X64: # %bb.0: # %entry
1806 ; X64-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
1809 %conv.i = uitofp i32 %__B to double
1810 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1811 ret <2 x double> %vecins.i
1814 define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
1815 ; X86-LABEL: test_mm_cvtu64_sd:
1816 ; X86: # %bb.0: # %entry
1817 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1818 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1819 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
1820 ; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
1821 ; X86-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
1822 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1825 ; X64-LABEL: test_mm_cvtu64_sd:
1826 ; X64: # %bb.0: # %entry
1827 ; X64-NEXT: vcvtusi2sdq %rdi, %xmm0, %xmm0
1830 %conv.i = uitofp i64 %__B to double
1831 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1832 ret <2 x double> %vecins.i
1835 define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
1836 ; X86-LABEL: test_mm_cvtu32_ss:
1837 ; X86: # %bb.0: # %entry
1838 ; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
1841 ; X64-LABEL: test_mm_cvtu32_ss:
1842 ; X64: # %bb.0: # %entry
1843 ; X64-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
1846 %conv.i = uitofp i32 %__B to float
1847 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1848 ret <4 x float> %vecins.i
1851 define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
1852 ; X86-LABEL: test_mm_cvtu64_ss:
1853 ; X86: # %bb.0: # %entry
1854 ; X86-NEXT: pushl %ebp
1855 ; X86-NEXT: .cfi_def_cfa_offset 8
1856 ; X86-NEXT: .cfi_offset %ebp, -8
1857 ; X86-NEXT: movl %esp, %ebp
1858 ; X86-NEXT: .cfi_def_cfa_register %ebp
1859 ; X86-NEXT: andl $-8, %esp
1860 ; X86-NEXT: subl $16, %esp
1861 ; X86-NEXT: movl 12(%ebp), %eax
1862 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1863 ; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1864 ; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
1865 ; X86-NEXT: xorl %ecx, %ecx
1866 ; X86-NEXT: testl %eax, %eax
1867 ; X86-NEXT: setns %cl
1868 ; X86-NEXT: fildll {{[0-9]+}}(%esp)
1869 ; X86-NEXT: fadds {{\.LCPI.*}}(,%ecx,4)
1870 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
1871 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1872 ; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1873 ; X86-NEXT: movl %ebp, %esp
1874 ; X86-NEXT: popl %ebp
1875 ; X86-NEXT: .cfi_def_cfa %esp, 4
1878 ; X64-LABEL: test_mm_cvtu64_ss:
1879 ; X64: # %bb.0: # %entry
1880 ; X64-NEXT: vcvtusi2ssq %rdi, %xmm0, %xmm0
1883 %conv.i = uitofp i64 %__B to float
1884 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1885 ret <4 x float> %vecins.i
1888 define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
1889 ; CHECK-LABEL: test_mm512_cvtps_pd:
1890 ; CHECK: # %bb.0: # %entry
1891 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1892 ; CHECK-NEXT: ret{{[l|q]}}
1894 %conv.i = fpext <8 x float> %__A to <8 x double>
1895 ret <8 x double> %conv.i
1898 define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
1899 ; CHECK-LABEL: test_mm512_cvtpslo_pd:
1900 ; CHECK: # %bb.0: # %entry
1901 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1902 ; CHECK-NEXT: ret{{[l|q]}}
1904 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1905 %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
1906 ret <8 x double> %conv.i.i
1909 define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
1910 ; X86-LABEL: test_mm512_mask_cvtps_pd:
1911 ; X86: # %bb.0: # %entry
1912 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1913 ; X86-NEXT: kmovw %eax, %k1
1914 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1917 ; X64-LABEL: test_mm512_mask_cvtps_pd:
1918 ; X64: # %bb.0: # %entry
1919 ; X64-NEXT: kmovw %edi, %k1
1920 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1923 %conv.i.i = fpext <8 x float> %__A to <8 x double>
1924 %0 = bitcast i8 %__U to <8 x i1>
1925 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
1929 define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
1930 ; X86-LABEL: test_mm512_mask_cvtpslo_pd:
1931 ; X86: # %bb.0: # %entry
1932 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1933 ; X86-NEXT: kmovw %eax, %k1
1934 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1937 ; X64-LABEL: test_mm512_mask_cvtpslo_pd:
1938 ; X64: # %bb.0: # %entry
1939 ; X64-NEXT: kmovw %edi, %k1
1940 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1943 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1944 %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
1945 %0 = bitcast i8 %__U to <8 x i1>
1946 %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
1950 define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
1951 ; X86-LABEL: test_mm512_maskz_cvtps_pd:
1952 ; X86: # %bb.0: # %entry
1953 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1954 ; X86-NEXT: kmovw %eax, %k1
1955 ; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
1958 ; X64-LABEL: test_mm512_maskz_cvtps_pd:
1959 ; X64: # %bb.0: # %entry
1960 ; X64-NEXT: kmovw %edi, %k1
1961 ; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
1964 %conv.i.i = fpext <8 x float> %__A to <8 x double>
1965 %0 = bitcast i8 %__U to <8 x i1>
1966 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
1970 define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
1971 ; CHECK-LABEL: test_mm512_cvtepi32_epi8:
1972 ; CHECK: # %bb.0: # %entry
1973 ; CHECK-NEXT: vpmovdb %zmm0, %xmm0
1974 ; CHECK-NEXT: vzeroupper
1975 ; CHECK-NEXT: ret{{[l|q]}}
1977 %0 = bitcast <8 x i64> %__A to <16 x i32>
1978 %conv.i = trunc <16 x i32> %0 to <16 x i8>
1979 %1 = bitcast <16 x i8> %conv.i to <2 x i64>
1983 define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
1984 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
1985 ; X86: # %bb.0: # %entry
1986 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1987 ; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
1988 ; X86-NEXT: vzeroupper
1991 ; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
1992 ; X64: # %bb.0: # %entry
1993 ; X64-NEXT: kmovw %edi, %k1
1994 ; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
1995 ; X64-NEXT: vzeroupper
1998 %0 = bitcast <8 x i64> %__A to <16 x i32>
1999 %1 = bitcast <2 x i64> %__O to <16 x i8>
2000 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
2001 %3 = bitcast <16 x i8> %2 to <2 x i64>
2005 define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
2006 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
2007 ; X86: # %bb.0: # %entry
2008 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2009 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2010 ; X86-NEXT: vzeroupper
2013 ; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
2014 ; X64: # %bb.0: # %entry
2015 ; X64-NEXT: kmovw %edi, %k1
2016 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2017 ; X64-NEXT: vzeroupper
2020 %0 = bitcast <8 x i64> %__A to <16 x i32>
2021 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
2022 %2 = bitcast <16 x i8> %1 to <2 x i64>
2026 define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
2027 ; CHECK-LABEL: test_mm512_cvtepi64_epi32:
2028 ; CHECK: # %bb.0: # %entry
2029 ; CHECK-NEXT: vpmovqd %zmm0, %ymm0
2030 ; CHECK-NEXT: ret{{[l|q]}}
2032 %conv.i = trunc <8 x i64> %__A to <8 x i32>
2033 %0 = bitcast <8 x i32> %conv.i to <4 x i64>
2037 define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2038 ; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
2039 ; X86: # %bb.0: # %entry
2040 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2041 ; X86-NEXT: kmovw %eax, %k1
2042 ; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2045 ; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
2046 ; X64: # %bb.0: # %entry
2047 ; X64-NEXT: kmovw %edi, %k1
2048 ; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2051 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2052 %0 = bitcast <4 x i64> %__O to <8 x i32>
2053 %1 = bitcast i8 %__M to <8 x i1>
2054 %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
2055 %3 = bitcast <8 x i32> %2 to <4 x i64>
2059 define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
2060 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
2061 ; X86: # %bb.0: # %entry
2062 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2063 ; X86-NEXT: kmovw %eax, %k1
2064 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2067 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
2068 ; X64: # %bb.0: # %entry
2069 ; X64-NEXT: kmovw %edi, %k1
2070 ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2073 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2074 %0 = bitcast i8 %__M to <8 x i1>
2075 %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
2076 %2 = bitcast <8 x i32> %1 to <4 x i64>
2080 define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
2081 ; CHECK-LABEL: test_mm512_cvtepi64_epi16:
2082 ; CHECK: # %bb.0: # %entry
2083 ; CHECK-NEXT: vpmovqw %zmm0, %xmm0
2084 ; CHECK-NEXT: vzeroupper
2085 ; CHECK-NEXT: ret{{[l|q]}}
2087 %conv.i = trunc <8 x i64> %__A to <8 x i16>
2088 %0 = bitcast <8 x i16> %conv.i to <2 x i64>
2092 define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2093 ; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
2094 ; X86: # %bb.0: # %entry
2095 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2096 ; X86-NEXT: kmovw %eax, %k1
2097 ; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2098 ; X86-NEXT: vzeroupper
2101 ; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
2102 ; X64: # %bb.0: # %entry
2103 ; X64-NEXT: kmovw %edi, %k1
2104 ; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2105 ; X64-NEXT: vzeroupper
2108 %0 = bitcast <2 x i64> %__O to <8 x i16>
2109 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
2110 %2 = bitcast <8 x i16> %1 to <2 x i64>
2114 define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
2115 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
2116 ; X86: # %bb.0: # %entry
2117 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2118 ; X86-NEXT: kmovw %eax, %k1
2119 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2120 ; X86-NEXT: vzeroupper
2123 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
2124 ; X64: # %bb.0: # %entry
2125 ; X64-NEXT: kmovw %edi, %k1
2126 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2127 ; X64-NEXT: vzeroupper
2130 %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
2131 %1 = bitcast <8 x i16> %0 to <2 x i64>
2135 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
2136 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
2138 define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2139 ; CHECK-LABEL: test_mm512_ternarylogic_epi32:
2140 ; CHECK: # %bb.0: # %entry
2141 ; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0
2142 ; CHECK-NEXT: ret{{[l|q]}}
2144 %0 = bitcast <8 x i64> %__A to <16 x i32>
2145 %1 = bitcast <8 x i64> %__B to <16 x i32>
2146 %2 = bitcast <8 x i64> %__C to <16 x i32>
2147 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2148 %4 = bitcast <16 x i32> %3 to <8 x i64>
2152 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
2154 define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2155 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
2156 ; X86: # %bb.0: # %entry
2157 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2158 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2161 ; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
2162 ; X64: # %bb.0: # %entry
2163 ; X64-NEXT: kmovw %edi, %k1
2164 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2167 %0 = bitcast <8 x i64> %__A to <16 x i32>
2168 %1 = bitcast <8 x i64> %__B to <16 x i32>
2169 %2 = bitcast <8 x i64> %__C to <16 x i32>
2170 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2171 %4 = bitcast i16 %__U to <16 x i1>
2172 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2173 %6 = bitcast <16 x i32> %5 to <8 x i64>
2177 define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2178 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
2179 ; X86: # %bb.0: # %entry
2180 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2181 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2184 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
2185 ; X64: # %bb.0: # %entry
2186 ; X64-NEXT: kmovw %edi, %k1
2187 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2190 %0 = bitcast <8 x i64> %__A to <16 x i32>
2191 %1 = bitcast <8 x i64> %__B to <16 x i32>
2192 %2 = bitcast <8 x i64> %__C to <16 x i32>
2193 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2194 %4 = bitcast i16 %__U to <16 x i1>
2195 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2196 %6 = bitcast <16 x i32> %5 to <8 x i64>
2200 define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2201 ; CHECK-LABEL: test_mm512_ternarylogic_epi64:
2202 ; CHECK: # %bb.0: # %entry
2203 ; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0
2204 ; CHECK-NEXT: ret{{[l|q]}}
2206 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2210 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1
2212 define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2213 ; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
2214 ; X86: # %bb.0: # %entry
2215 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2216 ; X86-NEXT: kmovw %eax, %k1
2217 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2220 ; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
2221 ; X64: # %bb.0: # %entry
2222 ; X64-NEXT: kmovw %edi, %k1
2223 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2226 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2227 %1 = bitcast i8 %__U to <8 x i1>
2228 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2232 define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2233 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
2234 ; X86: # %bb.0: # %entry
2235 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2236 ; X86-NEXT: kmovw %eax, %k1
2237 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2240 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
2241 ; X64: # %bb.0: # %entry
2242 ; X64-NEXT: kmovw %edi, %k1
2243 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2246 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2247 %1 = bitcast i8 %__U to <8 x i1>
2248 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2252 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2254 define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
2255 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
2256 ; X86: # %bb.0: # %entry
2257 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2258 ; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2259 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2262 ; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
2263 ; X64: # %bb.0: # %entry
2264 ; X64-NEXT: kmovw %edi, %k1
2265 ; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2266 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2269 %0 = bitcast <8 x i64> %__A to <16 x i32>
2270 %1 = bitcast <8 x i64> %__I to <16 x i32>
2271 %2 = bitcast <8 x i64> %__B to <16 x i32>
2272 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2273 %4 = bitcast i16 %__U to <16 x i1>
2274 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
2275 %6 = bitcast <16 x i32> %5 to <8 x i64>
2279 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
2281 define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
2282 ; X86-LABEL: test_mm512_mask2_permutex2var_pd:
2283 ; X86: # %bb.0: # %entry
2284 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2285 ; X86-NEXT: kmovw %eax, %k1
2286 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2287 ; X86-NEXT: vmovapd %zmm1, %zmm0
2290 ; X64-LABEL: test_mm512_mask2_permutex2var_pd:
2291 ; X64: # %bb.0: # %entry
2292 ; X64-NEXT: kmovw %edi, %k1
2293 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2294 ; X64-NEXT: vmovapd %zmm1, %zmm0
2297 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2298 %1 = bitcast <8 x i64> %__I to <8 x double>
2299 %2 = bitcast i8 %__U to <8 x i1>
2300 %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
2304 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
2306 define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
2307 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
2308 ; X86: # %bb.0: # %entry
2309 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2310 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2311 ; X86-NEXT: vmovaps %zmm1, %zmm0
2314 ; X64-LABEL: test_mm512_mask2_permutex2var_ps:
2315 ; X64: # %bb.0: # %entry
2316 ; X64-NEXT: kmovw %edi, %k1
2317 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2318 ; X64-NEXT: vmovaps %zmm1, %zmm0
2321 %0 = bitcast <8 x i64> %__I to <16 x i32>
2322 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2323 %2 = bitcast <8 x i64> %__I to <16 x float>
2324 %3 = bitcast i16 %__U to <16 x i1>
2325 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
2329 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2331 define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
2332 ; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
2333 ; X86: # %bb.0: # %entry
2334 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2335 ; X86-NEXT: kmovw %eax, %k1
2336 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2337 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2340 ; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
2341 ; X64: # %bb.0: # %entry
2342 ; X64-NEXT: kmovw %edi, %k1
2343 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2344 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2347 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2348 %1 = bitcast i8 %__U to <8 x i1>
2349 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
2353 define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2354 ; CHECK-LABEL: test_mm512_permutex2var_epi32:
2355 ; CHECK: # %bb.0: # %entry
2356 ; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
2357 ; CHECK-NEXT: ret{{[l|q]}}
2359 %0 = bitcast <8 x i64> %__A to <16 x i32>
2360 %1 = bitcast <8 x i64> %__I to <16 x i32>
2361 %2 = bitcast <8 x i64> %__B to <16 x i32>
2362 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2363 %4 = bitcast <16 x i32> %3 to <8 x i64>
2367 define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2368 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
2369 ; X86: # %bb.0: # %entry
2370 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2371 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2374 ; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
2375 ; X64: # %bb.0: # %entry
2376 ; X64-NEXT: kmovw %edi, %k1
2377 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2380 %0 = bitcast <8 x i64> %__A to <16 x i32>
2381 %1 = bitcast <8 x i64> %__I to <16 x i32>
2382 %2 = bitcast <8 x i64> %__B to <16 x i32>
2383 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2384 %4 = bitcast i16 %__U to <16 x i1>
2385 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2386 %6 = bitcast <16 x i32> %5 to <8 x i64>
2390 define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2391 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
2392 ; X86: # %bb.0: # %entry
2393 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2394 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2397 ; X64-LABEL: test_mm512_mask_permutex2var_epi32:
2398 ; X64: # %bb.0: # %entry
2399 ; X64-NEXT: kmovw %edi, %k1
2400 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2403 %0 = bitcast <8 x i64> %__A to <16 x i32>
2404 %1 = bitcast <8 x i64> %__I to <16 x i32>
2405 %2 = bitcast <8 x i64> %__B to <16 x i32>
2406 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2407 %4 = bitcast i16 %__U to <16 x i1>
2408 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2409 %6 = bitcast <16 x i32> %5 to <8 x i64>
2413 define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2414 ; CHECK-LABEL: test_mm512_permutex2var_pd:
2415 ; CHECK: # %bb.0: # %entry
2416 ; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
2417 ; CHECK-NEXT: ret{{[l|q]}}
2419 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2423 define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
2424 ; X86-LABEL: test_mm512_mask_permutex2var_pd:
2425 ; X86: # %bb.0: # %entry
2426 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2427 ; X86-NEXT: kmovw %eax, %k1
2428 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2431 ; X64-LABEL: test_mm512_mask_permutex2var_pd:
2432 ; X64: # %bb.0: # %entry
2433 ; X64-NEXT: kmovw %edi, %k1
2434 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2437 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2438 %1 = bitcast i8 %__U to <8 x i1>
2439 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
2443 define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2444 ; X86-LABEL: test_mm512_maskz_permutex2var_pd:
2445 ; X86: # %bb.0: # %entry
2446 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2447 ; X86-NEXT: kmovw %eax, %k1
2448 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2451 ; X64-LABEL: test_mm512_maskz_permutex2var_pd:
2452 ; X64: # %bb.0: # %entry
2453 ; X64-NEXT: kmovw %edi, %k1
2454 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2457 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2458 %1 = bitcast i8 %__U to <8 x i1>
2459 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
2463 define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2464 ; CHECK-LABEL: test_mm512_permutex2var_ps:
2465 ; CHECK: # %bb.0: # %entry
2466 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
2467 ; CHECK-NEXT: ret{{[l|q]}}
2469 %0 = bitcast <8 x i64> %__I to <16 x i32>
2470 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2474 define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
2475 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
2476 ; X86: # %bb.0: # %entry
2477 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2478 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2481 ; X64-LABEL: test_mm512_mask_permutex2var_ps:
2482 ; X64: # %bb.0: # %entry
2483 ; X64-NEXT: kmovw %edi, %k1
2484 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2487 %0 = bitcast <8 x i64> %__I to <16 x i32>
2488 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2489 %2 = bitcast i16 %__U to <16 x i1>
2490 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
2494 define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2495 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
2496 ; X86: # %bb.0: # %entry
2497 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2498 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2501 ; X64-LABEL: test_mm512_maskz_permutex2var_ps:
2502 ; X64: # %bb.0: # %entry
2503 ; X64-NEXT: kmovw %edi, %k1
2504 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2507 %0 = bitcast <8 x i64> %__I to <16 x i32>
2508 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2509 %2 = bitcast i16 %__U to <16 x i1>
2510 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2514 define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2515 ; CHECK-LABEL: test_mm512_permutex2var_epi64:
2516 ; CHECK: # %bb.0: # %entry
2517 ; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
2518 ; CHECK-NEXT: ret{{[l|q]}}
2520 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2524 define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2525 ; X86-LABEL: test_mm512_mask_permutex2var_epi64:
2526 ; X86: # %bb.0: # %entry
2527 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2528 ; X86-NEXT: kmovw %eax, %k1
2529 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2532 ; X64-LABEL: test_mm512_mask_permutex2var_epi64:
2533 ; X64: # %bb.0: # %entry
2534 ; X64-NEXT: kmovw %edi, %k1
2535 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2538 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2539 %1 = bitcast i8 %__U to <8 x i1>
2540 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2544 define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2545 ; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
2546 ; X86: # %bb.0: # %entry
2547 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2548 ; X86-NEXT: kmovw %eax, %k1
2549 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2552 ; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
2553 ; X64: # %bb.0: # %entry
2554 ; X64-NEXT: kmovw %edi, %k1
2555 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2558 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2559 %1 = bitcast i8 %__U to <8 x i1>
2560 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2563 define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2564 ; X86-LABEL: test_mm_mask_add_ss:
2565 ; X86: # %bb.0: # %entry
2566 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2567 ; X86-NEXT: kmovw %eax, %k1
2568 ; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2571 ; X64-LABEL: test_mm_mask_add_ss:
2572 ; X64: # %bb.0: # %entry
2573 ; X64-NEXT: kmovw %edi, %k1
2574 ; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2577 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2578 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2579 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2581 %tobool.i = icmp eq i8 %0, 0
2582 %vecext1.i = extractelement <4 x float> %__W, i32 0
2583 %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
2584 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2585 ret <4 x float> %vecins.i
2588 define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2589 ; X86-LABEL: test_mm_maskz_add_ss:
2590 ; X86: # %bb.0: # %entry
2591 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2592 ; X86-NEXT: kmovw %eax, %k1
2593 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2596 ; X64-LABEL: test_mm_maskz_add_ss:
2597 ; X64: # %bb.0: # %entry
2598 ; X64-NEXT: kmovw %edi, %k1
2599 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2602 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2603 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2604 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2606 %tobool.i = icmp eq i8 %0, 0
2607 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
2608 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2609 ret <4 x float> %vecins.i
2612 define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2613 ; X86-LABEL: test_mm_mask_add_sd:
2614 ; X86: # %bb.0: # %entry
2615 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2616 ; X86-NEXT: kmovw %eax, %k1
2617 ; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2620 ; X64-LABEL: test_mm_mask_add_sd:
2621 ; X64: # %bb.0: # %entry
2622 ; X64-NEXT: kmovw %edi, %k1
2623 ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2626 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2627 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2628 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2630 %tobool.i = icmp eq i8 %0, 0
2631 %vecext1.i = extractelement <2 x double> %__W, i32 0
2632 %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
2633 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2634 ret <2 x double> %vecins.i
2637 define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2638 ; X86-LABEL: test_mm_maskz_add_sd:
2639 ; X86: # %bb.0: # %entry
2640 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2641 ; X86-NEXT: kmovw %eax, %k1
2642 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2645 ; X64-LABEL: test_mm_maskz_add_sd:
2646 ; X64: # %bb.0: # %entry
2647 ; X64-NEXT: kmovw %edi, %k1
2648 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2651 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2652 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2653 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2655 %tobool.i = icmp eq i8 %0, 0
2656 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
2657 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2658 ret <2 x double> %vecins.i
2661 define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2662 ; X86-LABEL: test_mm_mask_sub_ss:
2663 ; X86: # %bb.0: # %entry
2664 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2665 ; X86-NEXT: kmovw %eax, %k1
2666 ; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2669 ; X64-LABEL: test_mm_mask_sub_ss:
2670 ; X64: # %bb.0: # %entry
2671 ; X64-NEXT: kmovw %edi, %k1
2672 ; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2675 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2676 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2677 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2679 %tobool.i = icmp eq i8 %0, 0
2680 %vecext1.i = extractelement <4 x float> %__W, i32 0
2681 %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
2682 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2683 ret <4 x float> %vecins.i
2686 define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2687 ; X86-LABEL: test_mm_maskz_sub_ss:
2688 ; X86: # %bb.0: # %entry
2689 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2690 ; X86-NEXT: kmovw %eax, %k1
2691 ; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2694 ; X64-LABEL: test_mm_maskz_sub_ss:
2695 ; X64: # %bb.0: # %entry
2696 ; X64-NEXT: kmovw %edi, %k1
2697 ; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2700 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2701 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2702 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2704 %tobool.i = icmp eq i8 %0, 0
2705 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
2706 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2707 ret <4 x float> %vecins.i
2710 define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2711 ; X86-LABEL: test_mm_mask_sub_sd:
2712 ; X86: # %bb.0: # %entry
2713 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2714 ; X86-NEXT: kmovw %eax, %k1
2715 ; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2718 ; X64-LABEL: test_mm_mask_sub_sd:
2719 ; X64: # %bb.0: # %entry
2720 ; X64-NEXT: kmovw %edi, %k1
2721 ; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2724 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2725 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2726 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2728 %tobool.i = icmp eq i8 %0, 0
2729 %vecext1.i = extractelement <2 x double> %__W, i32 0
2730 %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
2731 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2732 ret <2 x double> %vecins.i
2735 define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2736 ; X86-LABEL: test_mm_maskz_sub_sd:
2737 ; X86: # %bb.0: # %entry
2738 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2739 ; X86-NEXT: kmovw %eax, %k1
2740 ; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2743 ; X64-LABEL: test_mm_maskz_sub_sd:
2744 ; X64: # %bb.0: # %entry
2745 ; X64-NEXT: kmovw %edi, %k1
2746 ; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2749 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2750 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2751 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2753 %tobool.i = icmp eq i8 %0, 0
2754 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
2755 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2756 ret <2 x double> %vecins.i
2759 define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2760 ; X86-LABEL: test_mm_mask_mul_ss:
2761 ; X86: # %bb.0: # %entry
2762 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2763 ; X86-NEXT: kmovw %eax, %k1
2764 ; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2767 ; X64-LABEL: test_mm_mask_mul_ss:
2768 ; X64: # %bb.0: # %entry
2769 ; X64-NEXT: kmovw %edi, %k1
2770 ; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2773 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2774 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2775 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2777 %tobool.i = icmp eq i8 %0, 0
2778 %vecext1.i = extractelement <4 x float> %__W, i32 0
2779 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
2780 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2781 ret <4 x float> %vecins.i
2784 define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2785 ; X86-LABEL: test_mm_maskz_mul_ss:
2786 ; X86: # %bb.0: # %entry
2787 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2788 ; X86-NEXT: kmovw %eax, %k1
2789 ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2792 ; X64-LABEL: test_mm_maskz_mul_ss:
2793 ; X64: # %bb.0: # %entry
2794 ; X64-NEXT: kmovw %edi, %k1
2795 ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2798 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2799 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2800 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2802 %tobool.i = icmp eq i8 %0, 0
2803 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
2804 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2805 ret <4 x float> %vecins.i
2808 define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2809 ; X86-LABEL: test_mm_mask_mul_sd:
2810 ; X86: # %bb.0: # %entry
2811 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2812 ; X86-NEXT: kmovw %eax, %k1
2813 ; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2816 ; X64-LABEL: test_mm_mask_mul_sd:
2817 ; X64: # %bb.0: # %entry
2818 ; X64-NEXT: kmovw %edi, %k1
2819 ; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2822 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2823 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2824 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2826 %tobool.i = icmp eq i8 %0, 0
2827 %vecext1.i = extractelement <2 x double> %__W, i32 0
2828 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
2829 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2830 ret <2 x double> %vecins.i
2833 define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2834 ; X86-LABEL: test_mm_maskz_mul_sd:
2835 ; X86: # %bb.0: # %entry
2836 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2837 ; X86-NEXT: kmovw %eax, %k1
2838 ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2841 ; X64-LABEL: test_mm_maskz_mul_sd:
2842 ; X64: # %bb.0: # %entry
2843 ; X64-NEXT: kmovw %edi, %k1
2844 ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2847 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2848 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2849 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2851 %tobool.i = icmp eq i8 %0, 0
2852 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
2853 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2854 ret <2 x double> %vecins.i
2857 define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2858 ; X86-LABEL: test_mm_mask_div_ss:
2859 ; X86: # %bb.0: # %entry
2860 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2861 ; X86-NEXT: kmovw %eax, %k1
2862 ; X86-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1}
2865 ; X64-LABEL: test_mm_mask_div_ss:
2866 ; X64: # %bb.0: # %entry
2867 ; X64-NEXT: kmovw %edi, %k1
2868 ; X64-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1}
2871 %0 = extractelement <4 x float> %__A, i64 0
2872 %1 = extractelement <4 x float> %__B, i64 0
2873 %2 = extractelement <4 x float> %__W, i64 0
2874 %3 = fdiv float %0, %1
2875 %4 = bitcast i8 %__U to <8 x i1>
2876 %5 = extractelement <8 x i1> %4, i64 0
2877 %6 = select i1 %5, float %3, float %2
2878 %7 = insertelement <4 x float> %__A, float %6, i64 0
2882 define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2883 ; X86-LABEL: test_mm_maskz_div_ss:
2884 ; X86: # %bb.0: # %entry
2885 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2886 ; X86-NEXT: kmovw %eax, %k1
2887 ; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
2890 ; X64-LABEL: test_mm_maskz_div_ss:
2891 ; X64: # %bb.0: # %entry
2892 ; X64-NEXT: kmovw %edi, %k1
2893 ; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
2896 %0 = extractelement <4 x float> %__A, i64 0
2897 %1 = extractelement <4 x float> %__B, i64 0
2898 %2 = fdiv float %0, %1
2899 %3 = bitcast i8 %__U to <8 x i1>
2900 %4 = extractelement <8 x i1> %3, i64 0
2901 %5 = select i1 %4, float %2, float 0.000000e+00
2902 %6 = insertelement <4 x float> %__A, float %5, i64 0
2906 define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2907 ; X86-LABEL: test_mm_mask_div_sd:
2908 ; X86: # %bb.0: # %entry
2909 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2910 ; X86-NEXT: kmovw %eax, %k1
2911 ; X86-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1}
2914 ; X64-LABEL: test_mm_mask_div_sd:
2915 ; X64: # %bb.0: # %entry
2916 ; X64-NEXT: kmovw %edi, %k1
2917 ; X64-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1}
2920 %0 = extractelement <2 x double> %__A, i64 0
2921 %1 = extractelement <2 x double> %__B, i64 0
2922 %2 = extractelement <2 x double> %__W, i64 0
2923 %3 = fdiv double %0, %1
2924 %4 = bitcast i8 %__U to <8 x i1>
2925 %5 = extractelement <8 x i1> %4, i64 0
2926 %6 = select i1 %5, double %3, double %2
2927 %7 = insertelement <2 x double> %__A, double %6, i64 0
2931 define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2932 ; X86-LABEL: test_mm_maskz_div_sd:
2933 ; X86: # %bb.0: # %entry
2934 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2935 ; X86-NEXT: kmovw %eax, %k1
2936 ; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2939 ; X64-LABEL: test_mm_maskz_div_sd:
2940 ; X64: # %bb.0: # %entry
2941 ; X64-NEXT: kmovw %edi, %k1
2942 ; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2945 %0 = extractelement <2 x double> %__A, i64 0
2946 %1 = extractelement <2 x double> %__B, i64 0
2947 %2 = fdiv double %0, %1
2948 %3 = bitcast i8 %__U to <8 x i1>
2949 %4 = extractelement <8 x i1> %3, i64 0
2950 %5 = select i1 %4, double %2, double 0.000000e+00
2951 %6 = insertelement <2 x double> %__A, double %5, i64 0
2956 define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
2957 ; CHECK-LABEL: test_mm512_fmadd_round_pd:
2958 ; CHECK: # %bb.0: # %entry
2959 ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
2960 ; CHECK-NEXT: ret{{[l|q]}}
2962 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
2966 declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
2968 define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
2969 ; X86-LABEL: test_mm512_mask_fmadd_round_pd:
2970 ; X86: # %bb.0: # %entry
2971 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2972 ; X86-NEXT: kmovw %eax, %k1
2973 ; X86-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
2976 ; X64-LABEL: test_mm512_mask_fmadd_round_pd:
2977 ; X64: # %bb.0: # %entry
2978 ; X64-NEXT: kmovw %edi, %k1
2979 ; X64-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
2982 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
2983 %1 = bitcast i8 %__U to <8 x i1>
2984 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
2988 define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
2989 ; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
2990 ; X86: # %bb.0: # %entry
2991 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2992 ; X86-NEXT: kmovw %eax, %k1
2993 ; X86-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2994 ; X86-NEXT: vmovapd %zmm2, %zmm0
2997 ; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
2998 ; X64: # %bb.0: # %entry
2999 ; X64-NEXT: kmovw %edi, %k1
3000 ; X64-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3001 ; X64-NEXT: vmovapd %zmm2, %zmm0
3004 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3005 %1 = bitcast i8 %__U to <8 x i1>
3006 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3010 define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3011 ; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
3012 ; X86: # %bb.0: # %entry
3013 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3014 ; X86-NEXT: kmovw %eax, %k1
3015 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3018 ; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
3019 ; X64: # %bb.0: # %entry
3020 ; X64-NEXT: kmovw %edi, %k1
3021 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3024 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3025 %1 = bitcast i8 %__U to <8 x i1>
3026 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3030 define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3031 ; X86-LABEL: test_mm512_fmsub_round_pd:
3032 ; X86: # %bb.0: # %entry
3033 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
3034 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3037 ; X64-LABEL: test_mm512_fmsub_round_pd:
3038 ; X64: # %bb.0: # %entry
3039 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3040 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3043 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3044 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3048 define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3049 ; X86-LABEL: test_mm512_mask_fmsub_round_pd:
3050 ; X86: # %bb.0: # %entry
3051 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3052 ; X86-NEXT: kmovw %eax, %k1
3053 ; X86-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3056 ; X64-LABEL: test_mm512_mask_fmsub_round_pd:
3057 ; X64: # %bb.0: # %entry
3058 ; X64-NEXT: kmovw %edi, %k1
3059 ; X64-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3062 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3063 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3064 %1 = bitcast i8 %__U to <8 x i1>
3065 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3069 define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3070 ; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
3071 ; X86: # %bb.0: # %entry
3072 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3073 ; X86-NEXT: kmovw %eax, %k1
3074 ; X86-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3077 ; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
3078 ; X64: # %bb.0: # %entry
3079 ; X64-NEXT: kmovw %edi, %k1
3080 ; X64-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3083 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3084 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3085 %1 = bitcast i8 %__U to <8 x i1>
3086 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3090 define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3091 ; X86-LABEL: test_mm512_fnmadd_round_pd:
3092 ; X86: # %bb.0: # %entry
3093 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
3094 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3097 ; X64-LABEL: test_mm512_fnmadd_round_pd:
3098 ; X64: # %bb.0: # %entry
3099 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
3100 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3103 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3104 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3108 define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3109 ; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
3110 ; X86: # %bb.0: # %entry
3111 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3112 ; X86-NEXT: kmovw %eax, %k1
3113 ; X86-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3114 ; X86-NEXT: vmovapd %zmm2, %zmm0
3117 ; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
3118 ; X64: # %bb.0: # %entry
3119 ; X64-NEXT: kmovw %edi, %k1
3120 ; X64-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3121 ; X64-NEXT: vmovapd %zmm2, %zmm0
3124 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3125 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3126 %1 = bitcast i8 %__U to <8 x i1>
3127 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3131 define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3132 ; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
3133 ; X86: # %bb.0: # %entry
3134 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3135 ; X86-NEXT: kmovw %eax, %k1
3136 ; X86-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3139 ; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
3140 ; X64: # %bb.0: # %entry
3141 ; X64-NEXT: kmovw %edi, %k1
3142 ; X64-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3145 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3146 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3147 %1 = bitcast i8 %__U to <8 x i1>
3148 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3152 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3153 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
3154 ; CHECK: # %bb.0: # %entry
3155 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
3156 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3157 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3158 ; CHECK-NEXT: vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
3159 ; CHECK-NEXT: ret{{[l|q]}}
3161 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3162 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3163 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
3167 define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3168 ; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
3169 ; X86: # %bb.0: # %entry
3170 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3171 ; X86-NEXT: kmovw %eax, %k1
3172 ; X86-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3175 ; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
3176 ; X64: # %bb.0: # %entry
3177 ; X64-NEXT: kmovw %edi, %k1
3178 ; X64-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3181 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3182 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3183 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
3184 %1 = bitcast i8 %__U to <8 x i1>
3185 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3189 define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3190 ; CHECK-LABEL: test_mm512_fmadd_pd:
3191 ; CHECK: # %bb.0: # %entry
3192 ; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3193 ; CHECK-NEXT: ret{{[l|q]}}
3195 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3199 define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3200 ; X86-LABEL: test_mm512_mask_fmadd_pd:
3201 ; X86: # %bb.0: # %entry
3202 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3203 ; X86-NEXT: kmovw %eax, %k1
3204 ; X86-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3207 ; X64-LABEL: test_mm512_mask_fmadd_pd:
3208 ; X64: # %bb.0: # %entry
3209 ; X64-NEXT: kmovw %edi, %k1
3210 ; X64-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3213 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3214 %1 = bitcast i8 %__U to <8 x i1>
3215 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3219 define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3220 ; X86-LABEL: test_mm512_mask3_fmadd_pd:
3221 ; X86: # %bb.0: # %entry
3222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3223 ; X86-NEXT: kmovw %eax, %k1
3224 ; X86-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3225 ; X86-NEXT: vmovapd %zmm2, %zmm0
3228 ; X64-LABEL: test_mm512_mask3_fmadd_pd:
3229 ; X64: # %bb.0: # %entry
3230 ; X64-NEXT: kmovw %edi, %k1
3231 ; X64-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3232 ; X64-NEXT: vmovapd %zmm2, %zmm0
3235 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3236 %1 = bitcast i8 %__U to <8 x i1>
3237 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3241 define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3242 ; X86-LABEL: test_mm512_maskz_fmadd_pd:
3243 ; X86: # %bb.0: # %entry
3244 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3245 ; X86-NEXT: kmovw %eax, %k1
3246 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3249 ; X64-LABEL: test_mm512_maskz_fmadd_pd:
3250 ; X64: # %bb.0: # %entry
3251 ; X64-NEXT: kmovw %edi, %k1
3252 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3255 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3256 %1 = bitcast i8 %__U to <8 x i1>
3257 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
; fmsub expressed as fma(A,B,-C): C is negated via fsub from -0.0, which lowers to a
; sign-bit vpxorq plus vfmadd213pd. Autogenerated CHECK lines — regenerate, do not hand-edit.
3261 define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3262 ; X86-LABEL: test_mm512_fmsub_pd:
3263 ; X86: # %bb.0: # %entry
3264 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
3265 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3268 ; X64-LABEL: test_mm512_fmsub_pd:
3269 ; X64: # %bb.0: # %entry
3270 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3271 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3274 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3275 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
; masked fmsub: fma(A,B,-C) merged into A (select passthru = %__A); the negate+fma
; folds to a single masked vfmsub132pd. Autogenerated CHECK lines — regenerate, do not hand-edit.
3279 define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3280 ; X86-LABEL: test_mm512_mask_fmsub_pd:
3281 ; X86: # %bb.0: # %entry
3282 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3283 ; X86-NEXT: kmovw %eax, %k1
3284 ; X86-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3287 ; X64-LABEL: test_mm512_mask_fmsub_pd:
3288 ; X64: # %bb.0: # %entry
3289 ; X64-NEXT: kmovw %edi, %k1
3290 ; X64-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3293 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3294 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3295 %1 = bitcast i8 %__U to <8 x i1>
3296 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
; zero-masked fmsub: fma(A,B,-C) with zeroinitializer passthru; folds to vfmsub213pd.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3300 define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3301 ; X86-LABEL: test_mm512_maskz_fmsub_pd:
3302 ; X86: # %bb.0: # %entry
3303 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3304 ; X86-NEXT: kmovw %eax, %k1
3305 ; X86-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3308 ; X64-LABEL: test_mm512_maskz_fmsub_pd:
3309 ; X64: # %bb.0: # %entry
3310 ; X64-NEXT: kmovw %edi, %k1
3311 ; X64-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3314 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3315 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3316 %1 = bitcast i8 %__U to <8 x i1>
3317 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
; fnmadd expressed as fma(-A,B,C): A negated via sign-bit xor, then vfmadd213pd.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3321 define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3322 ; X86-LABEL: test_mm512_fnmadd_pd:
3323 ; X86: # %bb.0: # %entry
3324 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
3325 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3328 ; X64-LABEL: test_mm512_fnmadd_pd:
3329 ; X64: # %bb.0: # %entry
3330 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
3331 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3334 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3335 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
; mask3 fnmadd: fma(-A,B,C) merged into C; negate folds into masked vfnmadd231pd.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3339 define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3340 ; X86-LABEL: test_mm512_mask3_fnmadd_pd:
3341 ; X86: # %bb.0: # %entry
3342 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3343 ; X86-NEXT: kmovw %eax, %k1
3344 ; X86-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3345 ; X86-NEXT: vmovapd %zmm2, %zmm0
3348 ; X64-LABEL: test_mm512_mask3_fnmadd_pd:
3349 ; X64: # %bb.0: # %entry
3350 ; X64-NEXT: kmovw %edi, %k1
3351 ; X64-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3352 ; X64-NEXT: vmovapd %zmm2, %zmm0
3355 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3356 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3357 %1 = bitcast i8 %__U to <8 x i1>
3358 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
; zero-masked fnmadd: fma(-A,B,C) with zero passthru; folds to vfnmadd213pd.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3362 define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3363 ; X86-LABEL: test_mm512_maskz_fnmadd_pd:
3364 ; X86: # %bb.0: # %entry
3365 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3366 ; X86-NEXT: kmovw %eax, %k1
3367 ; X86-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3370 ; X64-LABEL: test_mm512_maskz_fnmadd_pd:
3371 ; X64: # %bb.0: # %entry
3372 ; X64-NEXT: kmovw %edi, %k1
3373 ; X64-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3376 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3377 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3378 %1 = bitcast i8 %__U to <8 x i1>
3379 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
; fnmsub: both A and C negated (fma(-A,B,-C)); lowers to a broadcast -0.0 constant,
; two vpxorq sign flips, then vfmadd231pd. Same CHECK on both targets (CHECK prefix).
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3383 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3384 ; CHECK-LABEL: test_mm512_fnmsub_pd:
3385 ; CHECK: # %bb.0: # %entry
3386 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
3387 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3388 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3389 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
3390 ; CHECK-NEXT: ret{{[l|q]}}
3392 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3393 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3394 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
; zero-masked fnmsub: fma(-A,B,-C) with zero passthru; both negates fold into vfnmsub213pd.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3398 define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3399 ; X86-LABEL: test_mm512_maskz_fnmsub_pd:
3400 ; X86: # %bb.0: # %entry
3401 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3402 ; X86-NEXT: kmovw %eax, %k1
3403 ; X86-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3406 ; X64-LABEL: test_mm512_maskz_fnmsub_pd:
3407 ; X64: # %bb.0: # %entry
3408 ; X64-NEXT: kmovw %edi, %k1
3409 ; X64-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3412 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3413 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3414 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
3415 %1 = bitcast i8 %__U to <8 x i1>
3416 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3420 define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3421 ; CHECK-LABEL: test_mm512_fmadd_round_ps:
3422 ; CHECK: # %bb.0: # %entry
3423 ; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3424 ; CHECK-NEXT: ret{{[l|q]}}
3426 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3430 declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
; masked rounding fmadd: i16 mask, passthru = %__A; expects masked vfmadd132ps {rn-sae}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3432 define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3433 ; X86-LABEL: test_mm512_mask_fmadd_round_ps:
3434 ; X86: # %bb.0: # %entry
3435 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3436 ; X86-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3439 ; X64-LABEL: test_mm512_mask_fmadd_round_ps:
3440 ; X64: # %bb.0: # %entry
3441 ; X64-NEXT: kmovw %edi, %k1
3442 ; X64-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3445 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3446 %1 = bitcast i16 %__U to <16 x i1>
3447 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
; mask3 rounding fmadd: passthru = %__C; expects masked vfmadd231ps {rn-sae} and a
; vmovaps out of zmm2. Autogenerated CHECK lines — regenerate, do not hand-edit.
3451 define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3452 ; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
3453 ; X86: # %bb.0: # %entry
3454 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3455 ; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3456 ; X86-NEXT: vmovaps %zmm2, %zmm0
3459 ; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
3460 ; X64: # %bb.0: # %entry
3461 ; X64-NEXT: kmovw %edi, %k1
3462 ; X64-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3463 ; X64-NEXT: vmovaps %zmm2, %zmm0
3466 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3467 %1 = bitcast i16 %__U to <16 x i1>
3468 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
; zero-masked rounding fmadd: expects vfmadd213ps {rn-sae} with {%k1} {z}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3472 define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3473 ; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
3474 ; X86: # %bb.0: # %entry
3475 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3476 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3479 ; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
3480 ; X64: # %bb.0: # %entry
3481 ; X64-NEXT: kmovw %edi, %k1
3482 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3485 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3486 %1 = bitcast i16 %__U to <16 x i1>
3487 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; rounding fmsub via fma(A,B,-C): C negated by vpxord sign flip, then vfmadd213ps {rn-sae}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3491 define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3492 ; X86-LABEL: test_mm512_fmsub_round_ps:
3493 ; X86: # %bb.0: # %entry
3494 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
3495 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3498 ; X64-LABEL: test_mm512_fmsub_round_ps:
3499 ; X64: # %bb.0: # %entry
3500 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
3501 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3504 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3505 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
; masked rounding fmsub: negate folds into masked vfmsub132ps {rn-sae} (passthru = %__A).
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3509 define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3510 ; X86-LABEL: test_mm512_mask_fmsub_round_ps:
3511 ; X86: # %bb.0: # %entry
3512 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3513 ; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3516 ; X64-LABEL: test_mm512_mask_fmsub_round_ps:
3517 ; X64: # %bb.0: # %entry
3518 ; X64-NEXT: kmovw %edi, %k1
3519 ; X64-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3522 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3523 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3524 %1 = bitcast i16 %__U to <16 x i1>
3525 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
; zero-masked rounding fmsub: expects vfmsub213ps {rn-sae} with {%k1} {z}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3529 define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3530 ; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
3531 ; X86: # %bb.0: # %entry
3532 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3533 ; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3536 ; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
3537 ; X64: # %bb.0: # %entry
3538 ; X64-NEXT: kmovw %edi, %k1
3539 ; X64-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3542 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3543 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3544 %1 = bitcast i16 %__U to <16 x i1>
3545 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; rounding fnmadd via fma(-A,B,C): A negated by vpxord, then vfmadd213ps {rn-sae}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3549 define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3550 ; X86-LABEL: test_mm512_fnmadd_round_ps:
3551 ; X86: # %bb.0: # %entry
3552 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
3553 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3556 ; X64-LABEL: test_mm512_fnmadd_round_ps:
3557 ; X64: # %bb.0: # %entry
3558 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
3559 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3562 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3563 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
; mask3 rounding fnmadd: negate folds into masked vfnmadd231ps {rn-sae} (passthru = %__C).
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3567 define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3568 ; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
3569 ; X86: # %bb.0: # %entry
3570 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3571 ; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3572 ; X86-NEXT: vmovaps %zmm2, %zmm0
3575 ; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
3576 ; X64: # %bb.0: # %entry
3577 ; X64-NEXT: kmovw %edi, %k1
3578 ; X64-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3579 ; X64-NEXT: vmovaps %zmm2, %zmm0
3582 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3583 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3584 %1 = bitcast i16 %__U to <16 x i1>
3585 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
; zero-masked rounding fnmadd: expects vfnmadd213ps {rn-sae} with {%k1} {z}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3589 define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3590 ; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
3591 ; X86: # %bb.0: # %entry
3592 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3593 ; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3596 ; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
3597 ; X64: # %bb.0: # %entry
3598 ; X64-NEXT: kmovw %edi, %k1
3599 ; X64-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3602 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3603 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3604 %1 = bitcast i16 %__U to <16 x i1>
3605 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; rounding fnmsub: both operands negated via a broadcast -0.0 and two vpxorq sign flips,
; then vfmadd231ps {rn-sae}. Autogenerated CHECK lines — regenerate, do not hand-edit.
3609 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3610 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
3611 ; CHECK: # %bb.0: # %entry
3612 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
3613 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3614 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3615 ; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
3616 ; CHECK-NEXT: ret{{[l|q]}}
3618 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3619 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3620 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
; zero-masked rounding fnmsub: both negates fold into vfnmsub213ps {rn-sae} {%k1} {z}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3624 define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3625 ; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
3626 ; X86: # %bb.0: # %entry
3627 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3628 ; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3631 ; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
3632 ; X64: # %bb.0: # %entry
3633 ; X64-NEXT: kmovw %edi, %k1
3634 ; X64-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3637 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3638 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3639 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
3640 %1 = bitcast i16 %__U to <16 x i1>
3641 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; plain float fmadd: generic llvm.fma.v16f32 lowers to vfmadd213ps.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3645 define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3646 ; CHECK-LABEL: test_mm512_fmadd_ps:
3647 ; CHECK: # %bb.0: # %entry
3648 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3649 ; CHECK-NEXT: ret{{[l|q]}}
3651 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
; masked float fmadd: i16 mask, passthru = %__A; expects vfmadd132ps.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3655 define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3656 ; X86-LABEL: test_mm512_mask_fmadd_ps:
3657 ; X86: # %bb.0: # %entry
3658 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3659 ; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3662 ; X64-LABEL: test_mm512_mask_fmadd_ps:
3663 ; X64: # %bb.0: # %entry
3664 ; X64-NEXT: kmovw %edi, %k1
3665 ; X64-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3668 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3669 %1 = bitcast i16 %__U to <16 x i1>
3670 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
; mask3 float fmadd: passthru = %__C; expects vfmadd231ps and vmovaps out of zmm2.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3674 define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3675 ; X86-LABEL: test_mm512_mask3_fmadd_ps:
3676 ; X86: # %bb.0: # %entry
3677 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3678 ; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3679 ; X86-NEXT: vmovaps %zmm2, %zmm0
3682 ; X64-LABEL: test_mm512_mask3_fmadd_ps:
3683 ; X64: # %bb.0: # %entry
3684 ; X64-NEXT: kmovw %edi, %k1
3685 ; X64-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3686 ; X64-NEXT: vmovaps %zmm2, %zmm0
3689 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3690 %1 = bitcast i16 %__U to <16 x i1>
3691 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
; zero-masked float fmadd: zero passthru; expects vfmadd213ps.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3695 define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3696 ; X86-LABEL: test_mm512_maskz_fmadd_ps:
3697 ; X86: # %bb.0: # %entry
3698 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3699 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3702 ; X64-LABEL: test_mm512_maskz_fmadd_ps:
3703 ; X64: # %bb.0: # %entry
3704 ; X64-NEXT: kmovw %edi, %k1
3705 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3708 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3709 %1 = bitcast i16 %__U to <16 x i1>
3710 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; float fmsub via fma(A,B,-C): C sign-flipped with vpxord, then vfmadd213ps.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3714 define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3715 ; X86-LABEL: test_mm512_fmsub_ps:
3716 ; X86: # %bb.0: # %entry
3717 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
3718 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3721 ; X64-LABEL: test_mm512_fmsub_ps:
3722 ; X64: # %bb.0: # %entry
3723 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
3724 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3727 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3728 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
; masked float fmsub: negate folds into vfmsub132ps (passthru = %__A).
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3732 define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3733 ; X86-LABEL: test_mm512_mask_fmsub_ps:
3734 ; X86: # %bb.0: # %entry
3735 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3736 ; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3739 ; X64-LABEL: test_mm512_mask_fmsub_ps:
3740 ; X64: # %bb.0: # %entry
3741 ; X64-NEXT: kmovw %edi, %k1
3742 ; X64-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3745 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3746 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3747 %1 = bitcast i16 %__U to <16 x i1>
3748 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
; zero-masked float fmsub: expects vfmsub213ps with zero passthru.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3752 define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3753 ; X86-LABEL: test_mm512_maskz_fmsub_ps:
3754 ; X86: # %bb.0: # %entry
3755 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3756 ; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3759 ; X64-LABEL: test_mm512_maskz_fmsub_ps:
3760 ; X64: # %bb.0: # %entry
3761 ; X64-NEXT: kmovw %edi, %k1
3762 ; X64-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3765 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3766 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3767 %1 = bitcast i16 %__U to <16 x i1>
3768 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; float fnmadd via fma(-A,B,C): A sign-flipped with vpxord, then vfmadd213ps.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3772 define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3773 ; X86-LABEL: test_mm512_fnmadd_ps:
3774 ; X86: # %bb.0: # %entry
3775 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
3776 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3779 ; X64-LABEL: test_mm512_fnmadd_ps:
3780 ; X64: # %bb.0: # %entry
3781 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
3782 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3785 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3786 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
; mask3 float fnmadd: negate folds into vfnmadd231ps, result copied out of zmm2.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3790 define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3791 ; X86-LABEL: test_mm512_mask3_fnmadd_ps:
3792 ; X86: # %bb.0: # %entry
3793 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3794 ; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3795 ; X86-NEXT: vmovaps %zmm2, %zmm0
3798 ; X64-LABEL: test_mm512_mask3_fnmadd_ps:
3799 ; X64: # %bb.0: # %entry
3800 ; X64-NEXT: kmovw %edi, %k1
3801 ; X64-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3802 ; X64-NEXT: vmovaps %zmm2, %zmm0
3805 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3806 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3807 %1 = bitcast i16 %__U to <16 x i1>
3808 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
; zero-masked float fnmadd: expects vfnmadd213ps with zero passthru.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3812 define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3813 ; X86-LABEL: test_mm512_maskz_fnmadd_ps:
3814 ; X86: # %bb.0: # %entry
3815 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3816 ; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3819 ; X64-LABEL: test_mm512_maskz_fnmadd_ps:
3820 ; X64: # %bb.0: # %entry
3821 ; X64-NEXT: kmovw %edi, %k1
3822 ; X64-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3825 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3826 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3827 %1 = bitcast i16 %__U to <16 x i1>
3828 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; float fnmsub: A and C negated via broadcast -0.0 and two vpxorq flips, then vfmadd231ps.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3832 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3833 ; CHECK-LABEL: test_mm512_fnmsub_ps:
3834 ; CHECK: # %bb.0: # %entry
3835 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
3836 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3837 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3838 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
3839 ; CHECK-NEXT: ret{{[l|q]}}
3841 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3842 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3843 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
; zero-masked float fnmsub: both negates fold into vfnmsub213ps with zero passthru.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3847 define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3848 ; X86-LABEL: test_mm512_maskz_fnmsub_ps:
3849 ; X86: # %bb.0: # %entry
3850 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3851 ; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3854 ; X64-LABEL: test_mm512_maskz_fnmsub_ps:
3855 ; X64: # %bb.0: # %entry
3856 ; X64-NEXT: kmovw %edi, %k1
3857 ; X64-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3860 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3861 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3862 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
3863 %1 = bitcast i16 %__U to <16 x i1>
3864 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; fmaddsub with rounding control 8 ({rn-sae}): expects vfmaddsub213pd.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3868 define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3869 ; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
3870 ; CHECK: # %bb.0: # %entry
3871 ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3872 ; CHECK-NEXT: ret{{[l|q]}}
3874 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3878 declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
; masked rounding fmaddsub: i8 mask, passthru = %__A; expects masked vfmaddsub132pd {rn-sae}.
; Autogenerated CHECK lines — regenerate, do not hand-edit.
3880 define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3881 ; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
3882 ; X86: # %bb.0: # %entry
3883 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3884 ; X86-NEXT: kmovw %eax, %k1
3885 ; X86-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3888 ; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
3889 ; X64: # %bb.0: # %entry
3890 ; X64-NEXT: kmovw %edi, %k1
3891 ; X64-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3894 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3895 %1 = bitcast i8 %__U to <8 x i1>
3896 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
; mask3 variant: select merges over %__C, so codegen uses the 231 form accumulating into zmm2 under {%k1} and then copies zmm2 to the return register zmm0.
3900 define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3901 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
3902 ; X86: # %bb.0: # %entry
3903 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3904 ; X86-NEXT: kmovw %eax, %k1
3905 ; X86-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3906 ; X86-NEXT: vmovapd %zmm2, %zmm0
3909 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
3910 ; X64: # %bb.0: # %entry
3911 ; X64-NEXT: kmovw %edi, %k1
3912 ; X64-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3913 ; X64-NEXT: vmovapd %zmm2, %zmm0
3916 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3917 %1 = bitcast i8 %__U to <8 x i1>
3918 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
; Zero-masked variant: select against zeroinitializer maps to the {%k1} {z} encoding on the 213 form.
3922 define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3923 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
3924 ; X86: # %bb.0: # %entry
3925 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3926 ; X86-NEXT: kmovw %eax, %k1
3927 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3930 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
3931 ; X64: # %bb.0: # %entry
3932 ; X64-NEXT: kmovw %edi, %k1
3933 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3936 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3937 %1 = bitcast i8 %__U to <8 x i1>
3938 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
; fmsubadd is expressed as fmaddsub of a negated C (fsub from -0.0 splat); with no mask, codegen keeps the explicit sign-flip (vpxorq of the sign-bit constant) plus vfmaddsub213pd rather than folding to vfmsubadd.
3942 define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3943 ; X86-LABEL: test_mm512_fmsubadd_round_pd:
3944 ; X86: # %bb.0: # %entry
3945 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
3946 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3949 ; X64-LABEL: test_mm512_fmsubadd_round_pd:
3950 ; X64: # %bb.0: # %entry
3951 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3952 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3955 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3956 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
; Merge-masked fmsubadd: the negate-C + fmaddsub pattern folds into a single masked vfmsubadd132pd (result merged into %__A / zmm0).
3960 define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3961 ; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
3962 ; X86: # %bb.0: # %entry
3963 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3964 ; X86-NEXT: kmovw %eax, %k1
3965 ; X86-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3968 ; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
3969 ; X64: # %bb.0: # %entry
3970 ; X64-NEXT: kmovw %edi, %k1
3971 ; X64-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3974 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3975 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3976 %1 = bitcast i8 %__U to <8 x i1>
3977 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
; Zero-masked fmsubadd: folds to vfmsubadd213pd with {%k1} {z}.
3981 define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3982 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
3983 ; X86: # %bb.0: # %entry
3984 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3985 ; X86-NEXT: kmovw %eax, %k1
3986 ; X86-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3989 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
3990 ; X64: # %bb.0: # %entry
3991 ; X64-NEXT: kmovw %edi, %k1
3992 ; X64-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3995 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3996 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3997 %1 = bitcast i8 %__U to <8 x i1>
3998 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
; Non-rounding fmaddsub IR pattern: two llvm.fma calls (with +C and -C) whose results are blended by a shufflevector (even lanes from the -C fma, odd from +C); should be recognized as one vfmaddsub213pd.
4002 define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4003 ; CHECK-LABEL: test_mm512_fmaddsub_pd:
4004 ; CHECK: # %bb.0: # %entry
4005 ; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4006 ; CHECK-NEXT: ret{{[l|q]}}
4008 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4009 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4010 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4011 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
; Merge-masked fmaddsub (no rounding): the fma/fma/shuffle pattern plus select-over-%__A should become a masked vfmaddsub132pd into zmm0.
4015 define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4016 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
4017 ; X86: # %bb.0: # %entry
4018 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4019 ; X86-NEXT: kmovw %eax, %k1
4020 ; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4023 ; X64-LABEL: test_mm512_mask_fmaddsub_pd:
4024 ; X64: # %bb.0: # %entry
4025 ; X64-NEXT: kmovw %edi, %k1
4026 ; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4029 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4030 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4031 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4032 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4033 %4 = bitcast i8 %__U to <8 x i1>
4034 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
; mask3 fmaddsub (no rounding): merge over %__C selects the 231 form accumulating into zmm2, then zmm2 is copied to zmm0 for the return.
4038 define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4039 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
4040 ; X86: # %bb.0: # %entry
4041 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4042 ; X86-NEXT: kmovw %eax, %k1
4043 ; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4044 ; X86-NEXT: vmovapd %zmm2, %zmm0
4047 ; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
4048 ; X64: # %bb.0: # %entry
4049 ; X64-NEXT: kmovw %edi, %k1
4050 ; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4051 ; X64-NEXT: vmovapd %zmm2, %zmm0
4054 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4055 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4056 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4057 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4058 %4 = bitcast i8 %__U to <8 x i1>
4059 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
; Zero-masked fmaddsub (no rounding): select against zeroinitializer; NOTE(review): the CHECK lines here do not show {%k1} {z} on the vfmaddsub213pd even though k1 is loaded — retained verbatim as autogenerated.
4063 define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4064 ; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
4065 ; X86: # %bb.0: # %entry
4066 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4067 ; X86-NEXT: kmovw %eax, %k1
4068 ; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4071 ; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
4072 ; X64: # %bb.0: # %entry
4073 ; X64-NEXT: kmovw %edi, %k1
4074 ; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4077 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4078 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4079 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4080 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4081 %4 = bitcast i8 %__U to <8 x i1>
4082 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
; fmsubadd IR pattern: shuffle takes even lanes from the +C fma? No — here %1 (+C) supplies even lanes and %0 (-C) supplies odd, i.e. subtract on even / add on odd, matching vfmsubadd213pd's -/+ annotation.
4086 define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4087 ; CHECK-LABEL: test_mm512_fmsubadd_pd:
4088 ; CHECK: # %bb.0: # %entry
4089 ; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4090 ; CHECK-NEXT: ret{{[l|q]}}
4092 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4093 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4094 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4095 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
; Merge-masked fmsubadd (no rounding): select over %__A yields masked vfmsubadd132pd into zmm0.
4099 define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4100 ; X86-LABEL: test_mm512_mask_fmsubadd_pd:
4101 ; X86: # %bb.0: # %entry
4102 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4103 ; X86-NEXT: kmovw %eax, %k1
4104 ; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4107 ; X64-LABEL: test_mm512_mask_fmsubadd_pd:
4108 ; X64: # %bb.0: # %entry
4109 ; X64-NEXT: kmovw %edi, %k1
4110 ; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4113 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4114 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4115 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4116 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4117 %3 = bitcast i8 %__U to <8 x i1>
4118 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
; Zero-masked fmsubadd (no rounding): select against zeroinitializer; CHECK expects vfmsubadd213pd (zero-masking encoding not shown in the visible autogenerated lines).
4122 define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4123 ; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
4124 ; X86: # %bb.0: # %entry
4125 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4126 ; X86-NEXT: kmovw %eax, %k1
4127 ; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4130 ; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
4131 ; X64: # %bb.0: # %entry
4132 ; X64-NEXT: kmovw %edi, %k1
4133 ; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4136 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4137 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4138 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4139 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4140 %3 = bitcast i8 %__U to <8 x i1>
4141 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
; Single-precision counterpart of test_mm512_fmaddsub_round_pd: unmasked vfmaddsub213ps with {rn-sae} from the i32 8 rounding argument.
4145 define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4146 ; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
4147 ; CHECK: # %bb.0: # %entry
4148 ; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4149 ; CHECK-NEXT: ret{{[l|q]}}
4151 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4155 declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
; Merge-masked ps variant: i16 mask bitcast to <16 x i1>, merged into %__A; X86 loads the mask straight from the stack with kmovw.
4157 define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4158 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
4159 ; X86: # %bb.0: # %entry
4160 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4161 ; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4164 ; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
4165 ; X64: # %bb.0: # %entry
4166 ; X64-NEXT: kmovw %edi, %k1
4167 ; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4170 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4171 %1 = bitcast i16 %__U to <16 x i1>
4172 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
; mask3 ps variant: merge over %__C -> masked 231 form into zmm2, then vmovaps to zmm0.
4176 define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4177 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4178 ; X86: # %bb.0: # %entry
4179 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4180 ; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4181 ; X86-NEXT: vmovaps %zmm2, %zmm0
4184 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4185 ; X64: # %bb.0: # %entry
4186 ; X64-NEXT: kmovw %edi, %k1
4187 ; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4188 ; X64-NEXT: vmovaps %zmm2, %zmm0
4191 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4192 %1 = bitcast i16 %__U to <16 x i1>
4193 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
; Zero-masked ps variant: {%k1} {z} on the 213 form.
4197 define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4198 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4199 ; X86: # %bb.0: # %entry
4200 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4201 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4204 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4205 ; X64: # %bb.0: # %entry
4206 ; X64-NEXT: kmovw %edi, %k1
4207 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4210 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4211 %1 = bitcast i16 %__U to <16 x i1>
4212 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; Unmasked fmsubadd ps: C is negated via xor with the broadcast sign-bit constant (vpxord {1to16}) and fed to vfmaddsub213ps; no vfmsubadd folding here.
4216 define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4217 ; X86-LABEL: test_mm512_fmsubadd_round_ps:
4218 ; X86: # %bb.0: # %entry
4219 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
4220 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4223 ; X64-LABEL: test_mm512_fmsubadd_round_ps:
4224 ; X64: # %bb.0: # %entry
4225 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
4226 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4229 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4230 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
; Merge-masked fmsubadd ps: negate-C + fmaddsub folds into masked vfmsubadd132ps merging into zmm0.
4234 define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4235 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
4236 ; X86: # %bb.0: # %entry
4237 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4238 ; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4241 ; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
4242 ; X64: # %bb.0: # %entry
4243 ; X64-NEXT: kmovw %edi, %k1
4244 ; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4247 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4248 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4249 %1 = bitcast i16 %__U to <16 x i1>
4250 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
; Zero-masked fmsubadd ps: folds to vfmsubadd213ps with {%k1} {z}.
4254 define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4255 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4256 ; X86: # %bb.0: # %entry
4257 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4258 ; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4261 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4262 ; X64: # %bb.0: # %entry
4263 ; X64-NEXT: kmovw %edi, %k1
4264 ; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4267 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4268 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4269 %1 = bitcast i16 %__U to <16 x i1>
4270 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
; Non-rounding fmaddsub ps: fma(+C) and fma(-C) blended by shufflevector (even lanes from -C, odd from +C); recognized as one vfmaddsub213ps.
4274 define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4275 ; CHECK-LABEL: test_mm512_fmaddsub_ps:
4276 ; CHECK: # %bb.0: # %entry
4277 ; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4278 ; CHECK-NEXT: ret{{[l|q]}}
4280 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4281 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4282 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4283 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
; Merge-masked fmaddsub ps (no rounding): select over %__A -> masked vfmaddsub132ps into zmm0.
4287 define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4288 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
4289 ; X86: # %bb.0: # %entry
4290 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4291 ; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4294 ; X64-LABEL: test_mm512_mask_fmaddsub_ps:
4295 ; X64: # %bb.0: # %entry
4296 ; X64-NEXT: kmovw %edi, %k1
4297 ; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4300 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4301 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4302 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4303 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4304 %4 = bitcast i16 %__U to <16 x i1>
4305 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
; mask3 fmaddsub ps (no rounding): merge over %__C -> 231 form into zmm2, vmovaps to zmm0.
4309 define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4310 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
4311 ; X86: # %bb.0: # %entry
4312 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4313 ; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4314 ; X86-NEXT: vmovaps %zmm2, %zmm0
4317 ; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
4318 ; X64: # %bb.0: # %entry
4319 ; X64-NEXT: kmovw %edi, %k1
4320 ; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4321 ; X64-NEXT: vmovaps %zmm2, %zmm0
4324 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4325 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4326 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4327 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4328 %4 = bitcast i16 %__U to <16 x i1>
4329 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
; Zero-masked fmaddsub ps (no rounding): select against zeroinitializer; retained verbatim as autogenerated.
4333 define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4334 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
4335 ; X86: # %bb.0: # %entry
4336 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4337 ; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4340 ; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
4341 ; X64: # %bb.0: # %entry
4342 ; X64-NEXT: kmovw %edi, %k1
4343 ; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4346 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4347 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4348 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4349 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4350 %4 = bitcast i16 %__U to <16 x i1>
4351 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
; fmsubadd ps IR pattern: shuffle takes even lanes from the -C fma (%0 appears second) and odd lanes from the +C fma, matching vfmsubadd213ps's -/+ annotation.
4355 define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4356 ; CHECK-LABEL: test_mm512_fmsubadd_ps:
4357 ; CHECK: # %bb.0: # %entry
4358 ; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4359 ; CHECK-NEXT: ret{{[l|q]}}
4361 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4362 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4363 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4364 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
; Merge-masked fmsubadd ps (no rounding): select over %__A -> masked vfmsubadd132ps into zmm0.
4368 define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4369 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
4370 ; X86: # %bb.0: # %entry
4371 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4372 ; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4375 ; X64-LABEL: test_mm512_mask_fmsubadd_ps:
4376 ; X64: # %bb.0: # %entry
4377 ; X64-NEXT: kmovw %edi, %k1
4378 ; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4381 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4382 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4383 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4384 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4385 %3 = bitcast i16 %__U to <16 x i1>
4386 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
; Zero-masked fmsubadd ps (no rounding): select against zeroinitializer; CHECK expects vfmsubadd213ps.
4390 define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4391 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
4392 ; X86: # %bb.0: # %entry
4393 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4394 ; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4397 ; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
4398 ; X64: # %bb.0: # %entry
4399 ; X64-NEXT: kmovw %edi, %k1
4400 ; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4403 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4404 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4405 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4406 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4407 %3 = bitcast i16 %__U to <16 x i1>
4408 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
; mask3 fmsub pd with rounding: fmadd intrinsic on a negated C folds to masked vfmsub231pd into zmm2, then vmovapd to zmm0.
4412 define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4413 ; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
4414 ; X86: # %bb.0: # %entry
4415 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4416 ; X86-NEXT: kmovw %eax, %k1
4417 ; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4418 ; X86-NEXT: vmovapd %zmm2, %zmm0
4421 ; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
4422 ; X64: # %bb.0: # %entry
4423 ; X64-NEXT: kmovw %edi, %k1
4424 ; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4425 ; X64-NEXT: vmovapd %zmm2, %zmm0
4428 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4429 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4430 %1 = bitcast i8 %__U to <8 x i1>
4431 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
; mask3 fmsub pd without rounding: plain llvm.fma on negated C, merged over %__C -> masked vfmsub231pd.
4435 define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4436 ; X86-LABEL: test_mm512_mask3_fmsub_pd:
4437 ; X86: # %bb.0: # %entry
4438 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4439 ; X86-NEXT: kmovw %eax, %k1
4440 ; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4441 ; X86-NEXT: vmovapd %zmm2, %zmm0
4444 ; X64-LABEL: test_mm512_mask3_fmsub_pd:
4445 ; X64: # %bb.0: # %entry
4446 ; X64-NEXT: kmovw %edi, %k1
4447 ; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4448 ; X64-NEXT: vmovapd %zmm2, %zmm0
4451 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4452 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4453 %1 = bitcast i8 %__U to <8 x i1>
4454 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
; mask3 fmsub ps with rounding: same shape as the pd version but with a 16-wide mask and vfmsub231ps.
4458 define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4459 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
4460 ; X86: # %bb.0: # %entry
4461 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4462 ; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4463 ; X86-NEXT: vmovaps %zmm2, %zmm0
4466 ; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
4467 ; X64: # %bb.0: # %entry
4468 ; X64-NEXT: kmovw %edi, %k1
4469 ; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4470 ; X64-NEXT: vmovaps %zmm2, %zmm0
4473 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4474 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4475 %1 = bitcast i16 %__U to <16 x i1>
4476 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
; mask3 fmsub ps without rounding: llvm.fma on negated C, merged over %__C -> masked vfmsub231ps.
4480 define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4481 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
4482 ; X86: # %bb.0: # %entry
4483 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4484 ; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4485 ; X86-NEXT: vmovaps %zmm2, %zmm0
4488 ; X64-LABEL: test_mm512_mask3_fmsub_ps:
4489 ; X64: # %bb.0: # %entry
4490 ; X64-NEXT: kmovw %edi, %k1
4491 ; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4492 ; X64-NEXT: vmovaps %zmm2, %zmm0
4495 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4496 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4497 %1 = bitcast i16 %__U to <16 x i1>
4498 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4502 define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4503 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4504 ; X86: # %bb.0: # %entry
4505 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4506 ; X86-NEXT: kmovw %eax, %k1
4507 ; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4508 ; X86-NEXT: vmovapd %zmm2, %zmm0
4511 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4512 ; X64: # %bb.0: # %entry
4513 ; X64-NEXT: kmovw %edi, %k1
4514 ; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4515 ; X64-NEXT: vmovapd %zmm2, %zmm0
4518 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4519 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4520 %1 = bitcast i8 %__U to <8 x i1>
4521 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4525 define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4526 ; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
4527 ; X86: # %bb.0: # %entry
4528 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4529 ; X86-NEXT: kmovw %eax, %k1
4530 ; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4531 ; X86-NEXT: vmovapd %zmm2, %zmm0
4534 ; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
4535 ; X64: # %bb.0: # %entry
4536 ; X64-NEXT: kmovw %edi, %k1
4537 ; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4538 ; X64-NEXT: vmovapd %zmm2, %zmm0
4541 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4542 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4543 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4544 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4545 %3 = bitcast i8 %__U to <8 x i1>
4546 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
4550 define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4551 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4552 ; X86: # %bb.0: # %entry
4553 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4554 ; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4555 ; X86-NEXT: vmovaps %zmm2, %zmm0
4558 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4559 ; X64: # %bb.0: # %entry
4560 ; X64-NEXT: kmovw %edi, %k1
4561 ; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4562 ; X64-NEXT: vmovaps %zmm2, %zmm0
4565 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4566 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4567 %1 = bitcast i16 %__U to <16 x i1>
4568 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4572 define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4573 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
4574 ; X86: # %bb.0: # %entry
4575 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4576 ; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4577 ; X86-NEXT: vmovaps %zmm2, %zmm0
4580 ; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
4581 ; X64: # %bb.0: # %entry
4582 ; X64-NEXT: kmovw %edi, %k1
4583 ; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4584 ; X64-NEXT: vmovaps %zmm2, %zmm0
4587 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4588 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4589 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4590 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4591 %3 = bitcast i16 %__U to <16 x i1>
4592 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
4596 define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4597 ; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
4598 ; X86: # %bb.0: # %entry
4599 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4600 ; X86-NEXT: kmovw %eax, %k1
4601 ; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4604 ; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
4605 ; X64: # %bb.0: # %entry
4606 ; X64-NEXT: kmovw %edi, %k1
4607 ; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4610 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4611 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
4612 %1 = bitcast i8 %__U to <8 x i1>
4613 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4617 define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4618 ; X86-LABEL: test_mm512_mask_fnmadd_pd:
4619 ; X86: # %bb.0: # %entry
4620 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4621 ; X86-NEXT: kmovw %eax, %k1
4622 ; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4625 ; X64-LABEL: test_mm512_mask_fnmadd_pd:
4626 ; X64: # %bb.0: # %entry
4627 ; X64-NEXT: kmovw %edi, %k1
4628 ; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4631 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4632 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
4633 %1 = bitcast i8 %__U to <8 x i1>
4634 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4638 define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4639 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
4640 ; X86: # %bb.0: # %entry
4641 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4642 ; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4645 ; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
4646 ; X64: # %bb.0: # %entry
4647 ; X64-NEXT: kmovw %edi, %k1
4648 ; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4651 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4652 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
4653 %1 = bitcast i16 %__U to <16 x i1>
4654 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4658 define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4659 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
4660 ; X86: # %bb.0: # %entry
4661 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4662 ; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4665 ; X64-LABEL: test_mm512_mask_fnmadd_ps:
4666 ; X64: # %bb.0: # %entry
4667 ; X64-NEXT: kmovw %edi, %k1
4668 ; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4671 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4672 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
4673 %1 = bitcast i16 %__U to <16 x i1>
4674 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4678 define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4679 ; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
4680 ; X86: # %bb.0: # %entry
4681 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4682 ; X86-NEXT: kmovw %eax, %k1
4683 ; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4686 ; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
4687 ; X64: # %bb.0: # %entry
4688 ; X64-NEXT: kmovw %edi, %k1
4689 ; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4692 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4693 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4694 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4695 %1 = bitcast i8 %__U to <8 x i1>
4696 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4700 define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4701 ; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
4702 ; X86: # %bb.0: # %entry
4703 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4704 ; X86-NEXT: kmovw %eax, %k1
4705 ; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4706 ; X86-NEXT: vmovapd %zmm2, %zmm0
4709 ; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
4710 ; X64: # %bb.0: # %entry
4711 ; X64-NEXT: kmovw %edi, %k1
4712 ; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4713 ; X64-NEXT: vmovapd %zmm2, %zmm0
4716 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4717 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4718 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4719 %1 = bitcast i8 %__U to <8 x i1>
4720 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4724 define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4725 ; X86-LABEL: test_mm512_mask_fnmsub_pd:
4726 ; X86: # %bb.0: # %entry
4727 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4728 ; X86-NEXT: kmovw %eax, %k1
4729 ; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4732 ; X64-LABEL: test_mm512_mask_fnmsub_pd:
4733 ; X64: # %bb.0: # %entry
4734 ; X64-NEXT: kmovw %edi, %k1
4735 ; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4738 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4739 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4740 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4741 %1 = bitcast i8 %__U to <8 x i1>
4742 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4746 define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4747 ; X86-LABEL: test_mm512_mask3_fnmsub_pd:
4748 ; X86: # %bb.0: # %entry
4749 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4750 ; X86-NEXT: kmovw %eax, %k1
4751 ; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4752 ; X86-NEXT: vmovapd %zmm2, %zmm0
4755 ; X64-LABEL: test_mm512_mask3_fnmsub_pd:
4756 ; X64: # %bb.0: # %entry
4757 ; X64-NEXT: kmovw %edi, %k1
4758 ; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4759 ; X64-NEXT: vmovapd %zmm2, %zmm0
4762 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4763 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4764 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4765 %1 = bitcast i8 %__U to <8 x i1>
4766 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4770 define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4771 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
4772 ; X86: # %bb.0: # %entry
4773 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4774 ; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4777 ; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
4778 ; X64: # %bb.0: # %entry
4779 ; X64-NEXT: kmovw %edi, %k1
4780 ; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4783 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4784 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4785 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4786 %1 = bitcast i16 %__U to <16 x i1>
4787 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4791 define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4792 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
4793 ; X86: # %bb.0: # %entry
4794 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4795 ; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4796 ; X86-NEXT: vmovaps %zmm2, %zmm0
4799 ; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
4800 ; X64: # %bb.0: # %entry
4801 ; X64-NEXT: kmovw %edi, %k1
4802 ; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4803 ; X64-NEXT: vmovaps %zmm2, %zmm0
4806 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4807 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4808 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4809 %1 = bitcast i16 %__U to <16 x i1>
4810 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4814 define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4815 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
4816 ; X86: # %bb.0: # %entry
4817 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4818 ; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4821 ; X64-LABEL: test_mm512_mask_fnmsub_ps:
4822 ; X64: # %bb.0: # %entry
4823 ; X64-NEXT: kmovw %edi, %k1
4824 ; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4827 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4828 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4829 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4830 %1 = bitcast i16 %__U to <16 x i1>
4831 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4835 define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4836 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
4837 ; X86: # %bb.0: # %entry
4838 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4839 ; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4840 ; X86-NEXT: vmovaps %zmm2, %zmm0
4843 ; X64-LABEL: test_mm512_mask3_fnmsub_ps:
4844 ; X64: # %bb.0: # %entry
4845 ; X64-NEXT: kmovw %edi, %k1
4846 ; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4847 ; X64-NEXT: vmovaps %zmm2, %zmm0
4850 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4851 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4852 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4853 %1 = bitcast i16 %__U to <16 x i1>
4854 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4858 define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
4859 ; X86-LABEL: test_mm_mask_fmadd_ss:
4860 ; X86: # %bb.0: # %entry
4861 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4862 ; X86-NEXT: kmovw %eax, %k1
4863 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4866 ; X64-LABEL: test_mm_mask_fmadd_ss:
4867 ; X64: # %bb.0: # %entry
4868 ; X64-NEXT: kmovw %edi, %k1
4869 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4872 %0 = extractelement <4 x float> %__W, i64 0
4873 %1 = extractelement <4 x float> %__A, i64 0
4874 %2 = extractelement <4 x float> %__B, i64 0
4875 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
4877 %tobool.i = icmp eq i8 %4, 0
4878 %vecext1.i = extractelement <4 x float> %__W, i32 0
4879 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
4880 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
4881 ret <4 x float> %vecins.i
4884 define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
4885 ; X86-LABEL: test_mm_mask_fmadd_round_ss:
4886 ; X86: # %bb.0: # %entry
4887 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4888 ; X86-NEXT: kmovw %eax, %k1
4889 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4892 ; X64-LABEL: test_mm_mask_fmadd_round_ss:
4893 ; X64: # %bb.0: # %entry
4894 ; X64-NEXT: kmovw %edi, %k1
4895 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4898 %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %__A, <4 x float> %__B, i8 %__U, i32 4)
4902 declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1
4904 define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4905 ; X86-LABEL: test_mm_maskz_fmadd_ss:
4906 ; X86: # %bb.0: # %entry
4907 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4908 ; X86-NEXT: kmovw %eax, %k1
4909 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4912 ; X64-LABEL: test_mm_maskz_fmadd_ss:
4913 ; X64: # %bb.0: # %entry
4914 ; X64-NEXT: kmovw %edi, %k1
4915 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4918 %0 = extractelement <4 x float> %__A, i64 0
4919 %1 = extractelement <4 x float> %__B, i64 0
4920 %2 = extractelement <4 x float> %__C, i64 0
4921 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
4923 %tobool.i = icmp eq i8 %4, 0
4924 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
4925 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
4926 ret <4 x float> %vecins.i
4929 define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4930 ; X86-LABEL: test_mm_maskz_fmadd_round_ss:
4931 ; X86: # %bb.0: # %entry
4932 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4933 ; X86-NEXT: kmovw %eax, %k1
4934 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4937 ; X64-LABEL: test_mm_maskz_fmadd_round_ss:
4938 ; X64: # %bb.0: # %entry
4939 ; X64-NEXT: kmovw %edi, %k1
4940 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4943 %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 %__U, i32 4)
4947 declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1
4949 define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
4950 ; X86-LABEL: test_mm_mask3_fmadd_ss:
4951 ; X86: # %bb.0: # %entry
4952 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4953 ; X86-NEXT: kmovw %eax, %k1
4954 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
4955 ; X86-NEXT: vmovaps %xmm2, %xmm0
4958 ; X64-LABEL: test_mm_mask3_fmadd_ss:
4959 ; X64: # %bb.0: # %entry
4960 ; X64-NEXT: kmovw %edi, %k1
4961 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
4962 ; X64-NEXT: vmovaps %xmm2, %xmm0
4965 %0 = extractelement <4 x float> %__W, i64 0
4966 %1 = extractelement <4 x float> %__X, i64 0
4967 %2 = extractelement <4 x float> %__Y, i64 0
4968 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
4970 %tobool.i = icmp eq i8 %4, 0
4971 %vecext1.i = extractelement <4 x float> %__Y, i32 0
4972 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
4973 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
4974 ret <4 x float> %vecins.i
4977 define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
4978 ; X86-LABEL: test_mm_mask3_fmadd_round_ss:
4979 ; X86: # %bb.0: # %entry
4980 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4981 ; X86-NEXT: kmovw %eax, %k1
4982 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
4983 ; X86-NEXT: vmovaps %xmm2, %xmm0
4986 ; X64-LABEL: test_mm_mask3_fmadd_round_ss:
4987 ; X64: # %bb.0: # %entry
4988 ; X64-NEXT: kmovw %edi, %k1
4989 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
4990 ; X64-NEXT: vmovaps %xmm2, %xmm0
4993 %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 %__U, i32 4)
4997 declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1
4999 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5000 ; X86-LABEL: test_mm_mask_fmsub_ss:
5001 ; X86: # %bb.0: # %entry
5002 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5003 ; X86-NEXT: kmovw %eax, %k1
5004 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5007 ; X64-LABEL: test_mm_mask_fmsub_ss:
5008 ; X64: # %bb.0: # %entry
5009 ; X64-NEXT: kmovw %edi, %k1
5010 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5013 %0 = extractelement <4 x float> %__W, i64 0
5014 %1 = extractelement <4 x float> %__A, i64 0
5015 %.rhs.i = extractelement <4 x float> %__B, i64 0
5016 %2 = fsub float -0.000000e+00, %.rhs.i
5017 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5019 %tobool.i = icmp eq i8 %4, 0
5020 %vecext1.i = extractelement <4 x float> %__W, i32 0
5021 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5022 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5023 ret <4 x float> %vecins.i
5026 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5027 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
5028 ; X86: # %bb.0: # %entry
5029 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5030 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5031 ; X86-NEXT: vxorps %xmm3, %xmm2, %xmm2
5032 ; X86-NEXT: kmovw %eax, %k1
5033 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5036 ; X64-LABEL: test_mm_mask_fmsub_round_ss:
5037 ; X64: # %bb.0: # %entry
5038 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5039 ; X64-NEXT: vxorps %xmm3, %xmm2, %xmm2
5040 ; X64-NEXT: kmovw %edi, %k1
5041 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5044 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5045 %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %__A, <4 x float> %sub, i8 %__U, i32 4)
5049 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5050 ; X86-LABEL: test_mm_maskz_fmsub_ss:
5051 ; X86: # %bb.0: # %entry
5052 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5053 ; X86-NEXT: kmovw %eax, %k1
5054 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5057 ; X64-LABEL: test_mm_maskz_fmsub_ss:
5058 ; X64: # %bb.0: # %entry
5059 ; X64-NEXT: kmovw %edi, %k1
5060 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5063 %0 = extractelement <4 x float> %__A, i64 0
5064 %1 = extractelement <4 x float> %__B, i64 0
5065 %.rhs.i = extractelement <4 x float> %__C, i64 0
5066 %2 = fsub float -0.000000e+00, %.rhs.i
5067 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5069 %tobool.i = icmp eq i8 %4, 0
5070 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5071 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5072 ret <4 x float> %vecins.i
5075 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5076 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
5077 ; X86: # %bb.0: # %entry
5078 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5079 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5080 ; X86-NEXT: vxorps %xmm3, %xmm2, %xmm2
5081 ; X86-NEXT: kmovw %eax, %k1
5082 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5085 ; X64-LABEL: test_mm_maskz_fmsub_round_ss:
5086 ; X64: # %bb.0: # %entry
5087 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5088 ; X64-NEXT: vxorps %xmm3, %xmm2, %xmm2
5089 ; X64-NEXT: kmovw %edi, %k1
5090 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5093 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5094 %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub, i8 %__U, i32 4)
5098 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5099 ; X86-LABEL: test_mm_mask3_fmsub_ss:
5100 ; X86: # %bb.0: # %entry
5101 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5102 ; X86-NEXT: kmovw %eax, %k1
5103 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5104 ; X86-NEXT: vmovaps %xmm2, %xmm0
5107 ; X64-LABEL: test_mm_mask3_fmsub_ss:
5108 ; X64: # %bb.0: # %entry
5109 ; X64-NEXT: kmovw %edi, %k1
5110 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5111 ; X64-NEXT: vmovaps %xmm2, %xmm0
5114 %0 = extractelement <4 x float> %__W, i64 0
5115 %1 = extractelement <4 x float> %__X, i64 0
5116 %.rhs.i = extractelement <4 x float> %__Y, i64 0
5117 %2 = fsub float -0.000000e+00, %.rhs.i
5118 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5120 %tobool.i = icmp eq i8 %4, 0
5121 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5122 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5123 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5124 ret <4 x float> %vecins.i
5127 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5128 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
5129 ; X86: # %bb.0: # %entry
5130 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5131 ; X86-NEXT: kmovw %eax, %k1
5132 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5133 ; X86-NEXT: vmovaps %xmm2, %xmm0
5136 ; X64-LABEL: test_mm_mask3_fmsub_round_ss:
5137 ; X64: # %bb.0: # %entry
5138 ; X64-NEXT: kmovw %edi, %k1
5139 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5140 ; X64-NEXT: vmovaps %xmm2, %xmm0
5143 %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 %__U, i32 4)
5147 declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1
5149 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5150 ; X86-LABEL: test_mm_mask_fnmadd_ss:
5151 ; X86: # %bb.0: # %entry
5152 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5153 ; X86-NEXT: kmovw %eax, %k1
5154 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5157 ; X64-LABEL: test_mm_mask_fnmadd_ss:
5158 ; X64: # %bb.0: # %entry
5159 ; X64-NEXT: kmovw %edi, %k1
5160 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5163 %0 = extractelement <4 x float> %__W, i64 0
5164 %.rhs.i = extractelement <4 x float> %__A, i64 0
5165 %1 = fsub float -0.000000e+00, %.rhs.i
5166 %2 = extractelement <4 x float> %__B, i64 0
5167 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5169 %tobool.i = icmp eq i8 %4, 0
5170 %vecext1.i = extractelement <4 x float> %__W, i32 0
5171 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5172 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5173 ret <4 x float> %vecins.i
5176 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5177 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
5178 ; X86: # %bb.0: # %entry
5179 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5180 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5181 ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1
5182 ; X86-NEXT: kmovw %eax, %k1
5183 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5186 ; X64-LABEL: test_mm_mask_fnmadd_round_ss:
5187 ; X64: # %bb.0: # %entry
5188 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5189 ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1
5190 ; X64-NEXT: kmovw %edi, %k1
5191 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5194 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
5195 %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__B, i8 %__U, i32 4)
5199 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5200 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
5201 ; X86: # %bb.0: # %entry
5202 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5203 ; X86-NEXT: kmovw %eax, %k1
5204 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5207 ; X64-LABEL: test_mm_maskz_fnmadd_ss:
5208 ; X64: # %bb.0: # %entry
5209 ; X64-NEXT: kmovw %edi, %k1
5210 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5213 %0 = extractelement <4 x float> %__A, i64 0
5214 %.rhs.i = extractelement <4 x float> %__B, i64 0
5215 %1 = fsub float -0.000000e+00, %.rhs.i
5216 %2 = extractelement <4 x float> %__C, i64 0
5217 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5219 %tobool.i = icmp eq i8 %4, 0
5220 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5221 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5222 ret <4 x float> %vecins.i
5225 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5226 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
5227 ; X86: # %bb.0: # %entry
5228 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5229 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5230 ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1
5231 ; X86-NEXT: kmovw %eax, %k1
5232 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5235 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
5236 ; X64: # %bb.0: # %entry
5237 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5238 ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1
5239 ; X64-NEXT: kmovw %edi, %k1
5240 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5243 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5244 %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %sub, <4 x float> %__C, i8 %__U, i32 4)
5248 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5249 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
5250 ; X86: # %bb.0: # %entry
5251 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5252 ; X86-NEXT: kmovw %eax, %k1
5253 ; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5254 ; X86-NEXT: vmovaps %xmm2, %xmm0
5257 ; X64-LABEL: test_mm_mask3_fnmadd_ss:
5258 ; X64: # %bb.0: # %entry
5259 ; X64-NEXT: kmovw %edi, %k1
5260 ; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5261 ; X64-NEXT: vmovaps %xmm2, %xmm0
5264 %0 = extractelement <4 x float> %__W, i64 0
5265 %.rhs.i = extractelement <4 x float> %__X, i64 0
5266 %1 = fsub float -0.000000e+00, %.rhs.i
5267 %2 = extractelement <4 x float> %__Y, i64 0
5268 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5270 %tobool.i = icmp eq i8 %4, 0
5271 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5272 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5273 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5274 ret <4 x float> %vecins.i
5277 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5278 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
5279 ; X86: # %bb.0: # %entry
5280 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5281 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5282 ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1
5283 ; X86-NEXT: kmovw %eax, %k1
5284 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5285 ; X86-NEXT: vmovaps %xmm2, %xmm0
5288 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
5289 ; X64: # %bb.0: # %entry
5290 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5291 ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1
5292 ; X64-NEXT: kmovw %edi, %k1
5293 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5294 ; X64-NEXT: vmovaps %xmm2, %xmm0
5297 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__X
5298 %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__Y, i8 %__U, i32 4)
5302 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5303 ; X86-LABEL: test_mm_mask_fnmsub_ss:
5304 ; X86: # %bb.0: # %entry
5305 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5306 ; X86-NEXT: kmovw %eax, %k1
5307 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5310 ; X64-LABEL: test_mm_mask_fnmsub_ss:
5311 ; X64: # %bb.0: # %entry
5312 ; X64-NEXT: kmovw %edi, %k1
5313 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5316 %0 = extractelement <4 x float> %__W, i64 0
5317 %.rhs.i = extractelement <4 x float> %__A, i64 0
5318 %1 = fsub float -0.000000e+00, %.rhs.i
5319 %.rhs7.i = extractelement <4 x float> %__B, i64 0
5320 %2 = fsub float -0.000000e+00, %.rhs7.i
5321 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5323 %tobool.i = icmp eq i8 %4, 0
5324 %vecext2.i = extractelement <4 x float> %__W, i32 0
5325 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5326 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5327 ret <4 x float> %vecins.i
5330 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5331 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
5332 ; X86: # %bb.0: # %entry
5333 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5334 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5335 ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1
5336 ; X86-NEXT: vxorps %xmm3, %xmm2, %xmm2
5337 ; X86-NEXT: kmovw %eax, %k1
5338 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5341 ; X64-LABEL: test_mm_mask_fnmsub_round_ss:
5342 ; X64: # %bb.0: # %entry
5343 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5344 ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1
5345 ; X64-NEXT: vxorps %xmm3, %xmm2, %xmm2
5346 ; X64-NEXT: kmovw %edi, %k1
5347 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5350 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
5351 %sub1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5352 %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %sub1, i8 %__U, i32 4)
5356 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5357 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
5358 ; X86: # %bb.0: # %entry
5359 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5360 ; X86-NEXT: kmovw %eax, %k1
5361 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5364 ; X64-LABEL: test_mm_maskz_fnmsub_ss:
5365 ; X64: # %bb.0: # %entry
5366 ; X64-NEXT: kmovw %edi, %k1
5367 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5370 %0 = extractelement <4 x float> %__A, i64 0
5371 %.rhs.i = extractelement <4 x float> %__B, i64 0
5372 %1 = fsub float -0.000000e+00, %.rhs.i
5373 %.rhs5.i = extractelement <4 x float> %__C, i64 0
5374 %2 = fsub float -0.000000e+00, %.rhs5.i
5375 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5377 %tobool.i = icmp eq i8 %4, 0
5378 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5379 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5380 ret <4 x float> %vecins.i
5383 define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5384 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
5385 ; X86: # %bb.0: # %entry
5386 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5387 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5388 ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1
5389 ; X86-NEXT: vxorps %xmm3, %xmm2, %xmm2
5390 ; X86-NEXT: kmovw %eax, %k1
5391 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5394 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
5395 ; X64: # %bb.0: # %entry
5396 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5397 ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1
5398 ; X64-NEXT: vxorps %xmm3, %xmm2, %xmm2
5399 ; X64-NEXT: kmovw %edi, %k1
5400 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5403 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5404 %sub1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5405 %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %sub, <4 x float> %sub1, i8 %__U, i32 4)
5409 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5410 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
5411 ; X86: # %bb.0: # %entry
5412 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5413 ; X86-NEXT: kmovw %eax, %k1
5414 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5415 ; X86-NEXT: vmovaps %xmm2, %xmm0
5418 ; X64-LABEL: test_mm_mask3_fnmsub_ss:
5419 ; X64: # %bb.0: # %entry
5420 ; X64-NEXT: kmovw %edi, %k1
5421 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5422 ; X64-NEXT: vmovaps %xmm2, %xmm0
5425 %0 = extractelement <4 x float> %__W, i64 0
5426 %.rhs.i = extractelement <4 x float> %__X, i64 0
5427 %1 = fsub float -0.000000e+00, %.rhs.i
5428 %.rhs7.i = extractelement <4 x float> %__Y, i64 0
5429 %2 = fsub float -0.000000e+00, %.rhs7.i
5430 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5432 %tobool.i = icmp eq i8 %4, 0
5433 %vecext2.i = extractelement <4 x float> %__Y, i32 0
5434 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5435 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5436 ret <4 x float> %vecins.i
5439 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5440 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
5441 ; X86: # %bb.0: # %entry
5442 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5443 ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5444 ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1
5445 ; X86-NEXT: kmovw %eax, %k1
5446 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5447 ; X86-NEXT: vmovaps %xmm2, %xmm0
5450 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
5451 ; X64: # %bb.0: # %entry
5452 ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
5453 ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1
5454 ; X64-NEXT: kmovw %edi, %k1
5455 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5456 ; X64-NEXT: vmovaps %xmm2, %xmm0
5459 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__X
5460 %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__Y, i8 %__U, i32 4)
5464 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5465 ; X86-LABEL: test_mm_mask_fmadd_sd:
5466 ; X86: # %bb.0: # %entry
5467 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5468 ; X86-NEXT: kmovw %eax, %k1
5469 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5472 ; X64-LABEL: test_mm_mask_fmadd_sd:
5473 ; X64: # %bb.0: # %entry
5474 ; X64-NEXT: kmovw %edi, %k1
5475 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5478 %0 = extractelement <2 x double> %__W, i64 0
5479 %1 = extractelement <2 x double> %__A, i64 0
5480 %2 = extractelement <2 x double> %__B, i64 0
5481 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5483 %tobool.i = icmp eq i8 %4, 0
5484 %vecext1.i = extractelement <2 x double> %__W, i32 0
5485 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5486 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5487 ret <2 x double> %vecins.i
5490 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5491 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
5492 ; X86: # %bb.0: # %entry
5493 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5494 ; X86-NEXT: kmovw %eax, %k1
5495 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5498 ; X64-LABEL: test_mm_mask_fmadd_round_sd:
5499 ; X64: # %bb.0: # %entry
5500 ; X64-NEXT: kmovw %edi, %k1
5501 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5504 %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %__A, <2 x double> %__B, i8 %__U, i32 4)
5508 declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1
5510 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5511 ; X86-LABEL: test_mm_maskz_fmadd_sd:
5512 ; X86: # %bb.0: # %entry
5513 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5514 ; X86-NEXT: kmovw %eax, %k1
5515 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5518 ; X64-LABEL: test_mm_maskz_fmadd_sd:
5519 ; X64: # %bb.0: # %entry
5520 ; X64-NEXT: kmovw %edi, %k1
5521 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5524 %0 = extractelement <2 x double> %__A, i64 0
5525 %1 = extractelement <2 x double> %__B, i64 0
5526 %2 = extractelement <2 x double> %__C, i64 0
5527 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5529 %tobool.i = icmp eq i8 %4, 0
5530 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5531 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5532 ret <2 x double> %vecins.i
5535 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5536 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
5537 ; X86: # %bb.0: # %entry
5538 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5539 ; X86-NEXT: kmovw %eax, %k1
5540 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5543 ; X64-LABEL: test_mm_maskz_fmadd_round_sd:
5544 ; X64: # %bb.0: # %entry
5545 ; X64-NEXT: kmovw %edi, %k1
5546 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5549 %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 %__U, i32 4)
5553 declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1
5555 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5556 ; X86-LABEL: test_mm_mask3_fmadd_sd:
5557 ; X86: # %bb.0: # %entry
5558 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5559 ; X86-NEXT: kmovw %eax, %k1
5560 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5561 ; X86-NEXT: vmovapd %xmm2, %xmm0
5564 ; X64-LABEL: test_mm_mask3_fmadd_sd:
5565 ; X64: # %bb.0: # %entry
5566 ; X64-NEXT: kmovw %edi, %k1
5567 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5568 ; X64-NEXT: vmovapd %xmm2, %xmm0
5571 %0 = extractelement <2 x double> %__W, i64 0
5572 %1 = extractelement <2 x double> %__X, i64 0
5573 %2 = extractelement <2 x double> %__Y, i64 0
5574 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5576 %tobool.i = icmp eq i8 %4, 0
5577 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5578 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5579 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5580 ret <2 x double> %vecins.i
5583 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5584 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
5585 ; X86: # %bb.0: # %entry
5586 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5587 ; X86-NEXT: kmovw %eax, %k1
5588 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5589 ; X86-NEXT: vmovapd %xmm2, %xmm0
5592 ; X64-LABEL: test_mm_mask3_fmadd_round_sd:
5593 ; X64: # %bb.0: # %entry
5594 ; X64-NEXT: kmovw %edi, %k1
5595 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5596 ; X64-NEXT: vmovapd %xmm2, %xmm0
5599 %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 %__U, i32 4)
5603 declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1
5605 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5606 ; X86-LABEL: test_mm_mask_fmsub_sd:
5607 ; X86: # %bb.0: # %entry
5608 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5609 ; X86-NEXT: kmovw %eax, %k1
5610 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5613 ; X64-LABEL: test_mm_mask_fmsub_sd:
5614 ; X64: # %bb.0: # %entry
5615 ; X64-NEXT: kmovw %edi, %k1
5616 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5619 %0 = extractelement <2 x double> %__W, i64 0
5620 %1 = extractelement <2 x double> %__A, i64 0
5621 %.rhs.i = extractelement <2 x double> %__B, i64 0
5622 %2 = fsub double -0.000000e+00, %.rhs.i
5623 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5625 %tobool.i = icmp eq i8 %4, 0
5626 %vecext1.i = extractelement <2 x double> %__W, i32 0
5627 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5628 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5629 ret <2 x double> %vecins.i
5632 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5633 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
5634 ; X86: # %bb.0: # %entry
5635 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5636 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
5637 ; X86-NEXT: kmovw %eax, %k1
5638 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5641 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
5642 ; X64: # %bb.0: # %entry
5643 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2
5644 ; X64-NEXT: kmovw %edi, %k1
5645 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5648 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5649 %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %__A, <2 x double> %sub, i8 %__U, i32 4)
5653 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5654 ; X86-LABEL: test_mm_maskz_fmsub_sd:
5655 ; X86: # %bb.0: # %entry
5656 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5657 ; X86-NEXT: kmovw %eax, %k1
5658 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5661 ; X64-LABEL: test_mm_maskz_fmsub_sd:
5662 ; X64: # %bb.0: # %entry
5663 ; X64-NEXT: kmovw %edi, %k1
5664 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5667 %0 = extractelement <2 x double> %__A, i64 0
5668 %1 = extractelement <2 x double> %__B, i64 0
5669 %.rhs.i = extractelement <2 x double> %__C, i64 0
5670 %2 = fsub double -0.000000e+00, %.rhs.i
5671 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5673 %tobool.i = icmp eq i8 %4, 0
5674 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5675 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5676 ret <2 x double> %vecins.i
5679 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5680 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
5681 ; X86: # %bb.0: # %entry
5682 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5683 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
5684 ; X86-NEXT: kmovw %eax, %k1
5685 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5688 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
5689 ; X64: # %bb.0: # %entry
5690 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2
5691 ; X64-NEXT: kmovw %edi, %k1
5692 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5695 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5696 %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub, i8 %__U, i32 4)
5700 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5701 ; X86-LABEL: test_mm_mask3_fmsub_sd:
5702 ; X86: # %bb.0: # %entry
5703 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5704 ; X86-NEXT: kmovw %eax, %k1
5705 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5706 ; X86-NEXT: vmovapd %xmm2, %xmm0
5709 ; X64-LABEL: test_mm_mask3_fmsub_sd:
5710 ; X64: # %bb.0: # %entry
5711 ; X64-NEXT: kmovw %edi, %k1
5712 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5713 ; X64-NEXT: vmovapd %xmm2, %xmm0
5716 %0 = extractelement <2 x double> %__W, i64 0
5717 %1 = extractelement <2 x double> %__X, i64 0
5718 %.rhs.i = extractelement <2 x double> %__Y, i64 0
5719 %2 = fsub double -0.000000e+00, %.rhs.i
5720 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5722 %tobool.i = icmp eq i8 %4, 0
5723 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5724 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5725 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5726 ret <2 x double> %vecins.i
5729 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5730 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
5731 ; X86: # %bb.0: # %entry
5732 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5733 ; X86-NEXT: kmovw %eax, %k1
5734 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5735 ; X86-NEXT: vmovapd %xmm2, %xmm0
5738 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
5739 ; X64: # %bb.0: # %entry
5740 ; X64-NEXT: kmovw %edi, %k1
5741 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5742 ; X64-NEXT: vmovapd %xmm2, %xmm0
5745 %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 %__U, i32 4)
5749 declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1
5751 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5752 ; X86-LABEL: test_mm_mask_fnmadd_sd:
5753 ; X86: # %bb.0: # %entry
5754 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5755 ; X86-NEXT: kmovw %eax, %k1
5756 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5759 ; X64-LABEL: test_mm_mask_fnmadd_sd:
5760 ; X64: # %bb.0: # %entry
5761 ; X64-NEXT: kmovw %edi, %k1
5762 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5765 %0 = extractelement <2 x double> %__W, i64 0
5766 %.rhs.i = extractelement <2 x double> %__A, i64 0
5767 %1 = fsub double -0.000000e+00, %.rhs.i
5768 %2 = extractelement <2 x double> %__B, i64 0
5769 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5771 %tobool.i = icmp eq i8 %4, 0
5772 %vecext1.i = extractelement <2 x double> %__W, i32 0
5773 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5774 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5775 ret <2 x double> %vecins.i
5778 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5779 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
5780 ; X86: # %bb.0: # %entry
5781 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5782 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
5783 ; X86-NEXT: kmovw %eax, %k1
5784 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5787 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
5788 ; X64: # %bb.0: # %entry
5789 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
5790 ; X64-NEXT: kmovw %edi, %k1
5791 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5794 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
5795 %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__B, i8 %__U, i32 4)
5799 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5800 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
5801 ; X86: # %bb.0: # %entry
5802 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5803 ; X86-NEXT: kmovw %eax, %k1
5804 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5807 ; X64-LABEL: test_mm_maskz_fnmadd_sd:
5808 ; X64: # %bb.0: # %entry
5809 ; X64-NEXT: kmovw %edi, %k1
5810 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5813 %0 = extractelement <2 x double> %__A, i64 0
5814 %.rhs.i = extractelement <2 x double> %__B, i64 0
5815 %1 = fsub double -0.000000e+00, %.rhs.i
5816 %2 = extractelement <2 x double> %__C, i64 0
5817 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5819 %tobool.i = icmp eq i8 %4, 0
5820 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5821 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5822 ret <2 x double> %vecins.i
5825 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5826 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
5827 ; X86: # %bb.0: # %entry
5828 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5829 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
5830 ; X86-NEXT: kmovw %eax, %k1
5831 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5834 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
5835 ; X64: # %bb.0: # %entry
5836 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
5837 ; X64-NEXT: kmovw %edi, %k1
5838 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5841 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5842 %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %sub, <2 x double> %__C, i8 %__U, i32 4)
5846 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5847 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
5848 ; X86: # %bb.0: # %entry
5849 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5850 ; X86-NEXT: kmovw %eax, %k1
5851 ; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5852 ; X86-NEXT: vmovapd %xmm2, %xmm0
5855 ; X64-LABEL: test_mm_mask3_fnmadd_sd:
5856 ; X64: # %bb.0: # %entry
5857 ; X64-NEXT: kmovw %edi, %k1
5858 ; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5859 ; X64-NEXT: vmovapd %xmm2, %xmm0
5862 %0 = extractelement <2 x double> %__W, i64 0
5863 %.rhs.i = extractelement <2 x double> %__X, i64 0
5864 %1 = fsub double -0.000000e+00, %.rhs.i
5865 %2 = extractelement <2 x double> %__Y, i64 0
5866 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5868 %tobool.i = icmp eq i8 %4, 0
5869 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5870 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5871 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5872 ret <2 x double> %vecins.i
5875 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5876 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
5877 ; X86: # %bb.0: # %entry
5878 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5879 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
5880 ; X86-NEXT: kmovw %eax, %k1
5881 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5882 ; X86-NEXT: vmovapd %xmm2, %xmm0
5885 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
5886 ; X64: # %bb.0: # %entry
5887 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
5888 ; X64-NEXT: kmovw %edi, %k1
5889 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5890 ; X64-NEXT: vmovapd %xmm2, %xmm0
5893 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__X
5894 %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__Y, i8 %__U, i32 4)
5898 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5899 ; X86-LABEL: test_mm_mask_fnmsub_sd:
5900 ; X86: # %bb.0: # %entry
5901 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5902 ; X86-NEXT: kmovw %eax, %k1
5903 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5906 ; X64-LABEL: test_mm_mask_fnmsub_sd:
5907 ; X64: # %bb.0: # %entry
5908 ; X64-NEXT: kmovw %edi, %k1
5909 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5912 %0 = extractelement <2 x double> %__W, i64 0
5913 %.rhs.i = extractelement <2 x double> %__A, i64 0
5914 %1 = fsub double -0.000000e+00, %.rhs.i
5915 %.rhs7.i = extractelement <2 x double> %__B, i64 0
5916 %2 = fsub double -0.000000e+00, %.rhs7.i
5917 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5919 %tobool.i = icmp eq i8 %4, 0
5920 %vecext2.i = extractelement <2 x double> %__W, i32 0
5921 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
5922 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5923 ret <2 x double> %vecins.i
; Rounding variant: fnmsub expressed as mask.vfmadd.sd on sign-negated %__A and
; %__B (rounding-control arg i32 4). Expects explicit vxorpd sign flips feeding
; a merge-masked vfmadd213sd.
5926 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5927 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
5928 ; X86: # %bb.0: # %entry
5929 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5930 ; X86-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
5931 ; X86-NEXT: vxorpd %xmm3, %xmm1, %xmm1
5932 ; X86-NEXT: vxorpd %xmm3, %xmm2, %xmm2
5933 ; X86-NEXT: kmovw %eax, %k1
5934 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5937 ; X64-LABEL: test_mm_mask_fnmsub_round_sd:
5938 ; X64: # %bb.0: # %entry
5939 ; X64-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
5940 ; X64-NEXT: vxorpd %xmm3, %xmm1, %xmm1
5941 ; X64-NEXT: vxorpd %xmm3, %xmm2, %xmm2
5942 ; X64-NEXT: kmovw %edi, %k1
5943 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5946 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
5947 %sub1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5948 %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %sub1, i8 %__U, i32 4)
; Zero-masked scalar fnmsub: like the merge-masked form but selecting 0.0 when
; the mask bit is clear (%4 is defined on an elided line of this file).
5952 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5953 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
5954 ; X86: # %bb.0: # %entry
5955 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5956 ; X86-NEXT: kmovw %eax, %k1
5957 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5960 ; X64-LABEL: test_mm_maskz_fnmsub_sd:
5961 ; X64: # %bb.0: # %entry
5962 ; X64-NEXT: kmovw %edi, %k1
5963 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5966 %0 = extractelement <2 x double> %__A, i64 0
5967 %.rhs.i = extractelement <2 x double> %__B, i64 0
5968 %1 = fsub double -0.000000e+00, %.rhs.i
5969 %.rhs5.i = extractelement <2 x double> %__C, i64 0
5970 %2 = fsub double -0.000000e+00, %.rhs5.i
5971 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5973 %tobool.i = icmp eq i8 %4, 0
5974 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5975 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5976 ret <2 x double> %vecins.i
; Zero-masked rounding variant via maskz.vfmadd.sd on sign-negated %__B/%__C.
; Expects vxorpd sign flips feeding a zero-masked vfmadd213sd.
5979 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5980 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
5981 ; X86: # %bb.0: # %entry
5982 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5983 ; X86-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
5984 ; X86-NEXT: vxorpd %xmm3, %xmm1, %xmm1
5985 ; X86-NEXT: vxorpd %xmm3, %xmm2, %xmm2
5986 ; X86-NEXT: kmovw %eax, %k1
5987 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5990 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
5991 ; X64: # %bb.0: # %entry
5992 ; X64-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
5993 ; X64-NEXT: vxorpd %xmm3, %xmm1, %xmm1
5994 ; X64-NEXT: vxorpd %xmm3, %xmm2, %xmm2
5995 ; X64-NEXT: kmovw %edi, %k1
5996 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5999 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
6000 %sub1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
6001 %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %sub, <2 x double> %sub1, i8 %__U, i32 4)
; mask3 variant: result is merged into the addend operand %__Y, so codegen
; targets xmm2 (vfnmsub231sd) and copies it to the return register.
; %4 (mask bit test value) is defined on an elided line.
6005 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6006 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
6007 ; X86: # %bb.0: # %entry
6008 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6009 ; X86-NEXT: kmovw %eax, %k1
6010 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
6011 ; X86-NEXT: vmovapd %xmm2, %xmm0
6014 ; X64-LABEL: test_mm_mask3_fnmsub_sd:
6015 ; X64: # %bb.0: # %entry
6016 ; X64-NEXT: kmovw %edi, %k1
6017 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
6018 ; X64-NEXT: vmovapd %xmm2, %xmm0
6021 %0 = extractelement <2 x double> %__W, i64 0
6022 %.rhs.i = extractelement <2 x double> %__X, i64 0
6023 %1 = fsub double -0.000000e+00, %.rhs.i
6024 %.rhs7.i = extractelement <2 x double> %__Y, i64 0
6025 %2 = fsub double -0.000000e+00, %.rhs7.i
6026 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6028 %tobool.i = icmp eq i8 %4, 0
6029 %vecext2.i = extractelement <2 x double> %__Y, i32 0
6030 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6031 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6032 ret <2 x double> %vecins.i
; mask3 rounding variant via mask3.vfmsub.sd on sign-negated %__X: only the
; multiplier is xor'ed, then a merge-masked vfmsub231sd into xmm2.
6035 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6036 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
6037 ; X86: # %bb.0: # %entry
6038 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6039 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
6040 ; X86-NEXT: kmovw %eax, %k1
6041 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
6042 ; X86-NEXT: vmovapd %xmm2, %xmm0
6045 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
6046 ; X64: # %bb.0: # %entry
6047 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
6048 ; X64-NEXT: kmovw %edi, %k1
6049 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
6050 ; X64-NEXT: vmovapd %xmm2, %xmm0
6053 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__X
6054 %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__Y, i8 %__U, i32 4)
; Merge-masked expand-load of 8 x i64 via llvm.masked.expandload; passthru is
; %__W. Expects a single {%k1} vpexpandq from memory.
6058 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6059 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
6060 ; X86: # %bb.0: # %entry
6061 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6062 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6063 ; X86-NEXT: kmovw %ecx, %k1
6064 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
6067 ; X64-LABEL: test_mm512_mask_expandloadu_epi64:
6068 ; X64: # %bb.0: # %entry
6069 ; X64-NEXT: kmovw %edi, %k1
6070 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
6073 %0 = bitcast i8* %__P to i64*
6074 %1 = bitcast i8 %__U to <8 x i1>
6075 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
; Zero-masked expand-load of 8 x i64 (zeroinitializer passthru). Expects
; vpexpandq with {%k1} {z}.
6079 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6080 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
6081 ; X86: # %bb.0: # %entry
6082 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6083 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6084 ; X86-NEXT: kmovw %ecx, %k1
6085 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
6088 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
6089 ; X64: # %bb.0: # %entry
6090 ; X64-NEXT: kmovw %edi, %k1
6091 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
6094 %0 = bitcast i8* %__P to i64*
6095 %1 = bitcast i8 %__U to <8 x i1>
6096 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
; Merge-masked expand-load of 8 x double. Expects a {%k1} vexpandpd.
6100 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
6101 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
6102 ; X86: # %bb.0: # %entry
6103 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6104 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6105 ; X86-NEXT: kmovw %ecx, %k1
6106 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
6109 ; X64-LABEL: test_mm512_mask_expandloadu_pd:
6110 ; X64: # %bb.0: # %entry
6111 ; X64-NEXT: kmovw %edi, %k1
6112 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
6115 %0 = bitcast i8* %__P to double*
6116 %1 = bitcast i8 %__U to <8 x i1>
6117 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
; Zero-masked expand-load of 8 x double. Expects vexpandpd {%k1} {z}.
6121 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
6122 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
6123 ; X86: # %bb.0: # %entry
6124 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6125 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6126 ; X86-NEXT: kmovw %ecx, %k1
6127 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
6130 ; X64-LABEL: test_mm512_maskz_expandloadu_pd:
6131 ; X64: # %bb.0: # %entry
6132 ; X64-NEXT: kmovw %edi, %k1
6133 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
6136 %0 = bitcast i8* %__P to double*
6137 %1 = bitcast i8 %__U to <8 x i1>
6138 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
; Merge-masked expand-load of 16 x i32 (i16 mask), bitcast to/from <8 x i64>
; at the intrinsic boundary. Expects a {%k1} vpexpandd.
6142 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
6143 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
6144 ; X86: # %bb.0: # %entry
6145 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6146 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6147 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
6150 ; X64-LABEL: test_mm512_mask_expandloadu_epi32:
6151 ; X64: # %bb.0: # %entry
6152 ; X64-NEXT: kmovw %edi, %k1
6153 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
6156 %0 = bitcast <8 x i64> %__W to <16 x i32>
6157 %1 = bitcast i8* %__P to i32*
6158 %2 = bitcast i16 %__U to <16 x i1>
6159 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
6160 %4 = bitcast <16 x i32> %3 to <8 x i64>
; Zero-masked expand-load of 16 x i32. Expects vpexpandd {%k1} {z}.
6164 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
6165 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
6166 ; X86: # %bb.0: # %entry
6167 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6168 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6169 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
6172 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
6173 ; X64: # %bb.0: # %entry
6174 ; X64-NEXT: kmovw %edi, %k1
6175 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
6178 %0 = bitcast i8* %__P to i32*
6179 %1 = bitcast i16 %__U to <16 x i1>
6180 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
6181 %3 = bitcast <16 x i32> %2 to <8 x i64>
; Merge-masked expand-load of 16 x float. Expects a {%k1} vexpandps.
6185 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
6186 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
6187 ; X86: # %bb.0: # %entry
6188 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6189 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6190 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
6193 ; X64-LABEL: test_mm512_mask_expandloadu_ps:
6194 ; X64: # %bb.0: # %entry
6195 ; X64-NEXT: kmovw %edi, %k1
6196 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
6199 %0 = bitcast i8* %__P to float*
6200 %1 = bitcast i16 %__U to <16 x i1>
6201 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
; Zero-masked expand-load of 16 x float. Expects vexpandps {%k1} {z}.
6205 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
6206 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
6207 ; X86: # %bb.0: # %entry
6208 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6209 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6210 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
6213 ; X64-LABEL: test_mm512_maskz_expandloadu_ps:
6214 ; X64: # %bb.0: # %entry
6215 ; X64-NEXT: kmovw %edi, %k1
6216 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
6219 %0 = bitcast i8* %__P to float*
6220 %1 = bitcast i16 %__U to <16 x i1>
6221 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
; Masked compress-store of 8 x double via llvm.masked.compressstore. Expects a
; {%k1} vcompresspd to memory followed by vzeroupper.
6225 define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
6226 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
6227 ; X86: # %bb.0: # %entry
6228 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6229 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6230 ; X86-NEXT: kmovw %eax, %k1
6231 ; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
6232 ; X86-NEXT: vzeroupper
6235 ; X64-LABEL: test_mm512_mask_compressstoreu_pd:
6236 ; X64: # %bb.0: # %entry
6237 ; X64-NEXT: kmovw %esi, %k1
6238 ; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
6239 ; X64-NEXT: vzeroupper
6242 %0 = bitcast i8* %__P to double*
6243 %1 = bitcast i8 %__U to <8 x i1>
6244 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
; Masked compress-store of 8 x i64. Expects a {%k1} vpcompressq to memory.
6248 define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
6249 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
6250 ; X86: # %bb.0: # %entry
6251 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6252 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6253 ; X86-NEXT: kmovw %eax, %k1
6254 ; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
6255 ; X86-NEXT: vzeroupper
6258 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
6259 ; X64: # %bb.0: # %entry
6260 ; X64-NEXT: kmovw %esi, %k1
6261 ; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
6262 ; X64-NEXT: vzeroupper
6265 %0 = bitcast i8* %__P to i64*
6266 %1 = bitcast i8 %__U to <8 x i1>
6267 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
; Masked compress-store of 16 x float (i16 mask). Expects a {%k1} vcompressps.
6271 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
6272 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
6273 ; X86: # %bb.0: # %entry
6274 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6275 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6276 ; X86-NEXT: vcompressps %zmm0, (%eax) {%k1}
6277 ; X86-NEXT: vzeroupper
6280 ; X64-LABEL: test_mm512_mask_compressstoreu_ps:
6281 ; X64: # %bb.0: # %entry
6282 ; X64-NEXT: kmovw %esi, %k1
6283 ; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
6284 ; X64-NEXT: vzeroupper
6287 %0 = bitcast i8* %__P to float*
6288 %1 = bitcast i16 %__U to <16 x i1>
6289 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
; Masked compress-store of 16 x i32 (input arrives as <8 x i64> and is bitcast
; first). Expects a {%k1} vpcompressd.
6293 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
6294 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
6295 ; X86: # %bb.0: # %entry
6296 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6297 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6298 ; X86-NEXT: vpcompressd %zmm0, (%eax) {%k1}
6299 ; X86-NEXT: vzeroupper
6302 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
6303 ; X64: # %bb.0: # %entry
6304 ; X64-NEXT: kmovw %esi, %k1
6305 ; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
6306 ; X64-NEXT: vzeroupper
6309 %0 = bitcast <8 x i64> %__A to <16 x i32>
6310 %1 = bitcast i8* %__P to i32*
6311 %2 = bitcast i16 %__U to <16 x i1>
6312 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
; Horizontal add-reduction of 8 x i64 by shuffle halving (512->256->128->lane
; swap). On X86 the i64 result is returned split across eax/edx.
6316 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
6317 ; X86-LABEL: test_mm512_reduce_add_epi64:
6318 ; X86: # %bb.0: # %entry
6319 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6320 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6321 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6322 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6323 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6324 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6325 ; X86-NEXT: vmovd %xmm0, %eax
6326 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6327 ; X86-NEXT: vzeroupper
6330 ; X64-LABEL: test_mm512_reduce_add_epi64:
6331 ; X64: # %bb.0: # %entry
6332 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6333 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6334 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6335 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6336 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6337 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6338 ; X64-NEXT: vmovq %xmm0, %rax
6339 ; X64-NEXT: vzeroupper
6342 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6343 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6344 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6345 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6346 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6347 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6348 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6349 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6350 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
; Horizontal mul-reduction of 8 x i64 by shuffle halving. Each 64-bit multiply
; is legalized to the pmuludq/shift/add cross-product sequence, repeated at
; ymm, xmm, and lane-swap widths.
6354 define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
6355 ; X86-LABEL: test_mm512_reduce_mul_epi64:
6356 ; X86: # %bb.0: # %entry
6357 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6358 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
6359 ; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6360 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
6361 ; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6362 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6363 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6364 ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6365 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6366 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6367 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6368 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6369 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6370 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6371 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6372 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6373 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6374 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6375 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6376 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6377 ; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
6378 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6379 ; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
6380 ; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2
6381 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6382 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6383 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6384 ; X86-NEXT: vmovd %xmm0, %eax
6385 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6386 ; X86-NEXT: vzeroupper
6389 ; X64-LABEL: test_mm512_reduce_mul_epi64:
6390 ; X64: # %bb.0: # %entry
6391 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6392 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
6393 ; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6394 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
6395 ; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6396 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6397 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6398 ; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6399 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6400 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6401 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6402 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6403 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6404 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6405 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6406 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6407 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6408 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6409 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6410 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6411 ; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
6412 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6413 ; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
6414 ; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
6415 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6416 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6417 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6418 ; X64-NEXT: vmovq %xmm0, %rax
6419 ; X64-NEXT: vzeroupper
6422 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6423 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6424 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6425 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6426 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6427 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6428 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6429 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6430 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
; Horizontal or-reduction of 8 x i64 by shuffle halving; same tree shape as
; the add reduction, using vpor.
6434 define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
6435 ; X86-LABEL: test_mm512_reduce_or_epi64:
6436 ; X86: # %bb.0: # %entry
6437 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6438 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6439 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6440 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6441 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6442 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6443 ; X86-NEXT: vmovd %xmm0, %eax
6444 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6445 ; X86-NEXT: vzeroupper
6448 ; X64-LABEL: test_mm512_reduce_or_epi64:
6449 ; X64: # %bb.0: # %entry
6450 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6451 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6452 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6453 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6454 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6455 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6456 ; X64-NEXT: vmovq %xmm0, %rax
6457 ; X64-NEXT: vzeroupper
6460 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6461 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6462 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6463 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6464 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6465 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
6466 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6467 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6468 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
; Horizontal and-reduction of 8 x i64 by shuffle halving, using vpand.
6472 define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
6473 ; X86-LABEL: test_mm512_reduce_and_epi64:
6474 ; X86: # %bb.0: # %entry
6475 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6476 ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
6477 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6478 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
6479 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6480 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6481 ; X86-NEXT: vmovd %xmm0, %eax
6482 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6483 ; X86-NEXT: vzeroupper
6486 ; X64-LABEL: test_mm512_reduce_and_epi64:
6487 ; X64: # %bb.0: # %entry
6488 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6489 ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
6490 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6491 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
6492 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6493 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6494 ; X64-NEXT: vmovq %xmm0, %rax
6495 ; X64-NEXT: vzeroupper
6498 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6499 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6500 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
6501 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6502 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6503 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
6504 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6505 %and7.i = and <2 x i64> %shuffle6.i, %and4.i
6506 %vecext.i = extractelement <2 x i64> %and7.i, i32 0
; Masked add-reduction: inactive lanes are first zeroed (the identity for add,
; via a zero-masked vmovdqa64), then the same shuffle-halving add tree runs.
6510 define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6511 ; X86-LABEL: test_mm512_mask_reduce_add_epi64:
6512 ; X86: # %bb.0: # %entry
6513 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6514 ; X86-NEXT: kmovw %eax, %k1
6515 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6516 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6517 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6518 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6519 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6520 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6521 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6522 ; X86-NEXT: vmovd %xmm0, %eax
6523 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6524 ; X86-NEXT: vzeroupper
6527 ; X64-LABEL: test_mm512_mask_reduce_add_epi64:
6528 ; X64: # %bb.0: # %entry
6529 ; X64-NEXT: kmovw %edi, %k1
6530 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6531 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6532 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6533 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6534 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6535 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6536 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6537 ; X64-NEXT: vmovq %xmm0, %rax
6538 ; X64-NEXT: vzeroupper
6541 %0 = bitcast i8 %__M to <8 x i1>
6542 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
6543 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6544 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6545 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6546 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6547 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6548 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6549 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6550 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6551 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
; Masked mul-reduction: inactive lanes are replaced with 1 (the multiplicative
; identity) by a merge-masked move over an all-ones-per-lane constant, then the
; pmuludq-based 64-bit multiply tree runs. Note the constant is materialized
; differently per target (vmovdqa64 of a 32-bit pattern on X86, vpbroadcastq
; on X64).
6555 define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6556 ; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
6557 ; X86: # %bb.0: # %entry
6558 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6559 ; X86-NEXT: kmovw %eax, %k1
6560 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
6561 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6562 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6563 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
6564 ; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
6565 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
6566 ; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
6567 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6568 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6569 ; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
6570 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6571 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6572 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6573 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6574 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6575 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6576 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6577 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6578 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6579 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6580 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6581 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6582 ; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
6583 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6584 ; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
6585 ; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2
6586 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6587 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6588 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6589 ; X86-NEXT: vmovd %xmm0, %eax
6590 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6591 ; X86-NEXT: vzeroupper
6594 ; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
6595 ; X64: # %bb.0: # %entry
6596 ; X64-NEXT: kmovw %edi, %k1
6597 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
6598 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6599 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6600 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
6601 ; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
6602 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
6603 ; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
6604 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6605 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6606 ; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
6607 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6608 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6609 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6610 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6611 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6612 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6613 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6614 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6615 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6616 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6617 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6618 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6619 ; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
6620 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6621 ; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
6622 ; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
6623 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6624 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6625 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6626 ; X64-NEXT: vmovq %xmm0, %rax
6627 ; X64-NEXT: vzeroupper
6630 %0 = bitcast i8 %__M to <8 x i1>
6631 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
6632 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6633 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6634 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6635 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6636 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6637 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6638 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6639 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6640 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
; Masked and-reduction: inactive lanes become all-ones (the identity for and,
; materialized with vpternlogd $255), then the shuffle-halving vpand tree runs.
6644 define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6645 ; X86-LABEL: test_mm512_mask_reduce_and_epi64:
6646 ; X86: # %bb.0: # %entry
6647 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6648 ; X86-NEXT: kmovw %eax, %k1
6649 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
6650 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6651 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6652 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
6653 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6654 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
6655 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6656 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6657 ; X86-NEXT: vmovd %xmm0, %eax
6658 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6659 ; X86-NEXT: vzeroupper
6662 ; X64-LABEL: test_mm512_mask_reduce_and_epi64:
6663 ; X64: # %bb.0: # %entry
6664 ; X64-NEXT: kmovw %edi, %k1
6665 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
6666 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6667 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6668 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
6669 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6670 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
6671 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6672 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6673 ; X64-NEXT: vmovq %xmm0, %rax
6674 ; X64-NEXT: vzeroupper
6677 %0 = bitcast i8 %__M to <8 x i1>
6678 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
6679 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6680 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6681 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
6682 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6683 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6684 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
6685 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6686 %and7.i = and <2 x i64> %shuffle6.i, %and4.i
6687 %vecext.i = extractelement <2 x i64> %and7.i, i32 0
; Masked or-reduction: inactive lanes are zeroed (the identity for or), then
; the shuffle-halving vpor tree runs.
6691 define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6692 ; X86-LABEL: test_mm512_mask_reduce_or_epi64:
6693 ; X86: # %bb.0: # %entry
6694 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6695 ; X86-NEXT: kmovw %eax, %k1
6696 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6697 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6698 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6699 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6700 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6701 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6702 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6703 ; X86-NEXT: vmovd %xmm0, %eax
6704 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6705 ; X86-NEXT: vzeroupper
6708 ; X64-LABEL: test_mm512_mask_reduce_or_epi64:
6709 ; X64: # %bb.0: # %entry
6710 ; X64-NEXT: kmovw %edi, %k1
6711 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6712 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6713 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6714 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6715 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6716 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6717 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6718 ; X64-NEXT: vmovq %xmm0, %rax
6719 ; X64-NEXT: vzeroupper
6722 %0 = bitcast i8 %__M to <8 x i1>
6723 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
6724 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6725 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6726 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6727 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6728 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6729 %or4.i = or <2 x i64> %shuffle6.i, %or4.i
6730 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6731 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6732 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
6736 define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
6737 ; X86-LABEL: test_mm512_reduce_add_epi32:
6738 ; X86: # %bb.0: # %entry
6739 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6740 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
6741 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6742 ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
6743 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6744 ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
6745 ; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
6746 ; X86-NEXT: vmovd %xmm0, %eax
6747 ; X86-NEXT: vzeroupper
6750 ; X64-LABEL: test_mm512_reduce_add_epi32:
6751 ; X64: # %bb.0: # %entry
6752 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6753 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
6754 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6755 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
6756 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6757 ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
6758 ; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
6759 ; X64-NEXT: vmovq %xmm0, %rax
6760 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
6761 ; X64-NEXT: vzeroupper
6764 %0 = bitcast <8 x i64> %__W to <16 x i32>
6765 %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
6766 %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6767 %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
6768 %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6769 %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6770 %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
6771 %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
6772 %add7.i = add <4 x i32> %shuffle6.i, %add4.i
6773 %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6774 %add10.i = add <4 x i32> %shuffle9.i, %add7.i
6775 %1 = bitcast <4 x i32> %add10.i to <2 x i64>
6776 %vecext.i = extractelement <2 x i64> %1, i32 0
6777 %conv.i = trunc i64 %vecext.i to i32
6781 define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
6782 ; X86-LABEL: test_mm512_reduce_mul_epi32:
6783 ; X86: # %bb.0: # %entry
6784 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6785 ; X86-NEXT: vpmulld %ymm1, %ymm0, %ymm0
6786 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6787 ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
6788 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6789 ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6790 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6791 ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6792 ; X86-NEXT: vmovd %xmm0, %eax
6793 ; X86-NEXT: vzeroupper
6796 ; X64-LABEL: test_mm512_reduce_mul_epi32:
6797 ; X64: # %bb.0: # %entry
6798 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6799 ; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
6800 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6801 ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
6802 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6803 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6804 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6805 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6806 ; X64-NEXT: vmovq %xmm0, %rax
6807 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
6808 ; X64-NEXT: vzeroupper
6811 %0 = bitcast <8 x i64> %__W to <16 x i32>
6812 %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
6813 %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6814 %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
6815 %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6816 %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6817 %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
6818 %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
6819 %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
6820 %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6821 %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
6822 %1 = bitcast <4 x i32> %mul10.i to <2 x i64>
6823 %vecext.i = extractelement <2 x i64> %1, i32 0
6824 %conv.i = trunc i64 %vecext.i to i32
6828 define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
6829 ; X86-LABEL: test_mm512_reduce_or_epi32:
6830 ; X86: # %bb.0: # %entry
6831 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6832 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6833 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6834 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6835 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6836 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6837 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6838 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6839 ; X86-NEXT: vmovd %xmm0, %eax
6840 ; X86-NEXT: vzeroupper
6843 ; X64-LABEL: test_mm512_reduce_or_epi32:
6844 ; X64: # %bb.0: # %entry
6845 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6846 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6847 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6848 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6849 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6850 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6851 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6852 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6853 ; X64-NEXT: vmovq %xmm0, %rax
6854 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
6855 ; X64-NEXT: vzeroupper
6858 %0 = bitcast <8 x i64> %__W to <16 x i32>
6859 %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
6860 %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6861 %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
6862 %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6863 %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6864 %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
6865 %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
6866 %or7.i = or <4 x i32> %shuffle6.i, %or4.i
6867 %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6868 %or10.i = or <4 x i32> %shuffle9.i, %or7.i
6869 %1 = bitcast <4 x i32> %or10.i to <2 x i64>
6870 %vecext.i = extractelement <2 x i64> %1, i32 0
6871 %conv.i = trunc i64 %vecext.i to i32
6875 define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
6876 ; X86-LABEL: test_mm512_reduce_and_epi32:
6877 ; X86: # %bb.0: # %entry
6878 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6879 ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
6880 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6881 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
6882 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6883 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6884 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6885 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6886 ; X86-NEXT: vmovd %xmm0, %eax
6887 ; X86-NEXT: vzeroupper
6890 ; X64-LABEL: test_mm512_reduce_and_epi32:
6891 ; X64: # %bb.0: # %entry
6892 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6893 ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
6894 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6895 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
6896 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6897 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6898 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6899 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6900 ; X64-NEXT: vmovq %xmm0, %rax
6901 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
6902 ; X64-NEXT: vzeroupper
6905 %0 = bitcast <8 x i64> %__W to <16 x i32>
6906 %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
6907 %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6908 %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
6909 %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6910 %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6911 %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
6912 %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
6913 %and7.i = and <4 x i32> %shuffle6.i, %and4.i
6914 %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6915 %and10.i = and <4 x i32> %shuffle9.i, %and7.i
6916 %1 = bitcast <4 x i32> %and10.i to <2 x i64>
6917 %vecext.i = extractelement <2 x i64> %1, i32 0
6918 %conv.i = trunc i64 %vecext.i to i32
6922 define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
6923 ; X86-LABEL: test_mm512_mask_reduce_add_epi32:
6924 ; X86: # %bb.0: # %entry
6925 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6926 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
6927 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6928 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
6929 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6930 ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
6931 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6932 ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
6933 ; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
6934 ; X86-NEXT: vmovd %xmm0, %eax
6935 ; X86-NEXT: vzeroupper
6938 ; X64-LABEL: test_mm512_mask_reduce_add_epi32:
6939 ; X64: # %bb.0: # %entry
6940 ; X64-NEXT: kmovw %edi, %k1
6941 ; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
6942 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6943 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
6944 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6945 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
6946 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6947 ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
6948 ; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
6949 ; X64-NEXT: vmovq %xmm0, %rax
6950 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
6951 ; X64-NEXT: vzeroupper
6954 %0 = bitcast <8 x i64> %__W to <16 x i32>
6955 %1 = bitcast i16 %__M to <16 x i1>
6956 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
6957 %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
6958 %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6959 %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
6960 %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6961 %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6962 %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
6963 %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
6964 %add7.i = add <4 x i32> %shuffle6.i, %add4.i
6965 %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6966 %add10.i = add <4 x i32> %shuffle9.i, %add7.i
6967 %3 = bitcast <4 x i32> %add10.i to <2 x i64>
6968 %vecext.i = extractelement <2 x i64> %3, i32 0
6969 %conv.i = trunc i64 %vecext.i to i32
6973 define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
6974 ; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
6975 ; X86: # %bb.0: # %entry
6976 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6977 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
6978 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
6979 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6980 ; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
6981 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6982 ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
6983 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6984 ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6985 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6986 ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6987 ; X86-NEXT: vmovd %xmm0, %eax
6988 ; X86-NEXT: vzeroupper
6991 ; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
6992 ; X64: # %bb.0: # %entry
6993 ; X64-NEXT: kmovw %edi, %k1
6994 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
6995 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
6996 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6997 ; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
6998 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6999 ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
7000 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7001 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
7002 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7003 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
7004 ; X64-NEXT: vmovq %xmm0, %rax
7005 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
7006 ; X64-NEXT: vzeroupper
7009 %0 = bitcast <8 x i64> %__W to <16 x i32>
7010 %1 = bitcast i16 %__M to <16 x i1>
7011 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
7012 %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7013 %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7014 %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
7015 %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7016 %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7017 %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
7018 %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7019 %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
7020 %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7021 %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
7022 %3 = bitcast <4 x i32> %mul10.i to <2 x i64>
7023 %vecext.i = extractelement <2 x i64> %3, i32 0
7024 %conv.i = trunc i64 %vecext.i to i32
7028 define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7029 ; X86-LABEL: test_mm512_mask_reduce_and_epi32:
7030 ; X86: # %bb.0: # %entry
7031 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7032 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
7033 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7034 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7035 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
7036 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
7037 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
7038 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7039 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
7040 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7041 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
7042 ; X86-NEXT: vmovd %xmm0, %eax
7043 ; X86-NEXT: vzeroupper
7046 ; X64-LABEL: test_mm512_mask_reduce_and_epi32:
7047 ; X64: # %bb.0: # %entry
7048 ; X64-NEXT: kmovw %edi, %k1
7049 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
7050 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7051 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7052 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
7053 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
7054 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
7055 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7056 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
7057 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7058 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
7059 ; X64-NEXT: vmovq %xmm0, %rax
7060 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
7061 ; X64-NEXT: vzeroupper
7064 %0 = bitcast <8 x i64> %__W to <16 x i32>
7065 %1 = bitcast i16 %__M to <16 x i1>
7066 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
7067 %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7068 %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7069 %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
7070 %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7071 %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7072 %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
7073 %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7074 %and7.i = and <4 x i32> %shuffle6.i, %and4.i
7075 %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7076 %and10.i = and <4 x i32> %shuffle9.i, %and7.i
7077 %3 = bitcast <4 x i32> %and10.i to <2 x i64>
7078 %vecext.i = extractelement <2 x i64> %3, i32 0
7079 %conv.i = trunc i64 %vecext.i to i32
7083 define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7084 ; X86-LABEL: test_mm512_mask_reduce_or_epi32:
7085 ; X86: # %bb.0: # %entry
7086 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7087 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7088 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7089 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
7090 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
7091 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
7092 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7093 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
7094 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7095 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
7096 ; X86-NEXT: vmovd %xmm0, %eax
7097 ; X86-NEXT: vzeroupper
7100 ; X64-LABEL: test_mm512_mask_reduce_or_epi32:
7101 ; X64: # %bb.0: # %entry
7102 ; X64-NEXT: kmovw %edi, %k1
7103 ; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7104 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7105 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
7106 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
7107 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
7108 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7109 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
7110 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7111 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
7112 ; X64-NEXT: vmovq %xmm0, %rax
7113 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
7114 ; X64-NEXT: vzeroupper
7117 %0 = bitcast <8 x i64> %__W to <16 x i32>
7118 %1 = bitcast i16 %__M to <16 x i1>
7119 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
7120 %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7121 %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7122 %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
7123 %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7124 %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7125 %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
7126 %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7127 %or7.i = or <4 x i32> %shuffle6.i, %or4.i
7128 %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7129 %or10.i = or <4 x i32> %shuffle9.i, %or7.i
7130 %3 = bitcast <4 x i32> %or10.i to <2 x i64>
7131 %vecext.i = extractelement <2 x i64> %3, i32 0
7132 %conv.i = trunc i64 %vecext.i to i32
7136 define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
7137 ; X86-LABEL: test_mm512_reduce_add_pd:
7138 ; X86: # %bb.0: # %entry
7139 ; X86-NEXT: pushl %ebp
7140 ; X86-NEXT: .cfi_def_cfa_offset 8
7141 ; X86-NEXT: .cfi_offset %ebp, -8
7142 ; X86-NEXT: movl %esp, %ebp
7143 ; X86-NEXT: .cfi_def_cfa_register %ebp
7144 ; X86-NEXT: andl $-8, %esp
7145 ; X86-NEXT: subl $8, %esp
7146 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7147 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7148 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7149 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7150 ; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
7151 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7152 ; X86-NEXT: fldl (%esp)
7153 ; X86-NEXT: movl %ebp, %esp
7154 ; X86-NEXT: popl %ebp
7155 ; X86-NEXT: .cfi_def_cfa %esp, 4
7156 ; X86-NEXT: vzeroupper
7159 ; X64-LABEL: test_mm512_reduce_add_pd:
7160 ; X64: # %bb.0: # %entry
7161 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7162 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7163 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7164 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7165 ; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
7166 ; X64-NEXT: vzeroupper
7169 %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7170 %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7171 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
7172 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7173 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7174 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
7175 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
7176 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
7177 %vecext.i = extractelement <2 x double> %add7.i, i32 0
7178 ret double %vecext.i
7181 define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
7182 ; X86-LABEL: test_mm512_reduce_mul_pd:
7183 ; X86: # %bb.0: # %entry
7184 ; X86-NEXT: pushl %ebp
7185 ; X86-NEXT: .cfi_def_cfa_offset 8
7186 ; X86-NEXT: .cfi_offset %ebp, -8
7187 ; X86-NEXT: movl %esp, %ebp
7188 ; X86-NEXT: .cfi_def_cfa_register %ebp
7189 ; X86-NEXT: andl $-8, %esp
7190 ; X86-NEXT: subl $8, %esp
7191 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7192 ; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
7193 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7194 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7195 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7196 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7197 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7198 ; X86-NEXT: fldl (%esp)
7199 ; X86-NEXT: movl %ebp, %esp
7200 ; X86-NEXT: popl %ebp
7201 ; X86-NEXT: .cfi_def_cfa %esp, 4
7202 ; X86-NEXT: vzeroupper
7205 ; X64-LABEL: test_mm512_reduce_mul_pd:
7206 ; X64: # %bb.0: # %entry
7207 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7208 ; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
7209 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7210 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7211 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7212 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7213 ; X64-NEXT: vzeroupper
7216 %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7217 %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7218 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
7219 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7220 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7221 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
7222 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
7223 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
7224 %vecext.i = extractelement <2 x double> %mul7.i, i32 0
7225 ret double %vecext.i
7228 define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
7229 ; X86-LABEL: test_mm512_reduce_add_ps:
7230 ; X86: # %bb.0: # %entry
7231 ; X86-NEXT: pushl %eax
7232 ; X86-NEXT: .cfi_def_cfa_offset 8
7233 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7234 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
7235 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7236 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7237 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7238 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7239 ; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
7240 ; X86-NEXT: vmovss %xmm0, (%esp)
7241 ; X86-NEXT: flds (%esp)
7242 ; X86-NEXT: popl %eax
7243 ; X86-NEXT: .cfi_def_cfa_offset 4
7244 ; X86-NEXT: vzeroupper
7247 ; X64-LABEL: test_mm512_reduce_add_ps:
7248 ; X64: # %bb.0: # %entry
7249 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7250 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
7251 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7252 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7253 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7254 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7255 ; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
7256 ; X64-NEXT: vzeroupper
7259 %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7260 %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7261 %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
7262 %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7263 %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7264 %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
7265 %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7266 %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
7267 %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7268 %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
7269 %vecext.i = extractelement <4 x float> %add10.i, i32 0
7273 define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
7274 ; X86-LABEL: test_mm512_reduce_mul_ps:
7275 ; X86: # %bb.0: # %entry
7276 ; X86-NEXT: pushl %eax
7277 ; X86-NEXT: .cfi_def_cfa_offset 8
7278 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7279 ; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
7280 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7281 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7282 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7283 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7284 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7285 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7286 ; X86-NEXT: vmovss %xmm0, (%esp)
7287 ; X86-NEXT: flds (%esp)
7288 ; X86-NEXT: popl %eax
7289 ; X86-NEXT: .cfi_def_cfa_offset 4
7290 ; X86-NEXT: vzeroupper
7293 ; X64-LABEL: test_mm512_reduce_mul_ps:
7294 ; X64: # %bb.0: # %entry
7295 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7296 ; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
7297 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7298 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7299 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7300 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7301 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7302 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7303 ; X64-NEXT: vzeroupper
7306 %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7307 %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7308 %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
7309 %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7310 %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7311 %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
7312 %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7313 %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
7314 %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7315 %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
7316 %vecext.i = extractelement <4 x float> %mul10.i, i32 0
7320 define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
7321 ; X86-LABEL: test_mm512_mask_reduce_add_pd:
7322 ; X86: # %bb.0: # %entry
7323 ; X86-NEXT: pushl %ebp
7324 ; X86-NEXT: .cfi_def_cfa_offset 8
7325 ; X86-NEXT: .cfi_offset %ebp, -8
7326 ; X86-NEXT: movl %esp, %ebp
7327 ; X86-NEXT: .cfi_def_cfa_register %ebp
7328 ; X86-NEXT: andl $-8, %esp
7329 ; X86-NEXT: subl $8, %esp
7330 ; X86-NEXT: movb 8(%ebp), %al
7331 ; X86-NEXT: kmovw %eax, %k1
7332 ; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
7333 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7334 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7335 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7336 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7337 ; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
7338 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7339 ; X86-NEXT: fldl (%esp)
7340 ; X86-NEXT: movl %ebp, %esp
7341 ; X86-NEXT: popl %ebp
7342 ; X86-NEXT: .cfi_def_cfa %esp, 4
7343 ; X86-NEXT: vzeroupper
7346 ; X64-LABEL: test_mm512_mask_reduce_add_pd:
7347 ; X64: # %bb.0: # %entry
7348 ; X64-NEXT: kmovw %edi, %k1
7349 ; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
7350 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7351 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7352 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7353 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7354 ; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
7355 ; X64-NEXT: vzeroupper
7358 %0 = bitcast i8 %__M to <8 x i1>
7359 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
7360 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7361 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7362 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
7363 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7364 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7365 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
7366 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
7367 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
7368 %vecext.i = extractelement <2 x double> %add7.i, i32 0
7369 ret double %vecext.i
; Masked multiply-reduction of <8 x double>: lanes cleared by the i8 mask
; %__M take the multiplicative identity 1.0 via select, then three
; shufflevector+fmul halving steps (8->4->2->1) leave the product in
; element 0. Check lines are autogenerated — regenerate, don't hand-edit.
7372 define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
7373 ; X86-LABEL: test_mm512_mask_reduce_mul_pd:
7374 ; X86:       # %bb.0: # %entry
7375 ; X86-NEXT: pushl %ebp
7376 ; X86-NEXT: .cfi_def_cfa_offset 8
7377 ; X86-NEXT: .cfi_offset %ebp, -8
7378 ; X86-NEXT: movl %esp, %ebp
7379 ; X86-NEXT: .cfi_def_cfa_register %ebp
7380 ; X86-NEXT: andl $-8, %esp
7381 ; X86-NEXT: subl $8, %esp
7382 ; X86-NEXT: movb 8(%ebp), %al
7383 ; X86-NEXT: kmovw %eax, %k1
7384 ; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
7385 ; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
7386 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7387 ; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
7388 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7389 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7390 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7391 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7392 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7393 ; X86-NEXT: fldl (%esp)
7394 ; X86-NEXT: movl %ebp, %esp
7395 ; X86-NEXT: popl %ebp
7396 ; X86-NEXT: .cfi_def_cfa %esp, 4
7397 ; X86-NEXT: vzeroupper
7400 ; X64-LABEL: test_mm512_mask_reduce_mul_pd:
7401 ; X64:       # %bb.0: # %entry
7402 ; X64-NEXT: kmovw %edi, %k1
7403 ; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
7404 ; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
7405 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7406 ; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
7407 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7408 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7409 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7410 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7411 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_mul_pd expansion (see file NOTE).
7414 %0 = bitcast i8 %__M to <8 x i1>
7415 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
7416 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7417 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7418 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
7419 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7420 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7421 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
7422 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
7423 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
7424 %vecext.i = extractelement <2 x double> %mul7.i, i32 0
7425 ret double %vecext.i
; Masked add-reduction of <16 x float>: masked-off lanes become the additive
; identity 0.0 (zeroinitializer select), then four halving steps
; (16->8->4->2->1) of shufflevector+fadd leave the sum in element 0.
; Autogenerated check lines — regenerate rather than hand-edit.
7428 define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
7429 ; X86-LABEL: test_mm512_mask_reduce_add_ps:
7430 ; X86:       # %bb.0: # %entry
7431 ; X86-NEXT: pushl %eax
7432 ; X86-NEXT: .cfi_def_cfa_offset 8
7433 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7434 ; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
7435 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7436 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
7437 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7438 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7439 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7440 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7441 ; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
7442 ; X86-NEXT: vmovss %xmm0, (%esp)
7443 ; X86-NEXT: flds (%esp)
7444 ; X86-NEXT: popl %eax
7445 ; X86-NEXT: .cfi_def_cfa_offset 4
7446 ; X86-NEXT: vzeroupper
7449 ; X64-LABEL: test_mm512_mask_reduce_add_ps:
7450 ; X64:       # %bb.0: # %entry
7451 ; X64-NEXT: kmovw %edi, %k1
7452 ; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
7453 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7454 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
7455 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7456 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7457 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7458 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7459 ; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
7460 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_add_ps expansion (see file NOTE).
7463 %0 = bitcast i16 %__M to <16 x i1>
7464 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
7465 %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7466 %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7467 %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
7468 %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7469 %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7470 %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
7471 %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7472 %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
7473 %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7474 %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
7475 %vecext.i = extractelement <4 x float> %add10.i, i32 0
; Masked multiply-reduction of <16 x float>: masked-off lanes take the
; identity 1.0, then four shufflevector+fmul halving steps (16->8->4->2->1)
; leave the product in element 0. Autogenerated check lines — regenerate
; with update_llc_test_checks.py rather than hand-editing.
7479 define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
7480 ; X86-LABEL: test_mm512_mask_reduce_mul_ps:
7481 ; X86:       # %bb.0: # %entry
7482 ; X86-NEXT: pushl %eax
7483 ; X86-NEXT: .cfi_def_cfa_offset 8
7484 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7485 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
7486 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
7487 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7488 ; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
7489 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7490 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7491 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7492 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7493 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7494 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7495 ; X86-NEXT: vmovss %xmm0, (%esp)
7496 ; X86-NEXT: flds (%esp)
7497 ; X86-NEXT: popl %eax
7498 ; X86-NEXT: .cfi_def_cfa_offset 4
7499 ; X86-NEXT: vzeroupper
7502 ; X64-LABEL: test_mm512_mask_reduce_mul_ps:
7503 ; X64:       # %bb.0: # %entry
7504 ; X64-NEXT: kmovw %edi, %k1
7505 ; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
7506 ; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
7507 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7508 ; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
7509 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7510 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7511 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7512 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7513 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7514 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7515 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_mul_ps expansion (see file NOTE).
7518 %0 = bitcast i16 %__M to <16 x i1>
7519 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
7520 %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7521 %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7522 %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
7523 %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7524 %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7525 %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
7526 %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
7527 %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
7528 %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7529 %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
7530 %vecext.i = extractelement <4 x float> %mul10.i, i32 0
; Signed-max reduction of <8 x i64>: three shuffle + icmp/select (max) steps
; halve the vector 8->4->2->1; result is element 0. On 32-bit targets the
; i64 result is returned in eax:edx (vmovd/vpextrd pair). Autogenerated
; check lines — regenerate rather than hand-edit.
7534 define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
7535 ; X86-LABEL: test_mm512_reduce_max_epi64:
7536 ; X86:       # %bb.0: # %entry
7537 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7538 ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7539 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7540 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7541 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7542 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7543 ; X86-NEXT: vmovd %xmm0, %eax
7544 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7545 ; X86-NEXT: vzeroupper
7548 ; X64-LABEL: test_mm512_reduce_max_epi64:
7549 ; X64:       # %bb.0: # %entry
7550 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7551 ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7552 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7553 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7554 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7555 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7556 ; X64-NEXT: vmovq %xmm0, %rax
7557 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_reduce_max_epi64 expansion (see file NOTE).
7560 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7561 %0 = icmp slt <8 x i64> %shuffle.i, %__W
7562 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7563 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7564 %2 = icmp sgt <8 x i64> %1, %shuffle1.i
7565 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7566 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7567 %4 = icmp sgt <8 x i64> %3, %shuffle3.i
7568 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7569 %vecext.i = extractelement <8 x i64> %5, i32 0
; Unsigned-max reduction of <8 x i64>: identical shuffle ladder to the
; signed variant above, but icmp ult/ugt selects lower to vpmaxuq.
; Autogenerated check lines — regenerate rather than hand-edit.
7573 define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
7574 ; X86-LABEL: test_mm512_reduce_max_epu64:
7575 ; X86:       # %bb.0: # %entry
7576 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7577 ; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
7578 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7579 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7580 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7581 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7582 ; X86-NEXT: vmovd %xmm0, %eax
7583 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7584 ; X86-NEXT: vzeroupper
7587 ; X64-LABEL: test_mm512_reduce_max_epu64:
7588 ; X64:       # %bb.0: # %entry
7589 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7590 ; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
7591 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7592 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7593 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7594 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7595 ; X64-NEXT: vmovq %xmm0, %rax
7596 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_reduce_max_epu64 expansion (see file NOTE).
7599 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7600 %0 = icmp ult <8 x i64> %shuffle.i, %__W
7601 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7602 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7603 %2 = icmp ugt <8 x i64> %1, %shuffle1.i
7604 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7605 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7606 %4 = icmp ugt <8 x i64> %3, %shuffle3.i
7607 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7608 %vecext.i = extractelement <8 x i64> %5, i32 0
; FP max-reduction of <8 x double> via the avx/sse2 max intrinsics:
; halve 8->4->2, then swap and max once more; result is element 0.
; The 32-bit sequence spills to the stack and returns through fldl (x87
; return convention). Autogenerated check lines — regenerate, don't edit.
7612 define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
7613 ; X86-LABEL: test_mm512_reduce_max_pd:
7614 ; X86:       # %bb.0: # %entry
7615 ; X86-NEXT: pushl %ebp
7616 ; X86-NEXT: .cfi_def_cfa_offset 8
7617 ; X86-NEXT: .cfi_offset %ebp, -8
7618 ; X86-NEXT: movl %esp, %ebp
7619 ; X86-NEXT: .cfi_def_cfa_register %ebp
7620 ; X86-NEXT: andl $-8, %esp
7621 ; X86-NEXT: subl $8, %esp
7622 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7623 ; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
7624 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7625 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7626 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7627 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7628 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7629 ; X86-NEXT: fldl (%esp)
7630 ; X86-NEXT: movl %ebp, %esp
7631 ; X86-NEXT: popl %ebp
7632 ; X86-NEXT: .cfi_def_cfa %esp, 4
7633 ; X86-NEXT: vzeroupper
7636 ; X64-LABEL: test_mm512_reduce_max_pd:
7637 ; X64:       # %bb.0: # %entry
7638 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7639 ; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
7640 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7641 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7642 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7643 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7644 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_reduce_max_pd expansion (see file NOTE).
7647 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7648 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7649 %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
7650 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7651 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7652 %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
7653 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7654 %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
7655 %vecext.i = extractelement <2 x double> %2, i32 0
7656 ret double %vecext.i
; Signed-min reduction of <8 x i64>: same shuffle ladder as the max
; variants, with icmp sgt/slt selects lowering to vpminsq.
; Autogenerated check lines — regenerate rather than hand-edit.
7659 define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
7660 ; X86-LABEL: test_mm512_reduce_min_epi64:
7661 ; X86:       # %bb.0: # %entry
7662 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7663 ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
7664 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7665 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7666 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7667 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7668 ; X86-NEXT: vmovd %xmm0, %eax
7669 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7670 ; X86-NEXT: vzeroupper
7673 ; X64-LABEL: test_mm512_reduce_min_epi64:
7674 ; X64:       # %bb.0: # %entry
7675 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7676 ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
7677 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7678 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7679 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7680 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7681 ; X64-NEXT: vmovq %xmm0, %rax
7682 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_reduce_min_epi64 expansion (see file NOTE).
7685 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7686 %0 = icmp sgt <8 x i64> %shuffle.i, %__W
7687 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7688 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7689 %2 = icmp slt <8 x i64> %1, %shuffle1.i
7690 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7691 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7692 %4 = icmp slt <8 x i64> %3, %shuffle3.i
7693 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7694 %vecext.i = extractelement <8 x i64> %5, i32 0
; Unsigned-min reduction of <8 x i64>: shuffle ladder with icmp ugt/ult
; selects lowering to vpminuq. Autogenerated check lines — regenerate
; rather than hand-edit.
7698 define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
7699 ; X86-LABEL: test_mm512_reduce_min_epu64:
7700 ; X86:       # %bb.0: # %entry
7701 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7702 ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
7703 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7704 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7705 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7706 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7707 ; X86-NEXT: vmovd %xmm0, %eax
7708 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7709 ; X86-NEXT: vzeroupper
7712 ; X64-LABEL: test_mm512_reduce_min_epu64:
7713 ; X64:       # %bb.0: # %entry
7714 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7715 ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
7716 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7717 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7718 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7719 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7720 ; X64-NEXT: vmovq %xmm0, %rax
7721 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_reduce_min_epu64 expansion (see file NOTE).
7724 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7725 %0 = icmp ugt <8 x i64> %shuffle.i, %__W
7726 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7727 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7728 %2 = icmp ult <8 x i64> %1, %shuffle1.i
7729 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7730 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7731 %4 = icmp ult <8 x i64> %3, %shuffle3.i
7732 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7733 %vecext.i = extractelement <8 x i64> %5, i32 0
; FP min-reduction of <8 x double> via the avx/sse2 min intrinsics:
; halve 8->4->2, then swap-and-min once more; result is element 0. The
; 32-bit sequence returns through the x87 stack (vmovlpd + fldl).
; Autogenerated check lines — regenerate rather than hand-edit.
7737 define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
7738 ; X86-LABEL: test_mm512_reduce_min_pd:
7739 ; X86:       # %bb.0: # %entry
7740 ; X86-NEXT: pushl %ebp
7741 ; X86-NEXT: .cfi_def_cfa_offset 8
7742 ; X86-NEXT: .cfi_offset %ebp, -8
7743 ; X86-NEXT: movl %esp, %ebp
7744 ; X86-NEXT: .cfi_def_cfa_register %ebp
7745 ; X86-NEXT: andl $-8, %esp
7746 ; X86-NEXT: subl $8, %esp
7747 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7748 ; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0
7749 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7750 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
7751 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7752 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
7753 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7754 ; X86-NEXT: fldl (%esp)
7755 ; X86-NEXT: movl %ebp, %esp
7756 ; X86-NEXT: popl %ebp
7757 ; X86-NEXT: .cfi_def_cfa %esp, 4
7758 ; X86-NEXT: vzeroupper
7761 ; X64-LABEL: test_mm512_reduce_min_pd:
7762 ; X64:       # %bb.0: # %entry
7763 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7764 ; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
7765 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7766 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
7767 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7768 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
7769 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_reduce_min_pd expansion (see file NOTE).
7772 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7773 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7774 %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
7775 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7776 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7777 %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
7778 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7779 %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
7780 %vecext.i = extractelement <2 x double> %2, i32 0
7781 ret double %vecext.i
; Masked signed-max reduction of <8 x i64>: masked-off lanes become
; INT64_MIN (the identity for signed max), then the shuffle+vpmaxsq ladder
; reduces 8->1. Note the 32-bit run checks the same splat printed as
; interleaved 32-bit halves (0,2147483648 pairs). Autogenerated checks —
; regenerate rather than hand-edit.
7784 define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
7785 ; X86-LABEL: test_mm512_mask_reduce_max_epi64:
7786 ; X86:       # %bb.0: # %entry
7787 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7788 ; X86-NEXT: kmovw %eax, %k1
7789 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
7790 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7791 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7792 ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7793 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7794 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7795 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7796 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7797 ; X86-NEXT: vmovd %xmm0, %eax
7798 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7799 ; X86-NEXT: vzeroupper
7802 ; X64-LABEL: test_mm512_mask_reduce_max_epi64:
7803 ; X64:       # %bb.0: # %entry
7804 ; X64-NEXT: kmovw %edi, %k1
7805 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
7806 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7807 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7808 ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7809 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7810 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7811 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7812 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7813 ; X64-NEXT: vmovq %xmm0, %rax
7814 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_max_epi64 expansion (see file NOTE).
7817 %0 = bitcast i8 %__M to <8 x i1>
7818 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
7819 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7820 %2 = icmp sgt <8 x i64> %1, %shuffle.i
7821 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
7822 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7823 %4 = icmp sgt <8 x i64> %3, %shuffle3.i
7824 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7825 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7826 %6 = icmp sgt <8 x i64> %5, %shuffle5.i
7827 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
7828 %vecext.i = extractelement <8 x i64> %7, i32 0
; Masked unsigned-max reduction of <8 x i64>: masked-off lanes become 0
; (the identity for unsigned max), which lowers to a zero-masking move;
; then the shuffle+vpmaxuq ladder reduces 8->1. Autogenerated checks —
; regenerate rather than hand-edit.
7832 define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
7833 ; X86-LABEL: test_mm512_mask_reduce_max_epu64:
7834 ; X86:       # %bb.0: # %entry
7835 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7836 ; X86-NEXT: kmovw %eax, %k1
7837 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
7838 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7839 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7840 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7841 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7842 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7843 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7844 ; X86-NEXT: vmovd %xmm0, %eax
7845 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7846 ; X86-NEXT: vzeroupper
7849 ; X64-LABEL: test_mm512_mask_reduce_max_epu64:
7850 ; X64:       # %bb.0: # %entry
7851 ; X64-NEXT: kmovw %edi, %k1
7852 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
7853 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7854 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7855 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7856 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7857 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7858 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7859 ; X64-NEXT: vmovq %xmm0, %rax
7860 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_max_epu64 expansion (see file NOTE).
7863 %0 = bitcast i8 %__M to <8 x i1>
7864 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
7865 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7866 %2 = icmp ugt <8 x i64> %1, %shuffle.i
7867 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
7868 %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7869 %4 = icmp ugt <8 x i64> %3, %shuffle2.i
7870 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
7871 %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7872 %6 = icmp ugt <8 x i64> %5, %shuffle4.i
7873 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
7874 %vecext.i = extractelement <8 x i64> %7, i32 0
; Masked FP max-reduction of <8 x double>: masked-off lanes become -Inf
; (0xFFF0000000000000, the identity for max), then the avx/sse2 max
; intrinsic ladder reduces 8->4->2->1. Autogenerated checks — regenerate
; rather than hand-edit.
7878 define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
7879 ; X86-LABEL: test_mm512_mask_reduce_max_pd:
7880 ; X86:       # %bb.0: # %entry
7881 ; X86-NEXT: pushl %ebp
7882 ; X86-NEXT: .cfi_def_cfa_offset 8
7883 ; X86-NEXT: .cfi_offset %ebp, -8
7884 ; X86-NEXT: movl %esp, %ebp
7885 ; X86-NEXT: .cfi_def_cfa_register %ebp
7886 ; X86-NEXT: andl $-8, %esp
7887 ; X86-NEXT: subl $8, %esp
7888 ; X86-NEXT: movb 8(%ebp), %al
7889 ; X86-NEXT: kmovw %eax, %k1
7890 ; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
7891 ; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
7892 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7893 ; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
7894 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7895 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7896 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7897 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7898 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7899 ; X86-NEXT: fldl (%esp)
7900 ; X86-NEXT: movl %ebp, %esp
7901 ; X86-NEXT: popl %ebp
7902 ; X86-NEXT: .cfi_def_cfa %esp, 4
7903 ; X86-NEXT: vzeroupper
7906 ; X64-LABEL: test_mm512_mask_reduce_max_pd:
7907 ; X64:       # %bb.0: # %entry
7908 ; X64-NEXT: kmovw %edi, %k1
7909 ; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
7910 ; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
7911 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7912 ; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
7913 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7914 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7915 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7916 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7917 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_max_pd expansion (see file NOTE).
7920 %0 = bitcast i8 %__M to <8 x i1>
7921 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
7922 %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7923 %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7924 %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
7925 %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7926 %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7927 %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
7928 %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7929 %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
7930 %vecext.i = extractelement <2 x double> %4, i32 0
7931 ret double %vecext.i
; Masked signed-min reduction of <8 x i64>: masked-off lanes become
; INT64_MAX (the identity for signed min), then the shuffle+vpminsq ladder
; reduces 8->1. The 32-bit run prints the same splat as interleaved 32-bit
; halves (4294967295,2147483647 pairs). Autogenerated checks — regenerate
; rather than hand-edit.
7934 define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
7935 ; X86-LABEL: test_mm512_mask_reduce_min_epi64:
7936 ; X86:       # %bb.0: # %entry
7937 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7938 ; X86-NEXT: kmovw %eax, %k1
7939 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
7940 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7941 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7942 ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
7943 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7944 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7945 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7946 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7947 ; X86-NEXT: vmovd %xmm0, %eax
7948 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7949 ; X86-NEXT: vzeroupper
7952 ; X64-LABEL: test_mm512_mask_reduce_min_epi64:
7953 ; X64:       # %bb.0: # %entry
7954 ; X64-NEXT: kmovw %edi, %k1
7955 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
7956 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7957 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7958 ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
7959 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7960 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7961 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7962 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7963 ; X64-NEXT: vmovq %xmm0, %rax
7964 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_min_epi64 expansion (see file NOTE).
7967 %0 = bitcast i8 %__M to <8 x i1>
7968 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
7969 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7970 %2 = icmp slt <8 x i64> %1, %shuffle.i
7971 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
7972 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7973 %4 = icmp slt <8 x i64> %3, %shuffle3.i
7974 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7975 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7976 %6 = icmp slt <8 x i64> %5, %shuffle5.i
7977 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
7978 %vecext.i = extractelement <8 x i64> %7, i32 0
; Masked unsigned-min reduction of <8 x i64>: masked-off lanes become
; all-ones (UINT64_MAX, the identity for unsigned min; materialized with
; vpternlogd $255), then the shuffle+vpminuq ladder reduces 8->1.
; Autogenerated checks — regenerate rather than hand-edit.
7982 define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
7983 ; X86-LABEL: test_mm512_mask_reduce_min_epu64:
7984 ; X86:       # %bb.0: # %entry
7985 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7986 ; X86-NEXT: kmovw %eax, %k1
7987 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
7988 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7989 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7990 ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
7991 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7992 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7993 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7994 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7995 ; X86-NEXT: vmovd %xmm0, %eax
7996 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7997 ; X86-NEXT: vzeroupper
8000 ; X64-LABEL: test_mm512_mask_reduce_min_epu64:
8001 ; X64:       # %bb.0: # %entry
8002 ; X64-NEXT: kmovw %edi, %k1
8003 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8004 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
8005 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8006 ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
8007 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8008 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
8009 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8010 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
8011 ; X64-NEXT: vmovq %xmm0, %rax
8012 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_min_epu64 expansion (see file NOTE).
8015 %0 = bitcast i8 %__M to <8 x i1>
8016 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
8017 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8018 %2 = icmp ult <8 x i64> %1, %shuffle.i
8019 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8020 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8021 %4 = icmp ult <8 x i64> %3, %shuffle3.i
8022 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
8023 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8024 %6 = icmp ult <8 x i64> %5, %shuffle5.i
8025 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
8026 %vecext.i = extractelement <8 x i64> %7, i32 0
; Masked FP min-reduction of <8 x double>: masked-off lanes become +Inf
; (0x7FF0000000000000, the identity for min), then the avx/sse2 min
; intrinsic ladder reduces 8->4->2->1. Autogenerated checks — regenerate
; rather than hand-edit.
8030 define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
8031 ; X86-LABEL: test_mm512_mask_reduce_min_pd:
8032 ; X86:       # %bb.0: # %entry
8033 ; X86-NEXT: pushl %ebp
8034 ; X86-NEXT: .cfi_def_cfa_offset 8
8035 ; X86-NEXT: .cfi_offset %ebp, -8
8036 ; X86-NEXT: movl %esp, %ebp
8037 ; X86-NEXT: .cfi_def_cfa_register %ebp
8038 ; X86-NEXT: andl $-8, %esp
8039 ; X86-NEXT: subl $8, %esp
8040 ; X86-NEXT: movb 8(%ebp), %al
8041 ; X86-NEXT: kmovw %eax, %k1
8042 ; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8043 ; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
8044 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8045 ; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
8046 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8047 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
8048 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8049 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
8050 ; X86-NEXT: vmovlpd %xmm0, (%esp)
8051 ; X86-NEXT: fldl (%esp)
8052 ; X86-NEXT: movl %ebp, %esp
8053 ; X86-NEXT: popl %ebp
8054 ; X86-NEXT: .cfi_def_cfa %esp, 4
8055 ; X86-NEXT: vzeroupper
8058 ; X64-LABEL: test_mm512_mask_reduce_min_pd:
8059 ; X64:       # %bb.0: # %entry
8060 ; X64-NEXT: kmovw %edi, %k1
8061 ; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8062 ; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
8063 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8064 ; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
8065 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8066 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
8067 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8068 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
8069 ; X64-NEXT: vzeroupper
; IR mirrors clang's _mm512_mask_reduce_min_pd expansion (see file NOTE).
8072 %0 = bitcast i8 %__M to <8 x i1>
8073 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
8074 %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8075 %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8076 %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
8077 %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
8078 %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
8079 %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
8080 %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
8081 %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
8082 %vecext.i = extractelement <2 x double> %4, i32 0
8083 ret double %vecext.i
; Signed-max reduction of 16 x i32 (passed as <8 x i64>): expected codegen is a
; vpmaxsd shuffle-reduce tree (zmm -> ymm -> xmm -> scalar via vmovd).
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8086 define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
8087 ; CHECK-LABEL: test_mm512_reduce_max_epi32:
8088 ; CHECK: # %bb.0: # %entry
8089 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8090 ; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
8091 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8092 ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8093 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8094 ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8095 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8096 ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8097 ; CHECK-NEXT: vmovd %xmm0, %eax
8098 ; CHECK-NEXT: vzeroupper
8099 ; CHECK-NEXT: ret{{[l|q]}}
; Reference IR: split the 512-bit vector into halves, icmp sgt + select as the
; max, repeat on 128-bit halves and lane shuffles until one element remains.
8101 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8102 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8103 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8104 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8105 %2 = icmp sgt <8 x i32> %0, %1
8106 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8107 %4 = bitcast <8 x i32> %3 to <4 x i64>
8108 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8109 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8110 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8111 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8112 %7 = icmp sgt <4 x i32> %5, %6
8113 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8114 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8115 %9 = icmp sgt <4 x i32> %8, %shuffle.i
8116 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8117 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8118 %11 = icmp sgt <4 x i32> %10, %shuffle8.i
8119 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8120 %vecext.i = extractelement <4 x i32> %12, i32 0
; Unsigned-max reduction of 16 x i32: same shuffle-reduce tree as the signed
; variant, but icmp ugt must lower to vpmaxud rather than vpmaxsd.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8124 define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
8125 ; CHECK-LABEL: test_mm512_reduce_max_epu32:
8126 ; CHECK: # %bb.0: # %entry
8127 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8128 ; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
8129 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8130 ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8131 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8132 ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8133 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8134 ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8135 ; CHECK-NEXT: vmovd %xmm0, %eax
8136 ; CHECK-NEXT: vzeroupper
8137 ; CHECK-NEXT: ret{{[l|q]}}
; Reference IR: halve-and-compare (icmp ugt + select) until a single lane remains.
8139 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8140 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8141 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8142 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8143 %2 = icmp ugt <8 x i32> %0, %1
8144 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8145 %4 = bitcast <8 x i32> %3 to <4 x i64>
8146 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8147 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8148 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8149 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8150 %7 = icmp ugt <4 x i32> %5, %6
8151 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8152 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8153 %9 = icmp ugt <4 x i32> %8, %shuffle.i
8154 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8155 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8156 %11 = icmp ugt <4 x i32> %10, %shuffle8.i
8157 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8158 %vecext.i = extractelement <4 x i32> %12, i32 0
; FP max reduction of <16 x float> via a vmaxps shuffle tree. X86 spills the
; scalar result to the stack and reloads with flds (x87 float return); X64
; returns directly in xmm0.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8162 define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
8163 ; X86-LABEL: test_mm512_reduce_max_ps:
8164 ; X86: # %bb.0: # %entry
8165 ; X86-NEXT: pushl %eax
8166 ; X86-NEXT: .cfi_def_cfa_offset 8
8167 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8168 ; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
8169 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8170 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8171 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8172 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8173 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8174 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8175 ; X86-NEXT: vmovss %xmm0, (%esp)
8176 ; X86-NEXT: flds (%esp)
8177 ; X86-NEXT: popl %eax
8178 ; X86-NEXT: .cfi_def_cfa_offset 4
8179 ; X86-NEXT: vzeroupper
8182 ; X64-LABEL: test_mm512_reduce_max_ps:
8183 ; X64: # %bb.0: # %entry
8184 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8185 ; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
8186 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8187 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8188 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8189 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8190 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8191 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8192 ; X64-NEXT: vzeroupper
; Reference IR: halves are extracted through an <8 x double> bitcast (mirrors
; the clang intrinsic lowering), then llvm.x86.avx.max.ps.256 / sse.max.ps fold.
8195 %0 = bitcast <16 x float> %__W to <8 x double>
8196 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8197 %1 = bitcast <4 x double> %extract.i to <8 x float>
8198 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8199 %2 = bitcast <4 x double> %extract2.i to <8 x float>
8200 %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
8201 %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8202 %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8203 %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
8204 %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8205 %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
8206 %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8207 %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
8208 %vecext.i = extractelement <4 x float> %6, i32 0
; Signed-min reduction of 16 x i32: mirror of the max test with icmp slt,
; expected to lower to a vpminsd shuffle-reduce tree.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8212 define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
8213 ; CHECK-LABEL: test_mm512_reduce_min_epi32:
8214 ; CHECK: # %bb.0: # %entry
8215 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8216 ; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
8217 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8218 ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8219 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8220 ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8221 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8222 ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8223 ; CHECK-NEXT: vmovd %xmm0, %eax
8224 ; CHECK-NEXT: vzeroupper
8225 ; CHECK-NEXT: ret{{[l|q]}}
; Reference IR: halve-and-compare (icmp slt + select) until a single lane remains.
8227 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8228 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8229 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8230 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8231 %2 = icmp slt <8 x i32> %0, %1
8232 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8233 %4 = bitcast <8 x i32> %3 to <4 x i64>
8234 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8235 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8236 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8237 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8238 %7 = icmp slt <4 x i32> %5, %6
8239 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8240 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8241 %9 = icmp slt <4 x i32> %8, %shuffle.i
8242 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8243 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8244 %11 = icmp slt <4 x i32> %10, %shuffle8.i
8245 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8246 %vecext.i = extractelement <4 x i32> %12, i32 0
; Unsigned-min reduction of 16 x i32: icmp ult + select must lower to vpminud
; through the same shuffle-reduce tree as the other integer reductions.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8250 define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
8251 ; CHECK-LABEL: test_mm512_reduce_min_epu32:
8252 ; CHECK: # %bb.0: # %entry
8253 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8254 ; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
8255 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8256 ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
8257 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8258 ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
8259 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8260 ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
8261 ; CHECK-NEXT: vmovd %xmm0, %eax
8262 ; CHECK-NEXT: vzeroupper
8263 ; CHECK-NEXT: ret{{[l|q]}}
; Reference IR: halve-and-compare (icmp ult + select) until a single lane remains.
8265 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8266 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8267 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8268 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8269 %2 = icmp ult <8 x i32> %0, %1
8270 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8271 %4 = bitcast <8 x i32> %3 to <4 x i64>
8272 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8273 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8274 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8275 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8276 %7 = icmp ult <4 x i32> %5, %6
8277 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8278 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8279 %9 = icmp ult <4 x i32> %8, %shuffle.i
8280 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8281 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8282 %11 = icmp ult <4 x i32> %10, %shuffle8.i
8283 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8284 %vecext.i = extractelement <4 x i32> %12, i32 0
; FP min reduction of <16 x float> via a vminps shuffle tree; mirror of the
; max_ps test. X86 returns through the x87 stack (flds), X64 in xmm0.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8288 define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
8289 ; X86-LABEL: test_mm512_reduce_min_ps:
8290 ; X86: # %bb.0: # %entry
8291 ; X86-NEXT: pushl %eax
8292 ; X86-NEXT: .cfi_def_cfa_offset 8
8293 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8294 ; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
8295 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8296 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8297 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8298 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8299 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8300 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8301 ; X86-NEXT: vmovss %xmm0, (%esp)
8302 ; X86-NEXT: flds (%esp)
8303 ; X86-NEXT: popl %eax
8304 ; X86-NEXT: .cfi_def_cfa_offset 4
8305 ; X86-NEXT: vzeroupper
8308 ; X64-LABEL: test_mm512_reduce_min_ps:
8309 ; X64: # %bb.0: # %entry
8310 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8311 ; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
8312 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8313 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8314 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8315 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8316 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8317 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8318 ; X64-NEXT: vzeroupper
; Reference IR: halves extracted via an <8 x double> bitcast, then folded with
; llvm.x86.avx.min.ps.256 / llvm.x86.sse.min.ps pairwise calls.
8321 %0 = bitcast <16 x float> %__W to <8 x double>
8322 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8323 %1 = bitcast <4 x double> %extract.i to <8 x float>
8324 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8325 %2 = bitcast <4 x double> %extract2.i to <8 x float>
8326 %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
8327 %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8328 %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8329 %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
8330 %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8331 %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
8332 %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8333 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
8334 %vecext.i = extractelement <4 x float> %6, i32 0
; Masked signed-max reduction: masked-off lanes get INT32_MIN (0x80000000, the
; identity for signed max) via a broadcast + masked vmovdqa32, then the usual
; vpmaxsd shuffle-reduce tree. Mask arrives on the stack for X86, in edi for X64.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8338 define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
8339 ; X86-LABEL: test_mm512_mask_reduce_max_epi32:
8340 ; X86: # %bb.0: # %entry
8341 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
8342 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
8343 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8344 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8345 ; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
8346 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8347 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8348 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8349 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8350 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8351 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8352 ; X86-NEXT: vmovd %xmm0, %eax
8353 ; X86-NEXT: vzeroupper
8356 ; X64-LABEL: test_mm512_mask_reduce_max_epi32:
8357 ; X64: # %bb.0: # %entry
8358 ; X64-NEXT: kmovw %edi, %k1
8359 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
8360 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8361 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8362 ; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
8363 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8364 ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8365 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8366 ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8367 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8368 ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8369 ; X64-NEXT: vmovd %xmm0, %eax
8370 ; X64-NEXT: vzeroupper
; Reference IR: select INT32_MIN into masked-off lanes, then icmp sgt + select
; halving reduction down to one element.
8373 %0 = bitcast <8 x i64> %__W to <16 x i32>
8374 %1 = bitcast i16 %__M to <16 x i1>
8375 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
8376 %3 = bitcast <16 x i32> %2 to <8 x i64>
8377 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8378 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8379 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8380 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
8381 %6 = icmp sgt <8 x i32> %4, %5
8382 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8383 %8 = bitcast <8 x i32> %7 to <4 x i64>
8384 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8385 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8386 %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
8387 %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
8388 %11 = icmp sgt <4 x i32> %9, %10
8389 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8390 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8391 %13 = icmp sgt <4 x i32> %12, %shuffle.i
8392 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8393 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8394 %15 = icmp sgt <4 x i32> %14, %shuffle10.i
8395 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
8396 %vecext.i = extractelement <4 x i32> %16, i32 0
; Masked unsigned-max reduction: the identity for unsigned max is 0, so the
; masked select folds to a single zero-masking move (vmovdqa32 {k1}{z}) with no
; broadcast needed, followed by the vpmaxud shuffle-reduce tree.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8400 define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
8401 ; X86-LABEL: test_mm512_mask_reduce_max_epu32:
8402 ; X86: # %bb.0: # %entry
8403 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
8404 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
8405 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8406 ; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
8407 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8408 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8409 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8410 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8411 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8412 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8413 ; X86-NEXT: vmovd %xmm0, %eax
8414 ; X86-NEXT: vzeroupper
8417 ; X64-LABEL: test_mm512_mask_reduce_max_epu32:
8418 ; X64: # %bb.0: # %entry
8419 ; X64-NEXT: kmovw %edi, %k1
8420 ; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
8421 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8422 ; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
8423 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8424 ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8425 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8426 ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8427 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8428 ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8429 ; X64-NEXT: vmovd %xmm0, %eax
8430 ; X64-NEXT: vzeroupper
; Reference IR: select zeroinitializer into masked-off lanes, then icmp ugt +
; select halving reduction down to one element.
8433 %0 = bitcast <8 x i64> %__W to <16 x i32>
8434 %1 = bitcast i16 %__M to <16 x i1>
8435 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
8436 %3 = bitcast <16 x i32> %2 to <8 x i64>
8437 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8438 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8439 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8440 %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
8441 %6 = icmp ugt <8 x i32> %4, %5
8442 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8443 %8 = bitcast <8 x i32> %7 to <4 x i64>
8444 %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8445 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8446 %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
8447 %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
8448 %11 = icmp ugt <4 x i32> %9, %10
8449 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8450 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8451 %13 = icmp ugt <4 x i32> %12, %shuffle.i
8452 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8453 %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8454 %15 = icmp ugt <4 x i32> %14, %shuffle9.i
8455 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
8456 %vecext.i = extractelement <4 x i32> %16, i32 0
; Masked FP max reduction over <16 x float>: masked-off lanes are filled with
; -Inf (the identity for max) via broadcast + masked vmovaps, then a vmaxps
; shuffle tree folds to a scalar. X86 returns through the x87 stack (flds).
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8460 define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
8461 ; X86-LABEL: test_mm512_mask_reduce_max_ps:
8462 ; X86: # %bb.0: # %entry
8463 ; X86-NEXT: pushl %eax
8464 ; X86-NEXT: .cfi_def_cfa_offset 8
8465 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
8466 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8467 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8468 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8469 ; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
8470 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8471 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8472 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8473 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8474 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8475 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8476 ; X86-NEXT: vmovss %xmm0, (%esp)
8477 ; X86-NEXT: flds (%esp)
8478 ; X86-NEXT: popl %eax
8479 ; X86-NEXT: .cfi_def_cfa_offset 4
8480 ; X86-NEXT: vzeroupper
8483 ; X64-LABEL: test_mm512_mask_reduce_max_ps:
8484 ; X64: # %bb.0: # %entry
8485 ; X64-NEXT: kmovw %edi, %k1
8486 ; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8487 ; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8488 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8489 ; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
8490 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8491 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8492 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8493 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8494 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8495 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8496 ; X64-NEXT: vzeroupper
; Reference IR: select -Inf (0xFFF0000000000000 as float) into masked-off lanes,
; then pairwise llvm.x86.avx.max.ps.256 / llvm.x86.sse.max.ps reduction.
8499 %0 = bitcast i16 %__M to <16 x i1>
8500 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
8501 %2 = bitcast <16 x float> %1 to <8 x double>
8502 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8503 %3 = bitcast <4 x double> %extract.i to <8 x float>
8504 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8505 %4 = bitcast <4 x double> %extract4.i to <8 x float>
8506 %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
8507 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8508 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8509 %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
8510 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8511 %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
8512 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8513 %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
8514 %vecext.i = extractelement <4 x float> %8, i32 0
; Masked signed-min reduction: masked-off lanes get INT32_MAX (2147483647, the
; identity for signed min) via broadcast + masked vmovdqa32, then the vpminsd
; shuffle-reduce tree.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8518 define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
8519 ; X86-LABEL: test_mm512_mask_reduce_min_epi32:
8520 ; X86: # %bb.0: # %entry
8521 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
8522 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
8523 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8524 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8525 ; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
8526 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8527 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8528 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8529 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8530 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8531 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8532 ; X86-NEXT: vmovd %xmm0, %eax
8533 ; X86-NEXT: vzeroupper
8536 ; X64-LABEL: test_mm512_mask_reduce_min_epi32:
8537 ; X64: # %bb.0: # %entry
8538 ; X64-NEXT: kmovw %edi, %k1
8539 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
8540 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8541 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8542 ; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
8543 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8544 ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8545 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8546 ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8547 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8548 ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8549 ; X64-NEXT: vmovd %xmm0, %eax
8550 ; X64-NEXT: vzeroupper
; Reference IR: select INT32_MAX into masked-off lanes, then icmp slt + select
; halving reduction down to one element.
8553 %0 = bitcast <8 x i64> %__W to <16 x i32>
8554 %1 = bitcast i16 %__M to <16 x i1>
8555 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
8556 %3 = bitcast <16 x i32> %2 to <8 x i64>
8557 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8558 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8559 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8560 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
8561 %6 = icmp slt <8 x i32> %4, %5
8562 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8563 %8 = bitcast <8 x i32> %7 to <4 x i64>
8564 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8565 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8566 %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
8567 %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
8568 %11 = icmp slt <4 x i32> %9, %10
8569 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8570 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8571 %13 = icmp slt <4 x i32> %12, %shuffle.i
8572 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8573 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8574 %15 = icmp slt <4 x i32> %14, %shuffle10.i
8575 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
8576 %vecext.i = extractelement <4 x i32> %16, i32 0
; Masked unsigned-min reduction: identity is all-ones (UINT32_MAX), which the
; backend materializes with vpternlogd $255 (all-ones idiom) instead of a
; broadcast, then merge-masks and reduces with the vpminud shuffle tree.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
8580 define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
8581 ; X86-LABEL: test_mm512_mask_reduce_min_epu32:
8582 ; X86: # %bb.0: # %entry
8583 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
8584 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8585 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8586 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8587 ; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
8588 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8589 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
8590 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8591 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
8592 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8593 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
8594 ; X86-NEXT: vmovd %xmm0, %eax
8595 ; X86-NEXT: vzeroupper
8598 ; X64-LABEL: test_mm512_mask_reduce_min_epu32:
8599 ; X64: # %bb.0: # %entry
8600 ; X64-NEXT: kmovw %edi, %k1
8601 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8602 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8603 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8604 ; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
8605 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8606 ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
8607 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8608 ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
8609 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8610 ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
8611 ; X64-NEXT: vmovd %xmm0, %eax
8612 ; X64-NEXT: vzeroupper
; Reference IR: select all-ones (i32 -1) into masked-off lanes, then icmp ult +
; select halving reduction down to one element.
8615 %0 = bitcast <8 x i64> %__W to <16 x i32>
8616 %1 = bitcast i16 %__M to <16 x i1>
8617 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
8618 %3 = bitcast <16 x i32> %2 to <8 x i64>
8619 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8620 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8621 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8622 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
8623 %6 = icmp ult <8 x i32> %4, %5
8624 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8625 %8 = bitcast <8 x i32> %7 to <4 x i64>
8626 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8627 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8628 %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
8629 %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
8630 %11 = icmp ult <4 x i32> %9, %10
8631 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8632 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8633 %13 = icmp ult <4 x i32> %12, %shuffle.i
8634 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8635 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8636 %15 = icmp ult <4 x i32> %14, %shuffle10.i
8637 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
8638 %vecext.i = extractelement <4 x i32> %16, i32 0
8642 define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
8643 ; X86-LABEL: test_mm512_mask_reduce_min_ps:
8644 ; X86: # %bb.0: # %entry
8645 ; X86-NEXT: pushl %eax
8646 ; X86-NEXT: .cfi_def_cfa_offset 8
8647 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
8648 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8649 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8650 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8651 ; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
8652 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8653 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8654 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8655 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8656 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8657 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8658 ; X86-NEXT: vmovss %xmm0, (%esp)
8659 ; X86-NEXT: flds (%esp)
8660 ; X86-NEXT: popl %eax
8661 ; X86-NEXT: .cfi_def_cfa_offset 4
8662 ; X86-NEXT: vzeroupper
8665 ; X64-LABEL: test_mm512_mask_reduce_min_ps:
8666 ; X64: # %bb.0: # %entry
8667 ; X64-NEXT: kmovw %edi, %k1
8668 ; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8669 ; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8670 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8671 ; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
8672 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8673 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8674 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8675 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8676 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8677 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8678 ; X64-NEXT: vzeroupper
8681 %0 = bitcast i16 %__M to <16 x i1>
8682 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
8683 %2 = bitcast <16 x float> %1 to <8 x double>
8684 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8685 %3 = bitcast <4 x double> %extract.i to <8 x float>
8686 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8687 %4 = bitcast <4 x double> %extract4.i to <8 x float>
8688 %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
8689 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8690 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8691 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
8692 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8693 %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
8694 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8695 %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
8696 %vecext.i = extractelement <4 x float> %8, i32 0
8700 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
8701 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
8702 declare float @llvm.fma.f32(float, float, float) #9
8703 declare double @llvm.fma.f64(double, double, double) #9
8704 declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
8705 declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
8706 declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
8707 declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
8708 declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
8709 declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
8710 declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
8711 declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
8712 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
8713 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
8714 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
8715 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
8716 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
8717 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
8718 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
8719 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)