1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
11 ; Ensure that the backend no longer emits unnecessary vector insert
12 ; instructions immediately after SSE scalar fp instructions
13 ; like addss or mulss.
define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %3 = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %a, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %3 = insertelement <4 x float> %a, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %a, float %div, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sqrt_ss(<4 x float> %a) {
; SSE-LABEL: test_sqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = call float @llvm.sqrt.f32(float %1)
  %3 = insertelement <4 x float> %a, float %2, i32 0
  ret <4 x float> %3
}
declare float @llvm.sqrt.f32(float)
define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %add = fadd double %2, %1
  %3 = insertelement <2 x double> %a, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %a, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %mul = fmul double %2, %1
  %3 = insertelement <2 x double> %a, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %a, double %div, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sqrt_sd(<2 x double> %a) {
; SSE-LABEL: test_sqrt_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = call double @llvm.sqrt.f64(double %1)
  %3 = insertelement <2 x double> %a, double %2, i32 0
  ret <2 x double> %3
}
declare double @llvm.sqrt.f64(double)
define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %add = fadd float %1, %2
  %3 = insertelement <4 x float> %b, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %b, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %mul = fmul float %1, %2
  %3 = insertelement <4 x float> %b, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %b, float %div, i32 0
  ret <4 x float> %3
}

define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %add = fadd double %1, %2
  %3 = insertelement <2 x double> %b, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %b, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %mul = fmul double %1, %2
  %3 = insertelement <2 x double> %b, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %b, double %div, i32 0
  ret <2 x double> %3
}
define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %add2 = fadd float %2, %add
  %3 = insertelement <4 x float> %a, float %add2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    subss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %sub2 = fsub float %2, %sub
  %3 = insertelement <4 x float> %a, float %sub2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %mul2 = fmul float %2, %mul
  %3 = insertelement <4 x float> %a, float %mul2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    divss %xmm1, %xmm2
; SSE-NEXT:    divss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %div2 = fdiv float %2, %div
  %3 = insertelement <4 x float> %a, float %div2, i32 0
  ret <4 x float> %3
}
411 ; With SSE4.1 or greater, the shuffles in the following tests may
412 ; be lowered to X86Blendi nodes.
define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_add_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    addss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <4 x float> %a, i32 0
  %op = fadd float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_sub_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    subss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <4 x float> %a, i32 0
  %op = fsub float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_mul_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    mulss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <4 x float> %a, i32 0
  %op = fmul float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_div_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    divss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <4 x float> %a, i32 0
  %op = fdiv float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_add_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    addsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <2 x double> %a, i32 0
  %op = fadd double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_sub_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    subsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <2 x double> %a, i32 0
  %op = fsub double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_mul_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    mulsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <2 x double> %a, i32 0
  %op = fmul double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_div_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    divsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %ext = extractelement <2 x double> %a, i32 0
  %op = fdiv double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}
654 ; Ensure that the backend selects SSE/AVX scalar fp instructions
655 ; from a packed fp instruction plus a vector insert.
define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test_add_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_add_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test_sub_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    subps %xmm1, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_sub_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    subps %xmm1, %xmm2
; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test_mul_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_mul_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test_div_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    divps %xmm1, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_div_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    divps %xmm1, %xmm2
; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test_add_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_add_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test_sub_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm0, %xmm2
; SSE2-NEXT:    subpd %xmm1, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_sub_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd %xmm0, %xmm2
; SSE41-NEXT:    subpd %xmm1, %xmm2
; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm0[1]
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test_mul_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_mul_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test_div_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm0, %xmm2
; SSE2-NEXT:    divpd %xmm1, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test_div_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd %xmm0, %xmm2
; SSE41-NEXT:    divpd %xmm1, %xmm2
; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm0[1]
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}
define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test2_add_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_add_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test2_sub_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    subps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_sub_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    subps %xmm0, %xmm2
; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test2_mul_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_mul_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test2_div_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    divps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_div_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    divps %xmm0, %xmm2
; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test2_add_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_add_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addpd %xmm1, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test2_sub_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    subpd %xmm0, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_sub_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd %xmm1, %xmm2
; SSE41-NEXT:    subpd %xmm0, %xmm2
; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test2_mul_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulpd %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_mul_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulpd %xmm1, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test2_div_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    divpd %xmm0, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test2_div_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd %xmm1, %xmm2
; SSE41-NEXT:    divpd %xmm0, %xmm2
; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}
define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test3_add_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test3_add_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test3_sub_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    subps %xmm1, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test3_sub_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    subps %xmm1, %xmm2
; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test3_mul_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test3_mul_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test3_div_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    divps %xmm1, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test3_div_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    divps %xmm1, %xmm2
; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}
1155 define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
1156 ; SSE2-LABEL: insert_test3_add_sd:
1158 ; SSE2-NEXT: addpd %xmm0, %xmm1
1159 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1160 ; SSE2-NEXT: ret{{[l|q]}}
1162 ; SSE41-LABEL: insert_test3_add_sd:
1164 ; SSE41-NEXT: addpd %xmm0, %xmm1
1165 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1166 ; SSE41-NEXT: ret{{[l|q]}}
1168 ; AVX-LABEL: insert_test3_add_sd:
1170 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm1
1171 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1172 ; AVX-NEXT: ret{{[l|q]}}
1173 %1 = fadd <2 x double> %a, %b
1174 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
1178 define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
1179 ; SSE2-LABEL: insert_test3_sub_sd:
1181 ; SSE2-NEXT: movapd %xmm0, %xmm2
1182 ; SSE2-NEXT: subpd %xmm1, %xmm2
1183 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1184 ; SSE2-NEXT: ret{{[l|q]}}
1186 ; SSE41-LABEL: insert_test3_sub_sd:
1188 ; SSE41-NEXT: movapd %xmm0, %xmm2
1189 ; SSE41-NEXT: subpd %xmm1, %xmm2
1190 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm0[1]
1191 ; SSE41-NEXT: movapd %xmm2, %xmm0
1192 ; SSE41-NEXT: ret{{[l|q]}}
1194 ; AVX-LABEL: insert_test3_sub_sd:
1196 ; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm1
1197 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1198 ; AVX-NEXT: ret{{[l|q]}}
1199 %1 = fsub <2 x double> %a, %b
1200 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
1204 define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
1205 ; SSE2-LABEL: insert_test3_mul_sd:
1207 ; SSE2-NEXT: mulpd %xmm0, %xmm1
1208 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1209 ; SSE2-NEXT: ret{{[l|q]}}
1211 ; SSE41-LABEL: insert_test3_mul_sd:
1213 ; SSE41-NEXT: mulpd %xmm0, %xmm1
1214 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1215 ; SSE41-NEXT: ret{{[l|q]}}
1217 ; AVX-LABEL: insert_test3_mul_sd:
1219 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm1
1220 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1221 ; AVX-NEXT: ret{{[l|q]}}
1222 %1 = fmul <2 x double> %a, %b
1223 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
1227 define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
1228 ; SSE2-LABEL: insert_test3_div_sd:
1230 ; SSE2-NEXT: movapd %xmm0, %xmm2
1231 ; SSE2-NEXT: divpd %xmm1, %xmm2
1232 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1233 ; SSE2-NEXT: ret{{[l|q]}}
1235 ; SSE41-LABEL: insert_test3_div_sd:
1237 ; SSE41-NEXT: movapd %xmm0, %xmm2
1238 ; SSE41-NEXT: divpd %xmm1, %xmm2
1239 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm0[1]
1240 ; SSE41-NEXT: movapd %xmm2, %xmm0
1241 ; SSE41-NEXT: ret{{[l|q]}}
1243 ; AVX-LABEL: insert_test3_div_sd:
1245 ; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm1
1246 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1247 ; AVX-NEXT: ret{{[l|q]}}
1248 %1 = fdiv <2 x double> %a, %b
1249 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
1253 define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
1254 ; SSE2-LABEL: insert_test4_add_ss:
1256 ; SSE2-NEXT: addps %xmm1, %xmm0
1257 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1258 ; SSE2-NEXT: movaps %xmm1, %xmm0
1259 ; SSE2-NEXT: ret{{[l|q]}}
1261 ; SSE41-LABEL: insert_test4_add_ss:
1263 ; SSE41-NEXT: addps %xmm1, %xmm0
1264 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1265 ; SSE41-NEXT: ret{{[l|q]}}
1267 ; AVX-LABEL: insert_test4_add_ss:
1269 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
1270 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1271 ; AVX-NEXT: ret{{[l|q]}}
1272 %1 = fadd <4 x float> %b, %a
1273 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
1277 define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
1278 ; SSE2-LABEL: insert_test4_sub_ss:
1280 ; SSE2-NEXT: movaps %xmm1, %xmm2
1281 ; SSE2-NEXT: subps %xmm0, %xmm2
1282 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1283 ; SSE2-NEXT: movaps %xmm1, %xmm0
1284 ; SSE2-NEXT: ret{{[l|q]}}
1286 ; SSE41-LABEL: insert_test4_sub_ss:
1288 ; SSE41-NEXT: movaps %xmm1, %xmm2
1289 ; SSE41-NEXT: subps %xmm0, %xmm2
1290 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
1291 ; SSE41-NEXT: movaps %xmm2, %xmm0
1292 ; SSE41-NEXT: ret{{[l|q]}}
1294 ; AVX-LABEL: insert_test4_sub_ss:
1296 ; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
1297 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1298 ; AVX-NEXT: ret{{[l|q]}}
1299 %1 = fsub <4 x float> %b, %a
1300 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
1304 define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
1305 ; SSE2-LABEL: insert_test4_mul_ss:
1307 ; SSE2-NEXT: mulps %xmm1, %xmm0
1308 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1309 ; SSE2-NEXT: movaps %xmm1, %xmm0
1310 ; SSE2-NEXT: ret{{[l|q]}}
1312 ; SSE41-LABEL: insert_test4_mul_ss:
1314 ; SSE41-NEXT: mulps %xmm1, %xmm0
1315 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1316 ; SSE41-NEXT: ret{{[l|q]}}
1318 ; AVX-LABEL: insert_test4_mul_ss:
1320 ; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
1321 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1322 ; AVX-NEXT: ret{{[l|q]}}
1323 %1 = fmul <4 x float> %b, %a
1324 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
1328 define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
1329 ; SSE2-LABEL: insert_test4_div_ss:
1331 ; SSE2-NEXT: movaps %xmm1, %xmm2
1332 ; SSE2-NEXT: divps %xmm0, %xmm2
1333 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1334 ; SSE2-NEXT: movaps %xmm1, %xmm0
1335 ; SSE2-NEXT: ret{{[l|q]}}
1337 ; SSE41-LABEL: insert_test4_div_ss:
1339 ; SSE41-NEXT: movaps %xmm1, %xmm2
1340 ; SSE41-NEXT: divps %xmm0, %xmm2
1341 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
1342 ; SSE41-NEXT: movaps %xmm2, %xmm0
1343 ; SSE41-NEXT: ret{{[l|q]}}
1345 ; AVX-LABEL: insert_test4_div_ss:
1347 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
1348 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1349 ; AVX-NEXT: ret{{[l|q]}}
1350 %1 = fdiv <4 x float> %b, %a
1351 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
1355 define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
1356 ; SSE2-LABEL: insert_test4_add_sd:
1358 ; SSE2-NEXT: addpd %xmm1, %xmm0
1359 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1360 ; SSE2-NEXT: movapd %xmm1, %xmm0
1361 ; SSE2-NEXT: ret{{[l|q]}}
1363 ; SSE41-LABEL: insert_test4_add_sd:
1365 ; SSE41-NEXT: addpd %xmm1, %xmm0
1366 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1367 ; SSE41-NEXT: ret{{[l|q]}}
1369 ; AVX-LABEL: insert_test4_add_sd:
1371 ; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0
1372 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1373 ; AVX-NEXT: ret{{[l|q]}}
1374 %1 = fadd <2 x double> %b, %a
1375 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
1379 define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
1380 ; SSE2-LABEL: insert_test4_sub_sd:
1382 ; SSE2-NEXT: movapd %xmm1, %xmm2
1383 ; SSE2-NEXT: subpd %xmm0, %xmm2
1384 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1385 ; SSE2-NEXT: movapd %xmm1, %xmm0
1386 ; SSE2-NEXT: ret{{[l|q]}}
1388 ; SSE41-LABEL: insert_test4_sub_sd:
1390 ; SSE41-NEXT: movapd %xmm1, %xmm2
1391 ; SSE41-NEXT: subpd %xmm0, %xmm2
1392 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1393 ; SSE41-NEXT: movapd %xmm2, %xmm0
1394 ; SSE41-NEXT: ret{{[l|q]}}
1396 ; AVX-LABEL: insert_test4_sub_sd:
1398 ; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
1399 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1400 ; AVX-NEXT: ret{{[l|q]}}
1401 %1 = fsub <2 x double> %b, %a
1402 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
1406 define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
1407 ; SSE2-LABEL: insert_test4_mul_sd:
1409 ; SSE2-NEXT: mulpd %xmm1, %xmm0
1410 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1411 ; SSE2-NEXT: movapd %xmm1, %xmm0
1412 ; SSE2-NEXT: ret{{[l|q]}}
1414 ; SSE41-LABEL: insert_test4_mul_sd:
1416 ; SSE41-NEXT: mulpd %xmm1, %xmm0
1417 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1418 ; SSE41-NEXT: ret{{[l|q]}}
1420 ; AVX-LABEL: insert_test4_mul_sd:
1422 ; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
1423 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1424 ; AVX-NEXT: ret{{[l|q]}}
1425 %1 = fmul <2 x double> %b, %a
1426 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
1430 define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
1431 ; SSE2-LABEL: insert_test4_div_sd:
1433 ; SSE2-NEXT: movapd %xmm1, %xmm2
1434 ; SSE2-NEXT: divpd %xmm0, %xmm2
1435 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1436 ; SSE2-NEXT: movapd %xmm1, %xmm0
1437 ; SSE2-NEXT: ret{{[l|q]}}
1439 ; SSE41-LABEL: insert_test4_div_sd:
1441 ; SSE41-NEXT: movapd %xmm1, %xmm2
1442 ; SSE41-NEXT: divpd %xmm0, %xmm2
1443 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1444 ; SSE41-NEXT: movapd %xmm2, %xmm0
1445 ; SSE41-NEXT: ret{{[l|q]}}
1447 ; AVX-LABEL: insert_test4_div_sd:
1449 ; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
1450 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1451 ; AVX-NEXT: ret{{[l|q]}}
1452 %1 = fdiv <2 x double> %b, %a
1453 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
1457 define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
1458 ; SSE2-LABEL: insert_test5_add_ss:
1460 ; SSE2-NEXT: addps %xmm0, %xmm1
1461 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1462 ; SSE2-NEXT: ret{{[l|q]}}
1464 ; SSE41-LABEL: insert_test5_add_ss:
1466 ; SSE41-NEXT: addps %xmm0, %xmm1
1467 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1468 ; SSE41-NEXT: ret{{[l|q]}}
1470 ; AVX-LABEL: insert_test5_add_ss:
1472 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
1473 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1474 ; AVX-NEXT: ret{{[l|q]}}
1475 %1 = fadd <4 x float> %b, %a
1476 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1480 define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
1481 ; SSE2-LABEL: insert_test5_sub_ss:
1483 ; SSE2-NEXT: subps %xmm0, %xmm1
1484 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1485 ; SSE2-NEXT: ret{{[l|q]}}
1487 ; SSE41-LABEL: insert_test5_sub_ss:
1489 ; SSE41-NEXT: subps %xmm0, %xmm1
1490 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1491 ; SSE41-NEXT: ret{{[l|q]}}
1493 ; AVX-LABEL: insert_test5_sub_ss:
1495 ; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm1
1496 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1497 ; AVX-NEXT: ret{{[l|q]}}
1498 %1 = fsub <4 x float> %b, %a
1499 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1503 define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
1504 ; SSE2-LABEL: insert_test5_mul_ss:
1506 ; SSE2-NEXT: mulps %xmm0, %xmm1
1507 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1508 ; SSE2-NEXT: ret{{[l|q]}}
1510 ; SSE41-LABEL: insert_test5_mul_ss:
1512 ; SSE41-NEXT: mulps %xmm0, %xmm1
1513 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1514 ; SSE41-NEXT: ret{{[l|q]}}
1516 ; AVX-LABEL: insert_test5_mul_ss:
1518 ; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm1
1519 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1520 ; AVX-NEXT: ret{{[l|q]}}
1521 %1 = fmul <4 x float> %b, %a
1522 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1526 define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
1527 ; SSE2-LABEL: insert_test5_div_ss:
1529 ; SSE2-NEXT: divps %xmm0, %xmm1
1530 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1531 ; SSE2-NEXT: ret{{[l|q]}}
1533 ; SSE41-LABEL: insert_test5_div_ss:
1535 ; SSE41-NEXT: divps %xmm0, %xmm1
1536 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1537 ; SSE41-NEXT: ret{{[l|q]}}
1539 ; AVX-LABEL: insert_test5_div_ss:
1541 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm1
1542 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1543 ; AVX-NEXT: ret{{[l|q]}}
1544 %1 = fdiv <4 x float> %b, %a
1545 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1549 define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
1550 ; SSE2-LABEL: insert_test5_add_sd:
1552 ; SSE2-NEXT: addpd %xmm0, %xmm1
1553 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1554 ; SSE2-NEXT: ret{{[l|q]}}
1556 ; SSE41-LABEL: insert_test5_add_sd:
1558 ; SSE41-NEXT: addpd %xmm0, %xmm1
1559 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1560 ; SSE41-NEXT: ret{{[l|q]}}
1562 ; AVX-LABEL: insert_test5_add_sd:
1564 ; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm1
1565 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1566 ; AVX-NEXT: ret{{[l|q]}}
1567 %1 = fadd <2 x double> %b, %a
1568 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
1572 define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
1573 ; SSE2-LABEL: insert_test5_sub_sd:
1575 ; SSE2-NEXT: subpd %xmm0, %xmm1
1576 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1577 ; SSE2-NEXT: ret{{[l|q]}}
1579 ; SSE41-LABEL: insert_test5_sub_sd:
1581 ; SSE41-NEXT: subpd %xmm0, %xmm1
1582 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1583 ; SSE41-NEXT: ret{{[l|q]}}
1585 ; AVX-LABEL: insert_test5_sub_sd:
1587 ; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm1
1588 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1589 ; AVX-NEXT: ret{{[l|q]}}
1590 %1 = fsub <2 x double> %b, %a
1591 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
1595 define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
1596 ; SSE2-LABEL: insert_test5_mul_sd:
1598 ; SSE2-NEXT: mulpd %xmm0, %xmm1
1599 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1600 ; SSE2-NEXT: ret{{[l|q]}}
1602 ; SSE41-LABEL: insert_test5_mul_sd:
1604 ; SSE41-NEXT: mulpd %xmm0, %xmm1
1605 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1606 ; SSE41-NEXT: ret{{[l|q]}}
1608 ; AVX-LABEL: insert_test5_mul_sd:
1610 ; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm1
1611 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1612 ; AVX-NEXT: ret{{[l|q]}}
1613 %1 = fmul <2 x double> %b, %a
1614 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
1618 define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
1619 ; SSE2-LABEL: insert_test5_div_sd:
1621 ; SSE2-NEXT: divpd %xmm0, %xmm1
1622 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1623 ; SSE2-NEXT: ret{{[l|q]}}
1625 ; SSE41-LABEL: insert_test5_div_sd:
1627 ; SSE41-NEXT: divpd %xmm0, %xmm1
1628 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1629 ; SSE41-NEXT: ret{{[l|q]}}
1631 ; AVX-LABEL: insert_test5_div_sd:
1633 ; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
1634 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1635 ; AVX-NEXT: ret{{[l|q]}}
1636 %1 = fdiv <2 x double> %b, %a
1637 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
1641 define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1642 ; X86-SSE2-LABEL: add_ss_mask:
1643 ; X86-SSE2: # %bb.0:
1644 ; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp)
1645 ; X86-SSE2-NEXT: jne .LBB70_1
1646 ; X86-SSE2-NEXT: # %bb.2:
1647 ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1648 ; X86-SSE2-NEXT: retl
1649 ; X86-SSE2-NEXT: .LBB70_1:
1650 ; X86-SSE2-NEXT: addss %xmm0, %xmm1
1651 ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1652 ; X86-SSE2-NEXT: retl
1654 ; X86-SSE41-LABEL: add_ss_mask:
1655 ; X86-SSE41: # %bb.0:
1656 ; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp)
1657 ; X86-SSE41-NEXT: jne .LBB70_1
1658 ; X86-SSE41-NEXT: # %bb.2:
1659 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1660 ; X86-SSE41-NEXT: retl
1661 ; X86-SSE41-NEXT: .LBB70_1:
1662 ; X86-SSE41-NEXT: addss %xmm0, %xmm1
1663 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1664 ; X86-SSE41-NEXT: retl
1666 ; X86-AVX1-LABEL: add_ss_mask:
1667 ; X86-AVX1: # %bb.0:
1668 ; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp)
1669 ; X86-AVX1-NEXT: je .LBB70_2
1670 ; X86-AVX1-NEXT: # %bb.1:
1671 ; X86-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
1672 ; X86-AVX1-NEXT: .LBB70_2:
1673 ; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1674 ; X86-AVX1-NEXT: retl
1676 ; X86-AVX512-LABEL: add_ss_mask:
1677 ; X86-AVX512: # %bb.0:
1678 ; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al
1679 ; X86-AVX512-NEXT: kmovw %eax, %k1
1680 ; X86-AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
1681 ; X86-AVX512-NEXT: vmovaps %xmm2, %xmm0
1682 ; X86-AVX512-NEXT: retl
1684 ; X64-SSE2-LABEL: add_ss_mask:
1685 ; X64-SSE2: # %bb.0:
1686 ; X64-SSE2-NEXT: testb $1, %dil
1687 ; X64-SSE2-NEXT: jne .LBB70_1
1688 ; X64-SSE2-NEXT: # %bb.2:
1689 ; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1690 ; X64-SSE2-NEXT: retq
1691 ; X64-SSE2-NEXT: .LBB70_1:
1692 ; X64-SSE2-NEXT: addss %xmm0, %xmm1
1693 ; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1694 ; X64-SSE2-NEXT: retq
1696 ; X64-SSE41-LABEL: add_ss_mask:
1697 ; X64-SSE41: # %bb.0:
1698 ; X64-SSE41-NEXT: testb $1, %dil
1699 ; X64-SSE41-NEXT: jne .LBB70_1
1700 ; X64-SSE41-NEXT: # %bb.2:
1701 ; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1702 ; X64-SSE41-NEXT: retq
1703 ; X64-SSE41-NEXT: .LBB70_1:
1704 ; X64-SSE41-NEXT: addss %xmm0, %xmm1
1705 ; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1706 ; X64-SSE41-NEXT: retq
1708 ; X64-AVX1-LABEL: add_ss_mask:
1709 ; X64-AVX1: # %bb.0:
1710 ; X64-AVX1-NEXT: testb $1, %dil
1711 ; X64-AVX1-NEXT: je .LBB70_2
1712 ; X64-AVX1-NEXT: # %bb.1:
1713 ; X64-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
1714 ; X64-AVX1-NEXT: .LBB70_2:
1715 ; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1716 ; X64-AVX1-NEXT: retq
1718 ; X64-AVX512-LABEL: add_ss_mask:
1719 ; X64-AVX512: # %bb.0:
1720 ; X64-AVX512-NEXT: kmovw %edi, %k1
1721 ; X64-AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
1722 ; X64-AVX512-NEXT: vmovaps %xmm2, %xmm0
1723 ; X64-AVX512-NEXT: retq
1724 %1 = extractelement <4 x float> %a, i64 0
1725 %2 = extractelement <4 x float> %b, i64 0
1726 %3 = fadd float %1, %2
1727 %4 = extractelement <4 x float> %c, i32 0
1728 %5 = bitcast i8 %mask to <8 x i1>
1729 %6 = extractelement <8 x i1> %5, i64 0
1730 %7 = select i1 %6, float %3, float %4
1731 %8 = insertelement <4 x float> %a, float %7, i64 0
1735 define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1736 ; X86-SSE2-LABEL: add_sd_mask:
1737 ; X86-SSE2: # %bb.0:
1738 ; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp)
1739 ; X86-SSE2-NEXT: jne .LBB71_1
1740 ; X86-SSE2-NEXT: # %bb.2:
1741 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1742 ; X86-SSE2-NEXT: retl
1743 ; X86-SSE2-NEXT: .LBB71_1:
1744 ; X86-SSE2-NEXT: addsd %xmm0, %xmm1
1745 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1746 ; X86-SSE2-NEXT: retl
1748 ; X86-SSE41-LABEL: add_sd_mask:
1749 ; X86-SSE41: # %bb.0:
1750 ; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp)
1751 ; X86-SSE41-NEXT: jne .LBB71_1
1752 ; X86-SSE41-NEXT: # %bb.2:
1753 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1754 ; X86-SSE41-NEXT: retl
1755 ; X86-SSE41-NEXT: .LBB71_1:
1756 ; X86-SSE41-NEXT: addsd %xmm0, %xmm1
1757 ; X86-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1758 ; X86-SSE41-NEXT: retl
1760 ; X86-AVX1-LABEL: add_sd_mask:
1761 ; X86-AVX1: # %bb.0:
1762 ; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp)
1763 ; X86-AVX1-NEXT: je .LBB71_2
1764 ; X86-AVX1-NEXT: # %bb.1:
1765 ; X86-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
1766 ; X86-AVX1-NEXT: .LBB71_2:
1767 ; X86-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1768 ; X86-AVX1-NEXT: retl
1770 ; X86-AVX512-LABEL: add_sd_mask:
1771 ; X86-AVX512: # %bb.0:
1772 ; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al
1773 ; X86-AVX512-NEXT: kmovw %eax, %k1
1774 ; X86-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
1775 ; X86-AVX512-NEXT: vmovapd %xmm2, %xmm0
1776 ; X86-AVX512-NEXT: retl
1778 ; X64-SSE2-LABEL: add_sd_mask:
1779 ; X64-SSE2: # %bb.0:
1780 ; X64-SSE2-NEXT: testb $1, %dil
1781 ; X64-SSE2-NEXT: jne .LBB71_1
1782 ; X64-SSE2-NEXT: # %bb.2:
1783 ; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1784 ; X64-SSE2-NEXT: retq
1785 ; X64-SSE2-NEXT: .LBB71_1:
1786 ; X64-SSE2-NEXT: addsd %xmm0, %xmm1
1787 ; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1788 ; X64-SSE2-NEXT: retq
1790 ; X64-SSE41-LABEL: add_sd_mask:
1791 ; X64-SSE41: # %bb.0:
1792 ; X64-SSE41-NEXT: testb $1, %dil
1793 ; X64-SSE41-NEXT: jne .LBB71_1
1794 ; X64-SSE41-NEXT: # %bb.2:
1795 ; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1796 ; X64-SSE41-NEXT: retq
1797 ; X64-SSE41-NEXT: .LBB71_1:
1798 ; X64-SSE41-NEXT: addsd %xmm0, %xmm1
1799 ; X64-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1800 ; X64-SSE41-NEXT: retq
1802 ; X64-AVX1-LABEL: add_sd_mask:
1803 ; X64-AVX1: # %bb.0:
1804 ; X64-AVX1-NEXT: testb $1, %dil
1805 ; X64-AVX1-NEXT: je .LBB71_2
1806 ; X64-AVX1-NEXT: # %bb.1:
1807 ; X64-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
1808 ; X64-AVX1-NEXT: .LBB71_2:
1809 ; X64-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1810 ; X64-AVX1-NEXT: retq
1812 ; X64-AVX512-LABEL: add_sd_mask:
1813 ; X64-AVX512: # %bb.0:
1814 ; X64-AVX512-NEXT: kmovw %edi, %k1
1815 ; X64-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
1816 ; X64-AVX512-NEXT: vmovapd %xmm2, %xmm0
1817 ; X64-AVX512-NEXT: retq
1818 %1 = extractelement <2 x double> %a, i64 0
1819 %2 = extractelement <2 x double> %b, i64 0
1820 %3 = fadd double %1, %2
1821 %4 = extractelement <2 x double> %c, i32 0
1822 %5 = bitcast i8 %mask to <8 x i1>
1823 %6 = extractelement <8 x i1> %5, i64 0
1824 %7 = select i1 %6, double %3, double %4
1825 %8 = insertelement <2 x double> %a, double %7, i64 0