1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
13 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
20 define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
21 ; SSE2-LABEL: var_shift_v2i64:
23 ; SSE2-NEXT: movdqa %xmm0, %xmm2
24 ; SSE2-NEXT: psllq %xmm1, %xmm2
25 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
26 ; SSE2-NEXT: psllq %xmm1, %xmm0
27 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
30 ; SSE41-LABEL: var_shift_v2i64:
32 ; SSE41-NEXT: movdqa %xmm0, %xmm2
33 ; SSE41-NEXT: psllq %xmm1, %xmm2
34 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
35 ; SSE41-NEXT: psllq %xmm1, %xmm0
36 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
39 ; AVX1-LABEL: var_shift_v2i64:
41 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
42 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
43 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
44 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
47 ; AVX2-LABEL: var_shift_v2i64:
49 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
52 ; XOPAVX1-LABEL: var_shift_v2i64:
54 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
57 ; XOPAVX2-LABEL: var_shift_v2i64:
59 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
62 ; AVX512-LABEL: var_shift_v2i64:
64 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
67 ; AVX512VL-LABEL: var_shift_v2i64:
69 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
72 ; X32-SSE-LABEL: var_shift_v2i64:
74 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
75 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
76 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
77 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
78 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
80 %shift = shl <2 x i64> %a, %b
84 define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
85 ; SSE2-LABEL: var_shift_v4i32:
87 ; SSE2-NEXT: pslld $23, %xmm1
88 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
89 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
90 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
91 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
92 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
93 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
94 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
95 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
96 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
99 ; SSE41-LABEL: var_shift_v4i32:
101 ; SSE41-NEXT: pslld $23, %xmm1
102 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
103 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
104 ; SSE41-NEXT: pmulld %xmm1, %xmm0
107 ; AVX1-LABEL: var_shift_v4i32:
109 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
110 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
111 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
112 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
115 ; AVX2-LABEL: var_shift_v4i32:
117 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
120 ; XOPAVX1-LABEL: var_shift_v4i32:
122 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
125 ; XOPAVX2-LABEL: var_shift_v4i32:
127 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
130 ; AVX512-LABEL: var_shift_v4i32:
132 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
135 ; AVX512VL-LABEL: var_shift_v4i32:
137 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
138 ; AVX512VL-NEXT: retq
140 ; X32-SSE-LABEL: var_shift_v4i32:
142 ; X32-SSE-NEXT: pslld $23, %xmm1
143 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
144 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
145 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
146 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
147 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
148 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
149 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
150 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
151 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
153 %shift = shl <4 x i32> %a, %b
157 define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
158 ; SSE2-LABEL: var_shift_v8i16:
160 ; SSE2-NEXT: pxor %xmm2, %xmm2
161 ; SSE2-NEXT: movdqa %xmm1, %xmm3
162 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
163 ; SSE2-NEXT: pslld $23, %xmm3
164 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
165 ; SSE2-NEXT: paddd %xmm4, %xmm3
166 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
167 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
168 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
169 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
170 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
171 ; SSE2-NEXT: pslld $23, %xmm1
172 ; SSE2-NEXT: paddd %xmm4, %xmm1
173 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
174 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
175 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
176 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
177 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
178 ; SSE2-NEXT: pmullw %xmm1, %xmm0
181 ; SSE41-LABEL: var_shift_v8i16:
183 ; SSE41-NEXT: pxor %xmm2, %xmm2
184 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
185 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
186 ; SSE41-NEXT: pslld $23, %xmm1
187 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
188 ; SSE41-NEXT: paddd %xmm2, %xmm1
189 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
190 ; SSE41-NEXT: pslld $23, %xmm3
191 ; SSE41-NEXT: paddd %xmm2, %xmm3
192 ; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
193 ; SSE41-NEXT: packusdw %xmm1, %xmm2
194 ; SSE41-NEXT: pmullw %xmm2, %xmm0
197 ; AVX1-LABEL: var_shift_v8i16:
199 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
200 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
201 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
202 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
203 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
204 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
205 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
206 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
207 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
208 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
209 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
210 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
213 ; AVX2-LABEL: var_shift_v8i16:
215 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
216 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
217 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
218 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
219 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
220 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
221 ; AVX2-NEXT: vzeroupper
224 ; XOP-LABEL: var_shift_v8i16:
226 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
229 ; AVX512DQ-LABEL: var_shift_v8i16:
231 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
232 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
233 ; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
234 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
235 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
236 ; AVX512DQ-NEXT: vzeroupper
237 ; AVX512DQ-NEXT: retq
239 ; AVX512BW-LABEL: var_shift_v8i16:
241 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
242 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
243 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
244 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
245 ; AVX512BW-NEXT: vzeroupper
246 ; AVX512BW-NEXT: retq
248 ; AVX512DQVL-LABEL: var_shift_v8i16:
249 ; AVX512DQVL: # %bb.0:
250 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
251 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
252 ; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
253 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
254 ; AVX512DQVL-NEXT: vzeroupper
255 ; AVX512DQVL-NEXT: retq
257 ; AVX512BWVL-LABEL: var_shift_v8i16:
258 ; AVX512BWVL: # %bb.0:
259 ; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
260 ; AVX512BWVL-NEXT: retq
262 ; X32-SSE-LABEL: var_shift_v8i16:
264 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
265 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
266 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
267 ; X32-SSE-NEXT: pslld $23, %xmm3
268 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
269 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
270 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
271 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
272 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
273 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
274 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
275 ; X32-SSE-NEXT: pslld $23, %xmm1
276 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
277 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
278 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
279 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
280 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
281 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
282 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
284 %shift = shl <8 x i16> %a, %b
288 define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
289 ; SSE2-LABEL: var_shift_v16i8:
291 ; SSE2-NEXT: psllw $5, %xmm1
292 ; SSE2-NEXT: pxor %xmm2, %xmm2
293 ; SSE2-NEXT: pxor %xmm3, %xmm3
294 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
295 ; SSE2-NEXT: movdqa %xmm3, %xmm4
296 ; SSE2-NEXT: pandn %xmm0, %xmm4
297 ; SSE2-NEXT: psllw $4, %xmm0
298 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
299 ; SSE2-NEXT: pand %xmm3, %xmm0
300 ; SSE2-NEXT: por %xmm4, %xmm0
301 ; SSE2-NEXT: paddb %xmm1, %xmm1
302 ; SSE2-NEXT: pxor %xmm3, %xmm3
303 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
304 ; SSE2-NEXT: movdqa %xmm3, %xmm4
305 ; SSE2-NEXT: pandn %xmm0, %xmm4
306 ; SSE2-NEXT: psllw $2, %xmm0
307 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
308 ; SSE2-NEXT: pand %xmm3, %xmm0
309 ; SSE2-NEXT: por %xmm4, %xmm0
310 ; SSE2-NEXT: paddb %xmm1, %xmm1
311 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
312 ; SSE2-NEXT: movdqa %xmm2, %xmm1
313 ; SSE2-NEXT: pandn %xmm0, %xmm1
314 ; SSE2-NEXT: paddb %xmm0, %xmm0
315 ; SSE2-NEXT: pand %xmm2, %xmm0
316 ; SSE2-NEXT: por %xmm1, %xmm0
319 ; SSE41-LABEL: var_shift_v16i8:
321 ; SSE41-NEXT: movdqa %xmm0, %xmm2
322 ; SSE41-NEXT: psllw $5, %xmm1
323 ; SSE41-NEXT: movdqa %xmm0, %xmm3
324 ; SSE41-NEXT: psllw $4, %xmm3
325 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
326 ; SSE41-NEXT: movdqa %xmm1, %xmm0
327 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
328 ; SSE41-NEXT: movdqa %xmm2, %xmm3
329 ; SSE41-NEXT: psllw $2, %xmm3
330 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
331 ; SSE41-NEXT: paddb %xmm1, %xmm1
332 ; SSE41-NEXT: movdqa %xmm1, %xmm0
333 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
334 ; SSE41-NEXT: movdqa %xmm2, %xmm3
335 ; SSE41-NEXT: paddb %xmm2, %xmm3
336 ; SSE41-NEXT: paddb %xmm1, %xmm1
337 ; SSE41-NEXT: movdqa %xmm1, %xmm0
338 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
339 ; SSE41-NEXT: movdqa %xmm2, %xmm0
342 ; AVX-LABEL: var_shift_v16i8:
344 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
345 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
346 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
347 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
348 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
349 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
350 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
351 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
352 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
353 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
354 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
357 ; XOP-LABEL: var_shift_v16i8:
359 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
362 ; AVX512DQ-LABEL: var_shift_v16i8:
364 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
365 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
366 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
367 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
368 ; AVX512DQ-NEXT: vzeroupper
369 ; AVX512DQ-NEXT: retq
371 ; AVX512BW-LABEL: var_shift_v16i8:
373 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
374 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
375 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
376 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
377 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
378 ; AVX512BW-NEXT: vzeroupper
379 ; AVX512BW-NEXT: retq
381 ; AVX512DQVL-LABEL: var_shift_v16i8:
382 ; AVX512DQVL: # %bb.0:
383 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
384 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
385 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
386 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
387 ; AVX512DQVL-NEXT: vzeroupper
388 ; AVX512DQVL-NEXT: retq
390 ; AVX512BWVL-LABEL: var_shift_v16i8:
391 ; AVX512BWVL: # %bb.0:
392 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
393 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
394 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
395 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
396 ; AVX512BWVL-NEXT: vzeroupper
397 ; AVX512BWVL-NEXT: retq
399 ; X32-SSE-LABEL: var_shift_v16i8:
401 ; X32-SSE-NEXT: psllw $5, %xmm1
402 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
403 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
404 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
405 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
406 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
407 ; X32-SSE-NEXT: psllw $4, %xmm0
408 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
409 ; X32-SSE-NEXT: pand %xmm3, %xmm0
410 ; X32-SSE-NEXT: por %xmm4, %xmm0
411 ; X32-SSE-NEXT: paddb %xmm1, %xmm1
412 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
413 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
414 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
415 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
416 ; X32-SSE-NEXT: psllw $2, %xmm0
417 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
418 ; X32-SSE-NEXT: pand %xmm3, %xmm0
419 ; X32-SSE-NEXT: por %xmm4, %xmm0
420 ; X32-SSE-NEXT: paddb %xmm1, %xmm1
421 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
422 ; X32-SSE-NEXT: movdqa %xmm2, %xmm1
423 ; X32-SSE-NEXT: pandn %xmm0, %xmm1
424 ; X32-SSE-NEXT: paddb %xmm0, %xmm0
425 ; X32-SSE-NEXT: pand %xmm2, %xmm0
426 ; X32-SSE-NEXT: por %xmm1, %xmm0
428 %shift = shl <16 x i8> %a, %b
433 ; Uniform Variable Shifts
436 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
437 ; SSE-LABEL: splatvar_shift_v2i64:
439 ; SSE-NEXT: psllq %xmm1, %xmm0
442 ; AVX-LABEL: splatvar_shift_v2i64:
444 ; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
447 ; XOP-LABEL: splatvar_shift_v2i64:
449 ; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0
452 ; AVX512-LABEL: splatvar_shift_v2i64:
454 ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
457 ; AVX512VL-LABEL: splatvar_shift_v2i64:
459 ; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
460 ; AVX512VL-NEXT: retq
462 ; X32-SSE-LABEL: splatvar_shift_v2i64:
464 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
466 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
467 %shift = shl <2 x i64> %a, %splat
471 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
472 ; SSE2-LABEL: splatvar_shift_v4i32:
474 ; SSE2-NEXT: xorps %xmm2, %xmm2
475 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
476 ; SSE2-NEXT: pslld %xmm2, %xmm0
479 ; SSE41-LABEL: splatvar_shift_v4i32:
481 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
482 ; SSE41-NEXT: pslld %xmm1, %xmm0
485 ; AVX-LABEL: splatvar_shift_v4i32:
487 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
488 ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
491 ; XOP-LABEL: splatvar_shift_v4i32:
493 ; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
494 ; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
497 ; AVX512-LABEL: splatvar_shift_v4i32:
499 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
500 ; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
503 ; AVX512VL-LABEL: splatvar_shift_v4i32:
505 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
506 ; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
507 ; AVX512VL-NEXT: retq
509 ; X32-SSE-LABEL: splatvar_shift_v4i32:
511 ; X32-SSE-NEXT: xorps %xmm2, %xmm2
512 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
513 ; X32-SSE-NEXT: pslld %xmm2, %xmm0
515 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
516 %shift = shl <4 x i32> %a, %splat
520 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
521 ; SSE2-LABEL: splatvar_shift_v8i16:
523 ; SSE2-NEXT: pextrw $0, %xmm1, %eax
524 ; SSE2-NEXT: movd %eax, %xmm1
525 ; SSE2-NEXT: psllw %xmm1, %xmm0
528 ; SSE41-LABEL: splatvar_shift_v8i16:
530 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
531 ; SSE41-NEXT: psllw %xmm1, %xmm0
534 ; AVX-LABEL: splatvar_shift_v8i16:
536 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
537 ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
540 ; XOP-LABEL: splatvar_shift_v8i16:
542 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
543 ; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
546 ; AVX512-LABEL: splatvar_shift_v8i16:
548 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
549 ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
552 ; AVX512VL-LABEL: splatvar_shift_v8i16:
554 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
555 ; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
556 ; AVX512VL-NEXT: retq
558 ; X32-SSE-LABEL: splatvar_shift_v8i16:
560 ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
561 ; X32-SSE-NEXT: movd %eax, %xmm1
562 ; X32-SSE-NEXT: psllw %xmm1, %xmm0
564 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
565 %shift = shl <8 x i16> %a, %splat
569 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
570 ; SSE2-LABEL: splatvar_shift_v16i8:
572 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
573 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
574 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
575 ; SSE2-NEXT: psllw $5, %xmm2
576 ; SSE2-NEXT: pxor %xmm1, %xmm1
577 ; SSE2-NEXT: pxor %xmm3, %xmm3
578 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
579 ; SSE2-NEXT: movdqa %xmm3, %xmm4
580 ; SSE2-NEXT: pandn %xmm0, %xmm4
581 ; SSE2-NEXT: psllw $4, %xmm0
582 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
583 ; SSE2-NEXT: pand %xmm3, %xmm0
584 ; SSE2-NEXT: por %xmm4, %xmm0
585 ; SSE2-NEXT: paddb %xmm2, %xmm2
586 ; SSE2-NEXT: pxor %xmm3, %xmm3
587 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
588 ; SSE2-NEXT: movdqa %xmm3, %xmm4
589 ; SSE2-NEXT: pandn %xmm0, %xmm4
590 ; SSE2-NEXT: psllw $2, %xmm0
591 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
592 ; SSE2-NEXT: pand %xmm3, %xmm0
593 ; SSE2-NEXT: por %xmm4, %xmm0
594 ; SSE2-NEXT: paddb %xmm2, %xmm2
595 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
596 ; SSE2-NEXT: movdqa %xmm1, %xmm2
597 ; SSE2-NEXT: pandn %xmm0, %xmm2
598 ; SSE2-NEXT: paddb %xmm0, %xmm0
599 ; SSE2-NEXT: pand %xmm1, %xmm0
600 ; SSE2-NEXT: por %xmm2, %xmm0
603 ; SSE41-LABEL: splatvar_shift_v16i8:
605 ; SSE41-NEXT: movdqa %xmm0, %xmm2
606 ; SSE41-NEXT: pxor %xmm0, %xmm0
607 ; SSE41-NEXT: pshufb %xmm0, %xmm1
608 ; SSE41-NEXT: psllw $5, %xmm1
609 ; SSE41-NEXT: movdqa %xmm1, %xmm3
610 ; SSE41-NEXT: paddb %xmm1, %xmm3
611 ; SSE41-NEXT: movdqa %xmm2, %xmm4
612 ; SSE41-NEXT: psllw $4, %xmm4
613 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
614 ; SSE41-NEXT: movdqa %xmm1, %xmm0
615 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2
616 ; SSE41-NEXT: movdqa %xmm2, %xmm1
617 ; SSE41-NEXT: psllw $2, %xmm1
618 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
619 ; SSE41-NEXT: movdqa %xmm3, %xmm0
620 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
621 ; SSE41-NEXT: movdqa %xmm2, %xmm1
622 ; SSE41-NEXT: paddb %xmm2, %xmm1
623 ; SSE41-NEXT: paddb %xmm3, %xmm3
624 ; SSE41-NEXT: movdqa %xmm3, %xmm0
625 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
626 ; SSE41-NEXT: movdqa %xmm2, %xmm0
629 ; AVX1-LABEL: splatvar_shift_v16i8:
631 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
632 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
633 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
634 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
635 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
636 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
637 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
638 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
639 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
640 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
641 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
642 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
643 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
646 ; AVX2-LABEL: splatvar_shift_v16i8:
648 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
649 ; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
650 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
651 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
652 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
653 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
654 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
655 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
656 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
657 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
658 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
659 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
662 ; XOPAVX1-LABEL: splatvar_shift_v16i8:
664 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
665 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
666 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
669 ; XOPAVX2-LABEL: splatvar_shift_v16i8:
671 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
672 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
675 ; AVX512DQ-LABEL: splatvar_shift_v16i8:
677 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
678 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
679 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
680 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
681 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
682 ; AVX512DQ-NEXT: vzeroupper
683 ; AVX512DQ-NEXT: retq
685 ; AVX512BW-LABEL: splatvar_shift_v16i8:
687 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
688 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
689 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
690 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
691 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
692 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
693 ; AVX512BW-NEXT: vzeroupper
694 ; AVX512BW-NEXT: retq
696 ; AVX512DQVL-LABEL: splatvar_shift_v16i8:
697 ; AVX512DQVL: # %bb.0:
698 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
699 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
700 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
701 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
702 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
703 ; AVX512DQVL-NEXT: vzeroupper
704 ; AVX512DQVL-NEXT: retq
706 ; AVX512BWVL-LABEL: splatvar_shift_v16i8:
707 ; AVX512BWVL: # %bb.0:
708 ; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
709 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
710 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
711 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
712 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
713 ; AVX512BWVL-NEXT: vzeroupper
714 ; AVX512BWVL-NEXT: retq
716 ; X32-SSE-LABEL: splatvar_shift_v16i8:
718 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
719 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
720 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
721 ; X32-SSE-NEXT: psllw $5, %xmm2
722 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
723 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
724 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
725 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
726 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
727 ; X32-SSE-NEXT: psllw $4, %xmm0
728 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
729 ; X32-SSE-NEXT: pand %xmm3, %xmm0
730 ; X32-SSE-NEXT: por %xmm4, %xmm0
731 ; X32-SSE-NEXT: paddb %xmm2, %xmm2
732 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
733 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
734 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
735 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
736 ; X32-SSE-NEXT: psllw $2, %xmm0
737 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
738 ; X32-SSE-NEXT: pand %xmm3, %xmm0
739 ; X32-SSE-NEXT: por %xmm4, %xmm0
740 ; X32-SSE-NEXT: paddb %xmm2, %xmm2
741 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
742 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
743 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
744 ; X32-SSE-NEXT: paddb %xmm0, %xmm0
745 ; X32-SSE-NEXT: pand %xmm1, %xmm0
746 ; X32-SSE-NEXT: por %xmm2, %xmm0
748 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
749 %shift = shl <16 x i8> %a, %splat
; shl <2 x i64> by distinct constants <1, 7>. Pre-AVX2 targets have no variable
; 64-bit shift, so SSE/X32 shift both amounts and blend lanes (movsd/pblendw);
; AVX2/AVX512 use vpsllvq and XOP uses vpshlq directly from a constant pool.
757 define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
758 ; SSE2-LABEL: constant_shift_v2i64:
760 ; SSE2-NEXT: movdqa %xmm0, %xmm1
761 ; SSE2-NEXT: psllq $1, %xmm1
762 ; SSE2-NEXT: psllq $7, %xmm0
763 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
766 ; SSE41-LABEL: constant_shift_v2i64:
768 ; SSE41-NEXT: movdqa %xmm0, %xmm1
769 ; SSE41-NEXT: psllq $7, %xmm1
770 ; SSE41-NEXT: psllq $1, %xmm0
771 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
774 ; AVX1-LABEL: constant_shift_v2i64:
776 ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
777 ; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
778 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
781 ; AVX2-LABEL: constant_shift_v2i64:
783 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
786 ; XOPAVX1-LABEL: constant_shift_v2i64:
788 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
791 ; XOPAVX2-LABEL: constant_shift_v2i64:
793 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
796 ; AVX512-LABEL: constant_shift_v2i64:
798 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
801 ; AVX512VL-LABEL: constant_shift_v2i64:
803 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
804 ; AVX512VL-NEXT: retq
806 ; X32-SSE-LABEL: constant_shift_v2i64:
808 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
809 ; X32-SSE-NEXT: psllq $1, %xmm1
810 ; X32-SSE-NEXT: psllq $7, %xmm0
811 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
813 %shift = shl <2 x i64> %a, <i64 1, i64 7>
; shl <4 x i32> by constants <4,5,6,7>. A constant left shift is a multiply by a
; power of two, so SSE2 lowers to a pmuludq pair with shuffles, SSE4.1/AVX1 to a
; single pmulld, and AVX2/AVX512 to the native variable shift vpsllvd.
817 define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
818 ; SSE2-LABEL: constant_shift_v4i32:
820 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
821 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
822 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
823 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
824 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
825 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
826 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
827 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
830 ; SSE41-LABEL: constant_shift_v4i32:
832 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
835 ; AVX1-LABEL: constant_shift_v4i32:
837 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
840 ; AVX2-LABEL: constant_shift_v4i32:
842 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
845 ; XOPAVX1-LABEL: constant_shift_v4i32:
847 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
850 ; XOPAVX2-LABEL: constant_shift_v4i32:
852 ; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
855 ; AVX512-LABEL: constant_shift_v4i32:
857 ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
860 ; AVX512VL-LABEL: constant_shift_v4i32:
862 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
863 ; AVX512VL-NEXT: retq
865 ; X32-SSE-LABEL: constant_shift_v4i32:
867 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
868 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
869 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
870 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
871 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
872 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
873 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
874 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
876 %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
; shl <8 x i16> by constants <0..7>, lowered as a single pmullw/vpmullw against a
; constant-pool vector of powers of two; AVX512BW targets instead use the native
; 16-bit variable shift vpsllvw, and XOP uses vpshlw.
880 define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
881 ; SSE-LABEL: constant_shift_v8i16:
883 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
886 ; AVX-LABEL: constant_shift_v8i16:
888 ; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
891 ; XOP-LABEL: constant_shift_v8i16:
893 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
896 ; AVX512DQ-LABEL: constant_shift_v8i16:
898 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
899 ; AVX512DQ-NEXT: retq
901 ; AVX512BW-LABEL: constant_shift_v8i16:
903 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
904 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
905 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
906 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
907 ; AVX512BW-NEXT: vzeroupper
908 ; AVX512BW-NEXT: retq
910 ; AVX512DQVL-LABEL: constant_shift_v8i16:
911 ; AVX512DQVL: # %bb.0:
912 ; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
913 ; AVX512DQVL-NEXT: retq
915 ; AVX512BWVL-LABEL: constant_shift_v8i16:
916 ; AVX512BWVL: # %bb.0:
917 ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
918 ; AVX512BWVL-NEXT: retq
920 ; X32-SSE-LABEL: constant_shift_v8i16:
922 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
924 %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
; shl <16 x i8> by per-lane constants. x86 has no byte shift instruction, so the
; SSE lowerings widen to i16, pmullw against a power-of-two constant vector, mask
; with 255 and repack; AVX512 widens further and uses vpsllvd/vpsllvw.
; NOTE(review): the body of this test was interleaved with stray carriage-return
; artifact lines (CRLF paste residue); they carried no content and were removed.
928 define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
929 ; SSE2-LABEL: constant_shift_v16i8:
931 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
932 ; SSE2-NEXT: movdqa %xmm1, %xmm2
933 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
934 ; SSE2-NEXT: movdqa %xmm0, %xmm3
935 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
936 ; SSE2-NEXT: pmullw %xmm2, %xmm3
937 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
938 ; SSE2-NEXT: pand %xmm2, %xmm3
939 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
940 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
941 ; SSE2-NEXT: pmullw %xmm1, %xmm0
942 ; SSE2-NEXT: pand %xmm2, %xmm0
943 ; SSE2-NEXT: packuswb %xmm3, %xmm0
946 ; SSE41-LABEL: constant_shift_v16i8:
948 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
949 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
950 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
951 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
952 ; SSE41-NEXT: pmullw %xmm2, %xmm0
953 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
954 ; SSE41-NEXT: pand %xmm2, %xmm0
955 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
956 ; SSE41-NEXT: pand %xmm2, %xmm1
957 ; SSE41-NEXT: packuswb %xmm0, %xmm1
958 ; SSE41-NEXT: movdqa %xmm1, %xmm0
961 ; AVX1-LABEL: constant_shift_v16i8:
963 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
964 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
965 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
966 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
967 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
968 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
969 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
970 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
971 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
972 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
975 ; AVX2-LABEL: constant_shift_v16i8:
977 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
978 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
979 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
980 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
981 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
982 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
983 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
984 ; AVX2-NEXT: vzeroupper
987 ; XOP-LABEL: constant_shift_v16i8:
989 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
992 ; AVX512DQ-LABEL: constant_shift_v16i8:
994 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
995 ; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
996 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
997 ; AVX512DQ-NEXT: vzeroupper
998 ; AVX512DQ-NEXT: retq
1000 ; AVX512BW-LABEL: constant_shift_v16i8:
1001 ; AVX512BW: # %bb.0:
1002 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1003 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1004 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1005 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1006 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1007 ; AVX512BW-NEXT: vzeroupper
1008 ; AVX512BW-NEXT: retq
1010 ; AVX512DQVL-LABEL: constant_shift_v16i8:
1011 ; AVX512DQVL: # %bb.0:
1012 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1013 ; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1014 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1015 ; AVX512DQVL-NEXT: vzeroupper
1016 ; AVX512DQVL-NEXT: retq
1018 ; AVX512BWVL-LABEL: constant_shift_v16i8:
1019 ; AVX512BWVL: # %bb.0:
1020 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1021 ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1022 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1023 ; AVX512BWVL-NEXT: vzeroupper
1024 ; AVX512BWVL-NEXT: retq
1026 ; X32-SSE-LABEL: constant_shift_v16i8:
1027 ; X32-SSE: # %bb.0:
1028 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
1029 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
1030 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1031 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
1032 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1033 ; X32-SSE-NEXT: pmullw %xmm2, %xmm3
1034 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1035 ; X32-SSE-NEXT: pand %xmm2, %xmm3
1036 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1037 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1038 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
1039 ; X32-SSE-NEXT: pand %xmm2, %xmm0
1040 ; X32-SSE-NEXT: packuswb %xmm3, %xmm0
1041 ; X32-SSE-NEXT: retl
1042 %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1043 ret <16 x i8> %shift
1047 ; Uniform Constant Shifts
; shl <2 x i64> by a uniform constant 7 — all targets can use a single
; immediate-form psllq/vpsllq.
1050 define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
1051 ; SSE-LABEL: splatconstant_shift_v2i64:
1053 ; SSE-NEXT: psllq $7, %xmm0
1056 ; AVX-LABEL: splatconstant_shift_v2i64:
1058 ; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
1061 ; XOP-LABEL: splatconstant_shift_v2i64:
1063 ; XOP-NEXT: vpsllq $7, %xmm0, %xmm0
1066 ; AVX512-LABEL: splatconstant_shift_v2i64:
1068 ; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0
1071 ; AVX512VL-LABEL: splatconstant_shift_v2i64:
1072 ; AVX512VL: # %bb.0:
1073 ; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
1074 ; AVX512VL-NEXT: retq
1076 ; X32-SSE-LABEL: splatconstant_shift_v2i64:
1078 ; X32-SSE-NEXT: psllq $7, %xmm0
1079 ; X32-SSE-NEXT: retl
1080 %shift = shl <2 x i64> %a, <i64 7, i64 7>
1081 ret <2 x i64> %shift
; shl <4 x i32> by a uniform constant 5 — lowered to the immediate-form
; pslld/vpslld on every target.
1084 define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
1085 ; SSE-LABEL: splatconstant_shift_v4i32:
1087 ; SSE-NEXT: pslld $5, %xmm0
1090 ; AVX-LABEL: splatconstant_shift_v4i32:
1092 ; AVX-NEXT: vpslld $5, %xmm0, %xmm0
1095 ; XOP-LABEL: splatconstant_shift_v4i32:
1097 ; XOP-NEXT: vpslld $5, %xmm0, %xmm0
1100 ; AVX512-LABEL: splatconstant_shift_v4i32:
1102 ; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
1105 ; AVX512VL-LABEL: splatconstant_shift_v4i32:
1106 ; AVX512VL: # %bb.0:
1107 ; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
1108 ; AVX512VL-NEXT: retq
1110 ; X32-SSE-LABEL: splatconstant_shift_v4i32:
1112 ; X32-SSE-NEXT: pslld $5, %xmm0
1113 ; X32-SSE-NEXT: retl
1114 %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
1115 ret <4 x i32> %shift
; shl <8 x i16> by a uniform constant 3 — lowered to the immediate-form
; psllw/vpsllw on every target.
1118 define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
1119 ; SSE-LABEL: splatconstant_shift_v8i16:
1121 ; SSE-NEXT: psllw $3, %xmm0
1124 ; AVX-LABEL: splatconstant_shift_v8i16:
1126 ; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
1129 ; XOP-LABEL: splatconstant_shift_v8i16:
1131 ; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
1134 ; AVX512-LABEL: splatconstant_shift_v8i16:
1136 ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
1139 ; AVX512VL-LABEL: splatconstant_shift_v8i16:
1140 ; AVX512VL: # %bb.0:
1141 ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
1142 ; AVX512VL-NEXT: retq
1144 ; X32-SSE-LABEL: splatconstant_shift_v8i16:
1146 ; X32-SSE-NEXT: psllw $3, %xmm0
1147 ; X32-SSE-NEXT: retl
1148 %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1149 ret <8 x i16> %shift
1152 define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
1153 ; SSE-LABEL: splatconstant_shift_v16i8:
1155 ; SSE-NEXT: psllw $3, %xmm0
1156 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1159 ; AVX-LABEL: splatconstant_shift_v16i8:
1161 ; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
1162 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1165 ; XOP-LABEL: splatconstant_shift_v16i8:
1167 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
1170 ; AVX512-LABEL: splatconstant_shift_v16i8:
1172 ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
1173 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1176 ; AVX512VL-LABEL: splatconstant_shift_v16i8:
1177 ; AVX512VL: # %bb.0:
1178 ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
1179 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1180 ; AVX512VL-NEXT: retq
1182 ; X32-SSE-LABEL: splatconstant_shift_v16i8:
1184 ; X32-SSE-NEXT: psllw $3, %xmm0
1185 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1186 ; X32-SSE-NEXT: retl
1187 %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1188 ret <16 x i8> %shift