From: Simon Pilgrim Date: Fri, 5 Oct 2018 14:41:00 +0000 (+0000) Subject: [X86][AVX] getFauxShuffleMask - add support for INSERT_SUBVECTOR subvector shuffles X-Git-Tag: android-x86-9.0-r1~12169 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=d327e07926eecfa668e036dcd290da33ffb68856;p=android-x86%2Fexternal-llvm.git [X86][AVX] getFauxShuffleMask - add support for INSERT_SUBVECTOR subvector shuffles Decode subvector shuffles from INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) This was found necessary while investigating PR39161 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@343853 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 45f87bbaada..91fa216c053 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6380,6 +6380,42 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, Ops.push_back(Op); return true; } + case ISD::INSERT_SUBVECTOR: { + // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where + // SRC0/SRC1 are both of the same valuetype VT. + // TODO - add peekThroughOneUseBitcasts support. + SDValue Src = N.getOperand(0); + SDValue Sub = N.getOperand(1); + EVT SubVT = Sub.getValueType(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + if (!isa(N.getOperand(2)) || + !N->isOnlyUserOf(Sub.getNode())) + return false; + SmallVector SubMask; + SmallVector SubInputs; + if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG)) + return false; + Ops.push_back(Src); + for (SDValue &SubInput : SubInputs) { + if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR || + SubInput.getOperand(0).getValueType() != VT || + !isa(SubInput.getOperand(1))) + return false; + Ops.push_back(SubInput.getOperand(0)); + } + int InsertIdx = N.getConstantOperandVal(2); + for (int i = 0; i != (int)NumElts; ++i) + Mask.push_back(i); + for (int i = 0; i != (int)NumSubElts; ++i) { + int M = SubMask[i]; + if (0 <= M) { + int InputIdx = M / NumSubElts; + int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1); + Mask[i + InsertIdx] = (NumElts * (1 + InputIdx)) + ExtractIdx + M; + } + } + return true; + } case ISD::SCALAR_TO_VECTOR: { // Match against a scalar_to_vector of an extract from a vector, // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 56319dc55cb..d198fe87bed 100644 --- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -3138,8 +3138,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,10,11,6,1,4,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1 @@ -3154,9 +3153,8 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0] -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,10,11,6,1,4,4] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1 ; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z} @@ -3459,10 +3457,9 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[1,0,0,3] -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [8,5,2,3,2,9,10,1] -; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} @@ -3477,10 +3474,9 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,0,0,3] -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [8,5,2,3,2,9,10,1] +; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index def822aac5e..1e12ed9778a 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -1648,8 +1648,8 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4 diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 159335f6325..3e36b4a3b6a 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -32,11 +32,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -50,11 +68,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -68,11 +104,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -86,11 +140,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -104,11 +176,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -122,11 +212,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -140,11 +248,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -158,11 +284,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -176,11 +320,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -194,11 +356,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -212,11 +392,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -230,11 +428,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -248,11 +464,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -266,11 +500,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -286,11 +538,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1118,11 +1388,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1134,11 +1422,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1150,11 +1456,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1166,11 +1490,29 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1182,11 +1524,29 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1200,13 +1560,35 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: movl $15, %eax -; AVX2OR512VL-NEXT: vmovd %eax, %xmm1 -; AVX2OR512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # %bb.0: +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: movl $15, %eax +; AVX512VLBW-NEXT: vmovd %eax, %xmm1 +; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: movl $15, %eax +; AVX512VLVBMI-SLOW-NEXT: vmovd %eax, %xmm1 +; AVX512VLVBMI-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index fc1abdf6b1f..ed281c31d46 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -937,19 +937,11 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; -; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412: -; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] -; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: shuffle_v4i64_0412: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: shuffle_v4i64_0412: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,2] +; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 2abf0ef58fe..74c477300d6 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -28,11 +28,29 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00000010: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_00000010: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_00000010: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000010: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_00000010: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -45,11 +63,29 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00000200: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_00000200: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_00000200: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000200: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_00000200: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -62,11 +98,29 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00003000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_00003000: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_00003000: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_00003000: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_00003000: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -175,11 +229,29 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_00001111: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_00001111: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_00001111: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_00001111: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_00001111: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -292,20 +364,29 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_08991abb: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_08991abb: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_08991abb: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,1,1,3,1,3,u,u> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3] -; AVX512VL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -344,26 +425,26 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_09ab1def: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_09ab1def: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: retq ; -; AVX512VL-SLOW-LABEL: shuffle_v8f32_09ab1def: -; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX512VL-SLOW-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8f32_09ab1def: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,1,1,3,1,3,u,u> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_09ab1def: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: shuffle_v8f32_09ab1def: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -740,11 +821,29 @@ define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_32103210: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_32103210: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_32103210: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_32103210: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_32103210: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1097,11 +1196,29 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00000010: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_00000010: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_00000010: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000010: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_00000010: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1114,11 +1231,29 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00000200: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_00000200: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_00000200: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000200: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_00000200: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1131,11 +1266,29 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00003000: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_00003000: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_00003000: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_00003000: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_00003000: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1249,11 +1402,29 @@ define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_00001111: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_00001111: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_00001111: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_00001111: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_00001111: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1897,11 +2068,29 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_32103210: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_32103210: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_32103210: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_32103210: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_32103210: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll index 07d9107009b..0da2bd82063 100644 --- a/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -541,35 +541,33 @@ define <8 x float> @expand14(<4 x float> %a) { define <8 x float> @expand15(<4 x float> %a) { ; SKX64-LABEL: expand15: ; SKX64: # %bb.0: -; SKX64-NEXT: vmovaps {{.*#+}} xmm1 = [0,2,0,0] -; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX64-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SKX64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,8,3,10,5,6,7] +; SKX64-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; SKX64-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; SKX64-NEXT: retq ; ; KNL64-LABEL: expand15: ; KNL64: # %bb.0: -; KNL64-NEXT: vmovaps {{.*#+}} xmm1 = [0,2,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL64-NEXT: vbroadcastss {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; ; SKX32-LABEL: expand15: ; SKX32: # %bb.0: -; SKX32-NEXT: vmovaps {{.*#+}} xmm1 = [0,2,0,0] -; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX32-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SKX32-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,8,3,10,5,6,7] +; SKX32-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; SKX32-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; SKX32-NEXT: retl ; ; KNL32-LABEL: expand15: ; KNL32: # %bb.0: -; KNL32-NEXT: vmovaps {{.*#+}} xmm1 = [0,2,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL32-NEXT: vbroadcastss {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> , diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index 5283d5be395..19c3deb31e7 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -540,9 +540,8 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512VL-NEXT: kmovw %edi, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 @@ -555,9 +554,8 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3] +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax @@ -677,9 +675,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512VL-NEXT: kmovw %edi, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax @@ -690,9 +686,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7] +; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax