}
}
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // We build up the blend mask while checking whether a blend is a viable way
+ // to reduce the shuffle.
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+ if (BlendMask[Mask[i] % Size] == -1)
+ BlendMask[Mask[i] % Size] = Mask[i];
+ else if (BlendMask[Mask[i] % Size] != Mask[i])
+ return SDValue(); // Can't blend in the needed input!
+
+ PermuteMask[i] = Mask[i] % Size;
+ }
+
+ SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
/// unblended shuffles followed by an unshuffled blend.
///
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4 != Mask[1] < 4))
+ return false;
+ if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4 != Mask[3] < 4))
+ return false;
+
+ return true;
+}
+
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
return V;
+
+ if (!isSingleSHUFPSMask(Mask))
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return BlendPerm;
}
// Otherwise fall back to a SHUFPS lowering strategy.
define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08991abb:
; AVX1: # BB#0:
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_08991abb:
define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08991abb:
; AVX1: # BB#0:
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08991abb:
}
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test16:
-; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_nested_undef_test16:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test16:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
}
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test17:
-; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_nested_undef_test17:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test17:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test17:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test17:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
}
define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test19:
-; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_nested_undef_test19:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test19:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test19:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test19:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,0]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
}
define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test20:
-; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_nested_undef_test20:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test20:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test20:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test20:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
}
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test21:
-; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_nested_undef_test21:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test21:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test21:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test21:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[1,1]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
}
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: combine_test3b:
-; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_test3b:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test3b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test3b:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test3b:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>