/// It makes no assumptions about whether this is the *best* lowering; it simply
/// uses it.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
- ArrayRef<int> OriginalMask, SDValue V1,
+ ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
- SmallVector<int, 4> Mask(OriginalMask.begin(), OriginalMask.end());
- SmallVector<int, 4> NewMask = Mask;
-
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
} else if (NumV2Elements == 2) {
- // If we are likely to fold V1 but not V2, then commute the shuffle.
- if (MayFoldLoad(V1) && !MayFoldLoad(V2)) {
- ShuffleVectorSDNode::commuteMask(Mask);
- NewMask = Mask;
- std::swap(V1, V2);
- std::swap(LowV, HighV);
- }
-
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
}
}
+ // Attempt to commute shufps LHS loads:
+ // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
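+ // SHUFPS can only fold a load into its second operand, so when the load
+ // feeds the first operand and the second operand is not a load, swapping
+ // the two inputs (and fixing up both shuffle immediates) enables the fold.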
+ if (VT == MVT::v4f32 &&
+ (X86ISD::VPERMILPI == Opcode ||
+ (X86ISD::SHUFP == Opcode && N.getOperand(0) == N.getOperand(1)))) {
+ SDValue N0 = N.getOperand(0);
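+ // VPERMILPI carries its shuffle immediate in operand 1; SHUFP carries it
+ // in operand 2.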
+ unsigned Imm = N.getConstantOperandVal(X86ISD::VPERMILPI == Opcode ? 1 : 2);
+ if (N0.getOpcode() == X86ISD::SHUFP && N->isOnlyUserOf(N0.getNode())) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
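+ // Only commute when it actually helps: the LHS must be a foldable load and
+ // the RHS must not be, otherwise we would just move the problem around.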
+ if (MayFoldLoad(peekThroughOneUseBitcasts(N00)) &&
+ !MayFoldLoad(peekThroughOneUseBitcasts(N01))) {
+ unsigned Imm1 = N0.getConstantOperandVal(2);
+ Imm1 = ((Imm1 & 0x0F) << 4) | ((Imm1 & 0xF0) >> 4);
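+ // The low nibble of a SHUFPS immediate indexes the first source and the
+ // high nibble indexes the second, so commuting the sources means swapping
+ // the two nibbles as well.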
+ SDValue NewN0 = DAG.getNode(X86ISD::SHUFP, DL, VT, N01, N00,
+ DAG.getTargetConstant(Imm1, DL, MVT::i8));
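+ // NewN0 holds the same elements as N0 with its halves swapped, so flip
+ // bit 1 of each 2-bit index (i.e. XOR the immediate with 0xAA) to keep
+ // selecting the same elements in the outer shuffle.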
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewN0, NewN0,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ }
+ }
+ }
+
switch (Opcode) {
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
; SSE-32: # %bb.0: # %L.entry
; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; SSE-32-NEXT: movaps 304(%ecx), %xmm0
+; SSE-32-NEXT: xorps %xmm0, %xmm0
; SSE-32-NEXT: xorps %xmm1, %xmm1
-; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
-; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-32-NEXT: movups %xmm1, 624(%eax)
-; SSE-32-NEXT: movups %xmm0, 608(%eax)
+; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
+; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE-32-NEXT: movups %xmm0, 624(%eax)
+; SSE-32-NEXT: movups %xmm1, 608(%eax)
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: PR15298:
; SSE-64: # %bb.0: # %L.entry
-; SSE-64-NEXT: movaps 304(%rdi), %xmm0
+; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: xorps %xmm1, %xmm1
-; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
-; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-64-NEXT: movups %xmm1, 624(%rsi)
-; SSE-64-NEXT: movups %xmm0, 608(%rsi)
+; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
+; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE-64-NEXT: movups %xmm0, 624(%rsi)
+; SSE-64-NEXT: movups %xmm1, 608(%rsi)
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: PR15298: