return false;
}
+  /// Return true if all the users of N are contained in Nodes.
+  /// NOTE: Requires at least one match (so N must have at least one user),
+  /// but not every node in Nodes needs to be a user of N.
+  static bool areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N);
+
  /// Return the number of values used by this operation.
  unsigned getNumOperands() const { return NumOperands; }
  return Seen;
}

+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+  bool Seen = false;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    if (llvm::any_of(Nodes,
+                     [&User](const SDNode *Node) { return User == Node; }))
+      Seen = true;
+    else
+      return false;
+  }
+
+  return Seen;
+}
+
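For reference, here is a minimal standalone sketch of the semantics above,
using a hypothetical Node struct in place of SDNode; it is an illustration of
the contract, not LLVM code. It demonstrates the two corner cases the NOTE
calls out: a node with no users returns false, and entries in Nodes that are
not users of N are harmless.

#include <algorithm>
#include <cassert>
#include <vector>

struct Node {
  std::vector<const Node *> Users; // one entry per use of this node
};

static bool areOnlyUsersOf(const std::vector<const Node *> &Nodes,
                           const Node *N) {
  bool Seen = false;
  for (const Node *User : N->Users) {
    if (std::find(Nodes.begin(), Nodes.end(), User) != Nodes.end())
      Seen = true;
    else
      return false; // a user outside Nodes disqualifies N immediately
  }
  return Seen; // still false when N has no users at all
}

int main() {
  Node A, B, C;
  C.Users = {&A, &B};
  assert(areOnlyUsersOf({&A, &B}, &C));     // all users covered
  assert(!areOnlyUsersOf({&A}, &C));        // B is an uncovered user
  assert(!areOnlyUsersOf({&A, &B}, &A));    // A has no users -> false
  assert(areOnlyUsersOf({&A, &B, &C}, &C)); // non-user entries are fine
  return 0;
}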
/// isOperand - Return true if this node is an operand of N.
///
bool SDValue::isOperandOf(const SDNode *N) const {
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                          int SrcOpIndex, SDValue Root,
                                          ArrayRef<int> RootMask,
+                                         ArrayRef<const SDNode *> SrcNodes,
                                          int Depth, bool HasVariableMask,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
-  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  // Update the list of shuffle nodes that have been combined so far.
+  SmallVector<const SDNode *, 8> CombinedNodes(SrcNodes.begin(),
+                                               SrcNodes.end());
+  CombinedNodes.push_back(Op.getNode());
+
+  // See if we can recurse into each shuffle source op (if it's a target
+  // shuffle). The source op should only be combined if it either has a
+  // single use (i.e. current Op) or all its users have already been combined.
  for (int i = 0, e = Ops.size(); i < e; ++i)
-    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
-      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
-                                        HasVariableMask, DAG, DCI, Subtarget))
+    if (Ops[i].getNode()->hasOneUse() ||
+        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+                                        Depth + 1, HasVariableMask, DAG, DCI,
+                                        Subtarget))
        return true;

  // Attempt to constant fold all of the constant source ops.
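The effect of threading the combined-node list through the recursion can be
seen in the following simplified sketch (standalone C++ with the same
hypothetical Node struct as above; Inputs, Users and combineRecursively are
illustrative stand-ins, not the actual SelectionDAG types). Each level
records itself before recursing, so a source op that fails the hasOneUse()
test can still be looked through once every one of its users is already on
the combine chain.

#include <algorithm>
#include <cassert>
#include <vector>

struct Node {
  std::vector<Node *> Inputs;      // source operands
  std::vector<const Node *> Users; // one entry per use, as above
};

// Same contract as the SDNode::areOnlyUsersOf sketch earlier.
static bool areOnlyUsersOf(const std::vector<const Node *> &Nodes,
                           const Node *N) {
  return !N->Users.empty() &&
         std::all_of(N->Users.begin(), N->Users.end(), [&](const Node *U) {
           return std::find(Nodes.begin(), Nodes.end(), U) != Nodes.end();
         });
}

static bool combineRecursively(Node *Op, std::vector<const Node *> Combined,
                               int Depth) {
  if (Depth > 8) // recursion limit, standing in for the Depth parameter
    return false;

  // Record this node before recursing, as the patch does with CombinedNodes.
  Combined.push_back(Op);

  for (Node *In : Op->Inputs)
    if (In->Users.size() == 1 || areOnlyUsersOf(Combined, In))
      if (combineRecursively(In, Combined, Depth + 1))
        return true;

  return false; // no deeper combine found
}

int main() {
  // Root reads both B and A, and B reads A, so A has two users.
  Node A, B, Root;
  Root.Inputs = {&B, &A};
  B.Inputs = {&A};
  A.Users = {&B, &Root};
  B.Users = {&Root};

  // Recursing Root -> B: previously A was blocked because B is not its only
  // user; with the combined list {Root, B}, all of A's users are accounted
  // for, so the recursion may continue through A.
  assert(areOnlyUsersOf({&Root, &B}, &A));
  combineRecursively(&Root, {}, /*Depth=*/1);
  return 0;
}

The call sites below simply seed the chain with an empty list ({}), so
tracking always starts at the root shuffle.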
  // a particular chain.
  SmallVector<int, 1> NonceMask; // Just a placeholder.
  NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
    return SDValue(); // This routine will use CombineTo to replace N.
  SDValue Op(N, 0);
  SmallVector<int, 1> NonceMask; // Just a placeholder.
  NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
    return SDValue(); // This routine will use CombineTo to replace N.
  SDValue Op(N, 0);
  SmallVector<int, 1> NonceMask; // Just a placeholder.
  NonceMask.push_back(0);
-  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                DCI, Subtarget);
  return SDValue();
  SDValue Op(N, 0);
  SmallVector<int, 1> NonceMask; // Just a placeholder.
  NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
    return SDValue(); // This routine will use CombineTo to replace N.
  SDValue Op(N, 0);
  SmallVector<int, 1> NonceMask; // Just a placeholder.
  NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
    return SDValue(); // This routine will use CombineTo to replace N.
define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
; SSE-LABEL: _clearupper2xi64b:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: _clearupper2xi64b:
define i16 @select_xor_1(i16 %A, i8 %cond) {
; CHECK-LABEL: select_xor_1:
-; MCU: andl $1, %edx\r
-; MCU-NEXT: negl %edx\r
-; MCU-NEXT: andl $43, %edx\r
+; MCU: andl $1, %edx
+; MCU-NEXT: negl %edx
+; MCU-NEXT: andl $43, %edx
; MCU-NEXT: xorl %edx, %eax
entry:
%and = and i8 %cond, 1
define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_unpack_unpack_pshufb:
; X32: # BB#0:
-; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; X32-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,2,3,5,5,6,7]
-; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; X32-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X32-NEXT: retl
;
; X64-LABEL: combine_unpack_unpack_pshufb:
; X64: # BB#0:
-; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; X64-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,2,3,5,5,6,7]
-; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; X64-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X64-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
%2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
; SSE-LABEL: shuffle_combine_unpack_insert:
; SSE: # BB#0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: pextrw $4, %xmm0, %ecx
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pinsrw $4, %eax, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pinsrw $2, %ecx, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_unpack_insert:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm1
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm2
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; AVX-NEXT: retq
%1 = extractelement <8 x i16> %a0, i32 2
%2 = extractelement <8 x i16> %a0, i32 4