return Op;
}
+// Return true if N is recognized as an all-zeros vector: either a
+// SPLAT_VECTOR whose splatted operand is the constant 0, or any node accepted
+// by ISD::isBuildVectorAllZeros.
+static bool isZeroVector(SDValue N) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR) {
+    SDValue SplatOp = N->getOperand(0);
+    // Only a constant splat operand can be decided here; anything else falls
+    // through to the generic check below.
+    if (const auto *SplatC = dyn_cast<ConstantSDNode>(SplatOp))
+      return SplatC->getZExtValue() == 0;
+  }
+  return ISD::isBuildVectorAllZeros(N.getNode());
+}
+
// Bytes is a VPERM-like permute vector, except that -1 is used for
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
// VSLDB or VPERM.
Ops[OpNo1],
DAG.getTargetConstant(StartIndex, DL, MVT::i32));
- // Fall back on VPERM. Construct an SDNode for the permute vector.
+ // Fall back on VPERM. Construct an SDNode for the permute vector. Try to
+ // eliminate a zero vector by reusing any zero index in the permute vector.
+ unsigned ZeroVecIdx =
+ isZeroVector(Ops[0]) ? 0 : (isZeroVector(Ops[1]) ? 1 : UINT_MAX);
+ if (ZeroVecIdx != UINT_MAX) {
+ bool MaskFirst = true;
+ int ZeroIdx = -1;
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+ unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+ unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+ if (OpNo == ZeroVecIdx && I == 0) {
+ // If the first result byte comes from the zero vector, put the mask first:
+ // mask byte 0 then holds index 0, which selects itself.
+ ZeroIdx = 0;
+ break;
+ }
+ if (OpNo != ZeroVecIdx && Byte == 0) {
+ // If a byte selects element 0 of the other source, put that source first:
+ // the mask byte is then 0 and can itself be selected as the zero.
+ ZeroIdx = I + SystemZ::VectorBytes;
+ MaskFirst = false;
+ break;
+ }
+ }
+ if (ZeroIdx != -1) {
+ SDValue IndexNodes[SystemZ::VectorBytes];
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+ if (Bytes[I] >= 0) {
+ unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+ unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+ if (OpNo == ZeroVecIdx)
+ IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32);
+ else {
+ unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte;
+ IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32);
+ }
+ } else
+ IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+ }
+ SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+ SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0];
+ if (MaskFirst)
+ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src,
+ Mask);
+ else
+ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask,
+ Mask);
+ }
+ }
+
SDValue IndexNodes[SystemZ::VectorBytes];
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
if (Bytes[I] >= 0)
; Test that only one vperm of the vector compare is needed for both extracts.
define void @fun() {
-; CHECK-LABEL: fun
+; CHECK-LABEL: fun:
; CHECK: vperm
; CHECK-NOT: vperm
bb:
bb4:
unreachable
}
+
+; Test that no VGBM is needed for the zero vector: the permute mask already
+; holds a zero byte (the index of element 0 of the other source) to select.
+define <4 x i8> @fun1(<2 x i8> %arg) {
+; CHECK-LABEL:.LCPI1_0:
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 18 # 0x12
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 18 # 0x12
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .text
+; CHECK-NEXT: .globl fun1
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .type fun1,@function
+; CHECK-NEXT: fun1: # @fun1
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %res = shufflevector <2 x i8> %arg, <2 x i8> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ ret <4 x i8> %res
+}
+
+; Same, but result byte 0 comes from the zero vector, so the mask goes first.
+define <4 x i8> @fun2(<2 x i8> %arg) {
+; CHECK-LABEL:.LCPI2_0:
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 17 # 0x11
+; CHECK-NEXT: .byte 17 # 0x11
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .text
+; CHECK-NEXT: .globl fun2
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .type fun2,@function
+; CHECK-NEXT:fun2: # @fun2
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT:# %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI2_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
+; CHECK-NEXT: br %r14
+ %res = shufflevector <2 x i8> %arg, <2 x i8> zeroinitializer,
+ <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+ ret <4 x i8> %res
+}