const SDLoc &DL, EVT VT, uint64_t Value,
unsigned BitsPerElement) {
// Signed 16-bit values can be replicated using VREPI.
+ // Mark the constants as opaque or DAGCombiner will convert back to
+ // BUILD_VECTOR.
int64_t SignedValue = SignExtend64(Value, BitsPerElement);
if (isInt<16>(SignedValue)) {
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
- DAG.getConstant(SignedValue, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::REPLICATE, DL, VecVT,
+ DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// See whether rotating the constant left some N places gives a value that
End -= 64 - BitsPerElement;
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
- DAG.getConstant(Start, DL, MVT::i32),
- DAG.getConstant(End, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::ROTATE_MASK, DL, VecVT,
+ DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
+ DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
// priority over other methods below.
uint64_t Mask = 0;
if (tryBuildVectorByteMask(BVN, Mask)) {
- SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(Mask, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+ DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
}
+// Return the demanded elements for the OpNo source operand of Op. DemandedElts
+// are for Op.
+static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
+ unsigned OpNo) {
+ EVT VT = Op.getValueType();
+ unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
+ APInt SrcDemE;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ // VECTOR PACK truncates the elements of two source vectors into one.
+ SrcDemE = DemandedElts;
+ if (OpNo == 2)
+ SrcDemE.lshrInPlace(NumElts / 2);
+ SrcDemE = SrcDemE.trunc(NumElts / 2);
+ break;
+ // VECTOR UNPACK extends half the elements of the source vector.
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, 0);
+ break;
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, NumElts);
+ break;
+ case Intrinsic::s390_vpdi: {
+ // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
+ SrcDemE = APInt(NumElts, 0);
+ if (!DemandedElts[OpNo - 1])
+ break;
+ unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
+ // Demand input element 0 or 1, given by the mask bit value.
+ SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
+ break;
+ }
+ case Intrinsic::s390_vsldb: {
+ // VECTOR SHIFT LEFT DOUBLE BY BYTE
+ assert(VT == MVT::v16i8 && "Unexpected type.");
+ unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
+ unsigned NumSrc0Els = 16 - FirstIdx;
+ SrcDemE = APInt(NumElts, 0);
+ if (OpNo == 1) {
+ APInt DemEls = DemandedElts.trunc(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, FirstIdx);
+ } else {
+ APInt DemEls = DemandedElts.lshr(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, 0);
+ }
+ break;
+ }
+ case Intrinsic::s390_vperm:
+ // VECTOR PERMUTE may select any element of either source vector per
+ // result byte, so all source elements must be demanded. (Demanding
+ // only element 0, as APInt(NumElts, 1) would, could claim bits known
+ // in element 0 alone that other selectable elements refute.)
+ SrcDemE = APInt::getAllOnesValue(NumElts);
+ break;
+ default:
+ llvm_unreachable("Unhandled intrinsic.");
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ // Scalar operand.
+ SrcDemE = APInt(1, 1);
+ break;
+ case SystemZISD::SELECT_CCMASK:
+ // Both select operands have the same element layout as the result.
+ SrcDemE = DemandedElts;
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode.");
+ break;
+ }
+ }
+ return SrcDemE;
+}
+
+// Compute the known bits of Op from its two source operands (operands OpNo
+// and OpNo + 1), each queried with the demanded elements mapped onto that
+// source. A result bit is known only if it is known to the same value in
+// both sources.
+static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ KnownBits LHSKnown(SrcBitWidth), RHSKnown(SrcBitWidth);
+ DAG.computeKnownBits(Op.getOperand(OpNo), LHSKnown, Src0DemE, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(OpNo + 1), RHSKnown, Src1DemE, Depth + 1);
+ // Intersect: keep only bits known identically in both sources.
+ Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
+ Known.One = LHSKnown.One & RHSKnown.One;
+}
+
void
SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned BitWidth = Known.getBitWidth();
-
Known.resetAll();
- switch (Op.getOpcode()) {
- case SystemZISD::SELECT_CCMASK: {
- KnownBits TrueKnown(BitWidth), FalseKnown(BitWidth);
- DAG.computeKnownBits(Op.getOperand(0), TrueKnown, Depth + 1);
- DAG.computeKnownBits(Op.getOperand(1), FalseKnown, Depth + 1);
- Known.Zero = TrueKnown.Zero & FalseKnown.Zero;
- Known.One = TrueKnown.One & FalseKnown.One;
- break;
+
+ // Intrinsic CC result is returned in the two low bits.
+ unsigned tmp0, tmp1; // not used
+ if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
+ Known.Zero.setBitsFrom(2);
+ return;
+ }
+ EVT VT = Op.getValueType();
+ // Only the first, typed result is analyzed below.
+ if (Op.getResNo() != 0 || VT == MVT::Untyped)
+ return;
+ assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
+ "KnownBits does not match VT in bitwidth");
+ assert ((!VT.isVector() ||
+ (DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
+ "DemandedElts does not match VT number of elements");
+ unsigned BitWidth = Known.getBitWidth();
+ unsigned Opcode = Op.getOpcode();
+ // For the vector intrinsics, map the demanded result elements onto the
+ // source operands.
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ bool IsLogical = false;
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
+ break;
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ IsLogical = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue SrcOp = Op.getOperand(1);
+ unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits();
+ Known = KnownBits(SrcBitWidth);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
+ DAG.computeKnownBits(SrcOp, Known, SrcDemE, Depth + 1);
+ // Logical unpack zero-extends the source elements; arithmetic unpack
+ // sign-extends them.
+ if (IsLogical) {
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ } else
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ case SystemZISD::SELECT_CCMASK:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
+ break;
+ case SystemZISD::REPLICATE: {
+ SDValue SrcOp = Op.getOperand(0);
+ DAG.computeKnownBits(SrcOp, Known, Depth + 1);
+ if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
+ Known = Known.sext(BitWidth); // VREPI sign extends the immediate.
+ break;
+ }
+ default:
+ break;
+ }
}
- default:
- break;
+ // Known has the width of the source operand(s). Adjust if needed to match
+ // the passed bitwidth.
+ if (Known.getBitWidth() != BitWidth)
+ Known = Known.zextOrTrunc(BitWidth);
+}
+
+// Compute the number of known sign bits of Op as the minimum over its two
+// source operands (operands OpNo and OpNo + 1), each queried with the
+// demanded elements mapped onto that source. If the sources are wider than
+// the result (vector PACK), the truncated bits are subtracted.
+static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ unsigned LHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
+ if (LHS == 1) return 1; // Early out.
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned RHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
+ if (RHS == 1) return 1; // Early out.
+ unsigned Common = std::min(LHS, RHS);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ if (SrcBitWidth > VTBits) { // PACK
+ unsigned SrcExtraBits = SrcBitWidth - VTBits;
+ if (Common > SrcExtraBits)
+ return (Common - SrcExtraBits);
+ // All of the common sign bits may have been truncated away.
+ return 1;
+ }
+ assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
+ return Common;
+}
+
+// Determine the number of known sign bits for a SystemZ-specific node.
+unsigned
+SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ // Only the first result is analyzed; be conservative for any other.
+ if (Op.getResNo() != 0)
+ return 1;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue PackedOp = Op.getOperand(1);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
+ unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ // Sign extension of the source elements adds the extension bits.
+ Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
+ return Tmp;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::SELECT_CCMASK:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
+ default:
+ break;
+ }
}
+
+ return 1;
}
//===----------------------------------------------------------------------===//
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::ANY_EXTEND;
}
--- /dev/null
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode() with
+; the vector pack, permute-dword, shift-double and permute intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>)
+
+; PACKS_CC (operand elements are 0): i64 -> i32
+define <4 x i32> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS_CC (operand elements are 1): i64 -> i32
+; NOTE: The vector AND is optimized away, but vrepig+vpksgs is used instead
+; of vrepif. Similarly for more test cases below.
+define <4 x i32> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpksgs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS_CC (operand elements are 0): i32 -> i16
+define <8 x i16> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS_CC (operand elements are 1): i32 -> i16
+define <8 x i16> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpksfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS_CC (operand elements are 0): i16 -> i8
+define <16 x i8> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKS_CC (operand elements are 1): i16 -> i8
+define <16 x i8> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpkshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>)
+
+; PACKLS_CC (operand elements are 0): i64 -> i32
+define <4 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS_CC (operand elements are 1): i64 -> i32
+define <4 x i32> @f7() {
+; CHECK-LABEL: f7:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpklsgs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS_CC (operand elements are 0): i32 -> i16
+define <8 x i16> @f8() {
+; CHECK-LABEL: f8:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS_CC (operand elements are 1): i32 -> i16
+define <8 x i16> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpklsfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS_CC (operand elements are 0): i16 -> i8
+define <16 x i8> @f10() {
+; CHECK-LABEL: f10:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKLS_CC (operand elements are 1): i16 -> i8
+define <16 x i8> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpklshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>)
+
+; PACKS (operand elements are 0): i64 -> i32
+define <4 x i32> @f12() {
+; CHECK-LABEL: f12:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS (operand elements are 1): i64 -> i32
+define <4 x i32> @f13() {
+; CHECK-LABEL: f13:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpksg %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS (operand elements are 0): i32 -> i16
+define <8 x i16> @f14() {
+; CHECK-LABEL: f14:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS (operand elements are 1): i32 -> i16
+define <8 x i16> @f15() {
+; CHECK-LABEL: f15:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpksf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS (operand elements are 0): i16 -> i8
+define <16 x i8> @f16() {
+; CHECK-LABEL: f16:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpksh(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKS (operand elements are 1): i16 -> i8
+define <16 x i8> @f17() {
+; CHECK-LABEL: f17:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpksh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpksh(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>)
+
+; PACKLS (operand elements are 0): i64 -> i32
+define <4 x i32> @f18() {
+; CHECK-LABEL: f18:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS (operand elements are 1): i64 -> i32
+define <4 x i32> @f19() {
+; CHECK-LABEL: f19:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpklsg %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS (operand elements are 0): i32 -> i16
+define <8 x i16> @f20() {
+; CHECK-LABEL: f20:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS (operand elements are 1): i32 -> i16
+define <8 x i16> @f21() {
+; CHECK-LABEL: f21:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpklsf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS (operand elements are 0): i16 -> i8
+define <16 x i8> @f22() {
+; CHECK-LABEL: f22:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpklsh(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKLS (operand elements are 1): i16 -> i8
+define <16 x i8> @f23() {
+; CHECK-LABEL: f23:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpklsh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpklsh(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32)
+
+; VPDI (operand elements are 0):
+define <2 x i64> @f24() {
+; CHECK-LABEL: f24:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 0, i64 0>,
+ <2 x i64> <i64 0, i64 0>, i32 0)
+ %res = and <2 x i64> %perm, <i64 1, i64 1>
+ ret <2 x i64> %res
+}
+
+; VPDI (operand elements are 1):
+define <2 x i64> @f25() {
+; CHECK-LABEL: f25:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpdi %v24, %v0, %v0, 0
+; CHECK-NEXT: br %r14
+ %perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 1, i64 1>,
+ <2 x i64> <i64 1, i64 1>, i32 0)
+ %res = and <2 x i64> %perm, <i64 1, i64 1>
+ ret <2 x i64> %res
+}
+
+declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32)
+
+; VSLDB (operand elements are 0):
+define <16 x i8> @f26() {
+; CHECK-LABEL: f26:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
+ i32 1)
+
+ %res = and <16 x i8> %shfd, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
+
+; VSLDB (operand elements are 1):
+define <16 x i8> @f27() {
+; CHECK-LABEL: f27:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepib %v0, 1
+; CHECK-NEXT: vsldb %v24, %v0, %v0, 1
+; CHECK-NEXT: br %r14
+ %shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
+ i32 1)
+
+ %res = and <16 x i8> %shfd, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
+
+; Test that intrinsic CC result is recognized.
+define i32 @f28(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: f28:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: lhi %r2, 0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b)
+ %cc = extractvalue {<8 x i16>, i32} %call, 1
+ %res = and i32 %cc, -4
+ ret i32 %res
+}
+
+declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>)
+
+; Test VPERM (operand elements are 0):
+define <16 x i8> @f29() {
+; CHECK-LABEL: f29:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %perm = call <16 x i8> @llvm.s390.vperm(
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res = and <16 x i8> %perm, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
+
+; Test VPERM (operand elements are 1):
+define <16 x i8> @f30() {
+; CHECK-LABEL: f30:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v0, 0
+; CHECK-NEXT: vrepib %v1, 1
+; CHECK-NEXT: vperm %v24, %v1, %v1, %v0
+; CHECK-NEXT: br %r14
+ %perm = call <16 x i8> @llvm.s390.vperm(
+ <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res = and <16 x i8> %perm, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
--- /dev/null
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode() with
+; the vector unpack intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+declare <8 x i16> @llvm.s390.vuphb(<16 x i8>)
+declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>)
+
+; VUPHB (used operand elements are 0)
+define <8 x i16> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPHB (used operand elements are 1)
+; NOTE: The AND is optimized away, but instead of replicating '1' into <8 x
+; i16>, the original vector constant is put in the constant pool and then
+; unpacked (repeated in more test cases below).
+define <8 x i16> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuphb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLHB (used operand elements are 0)
+define <8 x i16> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplhb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; KnownBits tests for the unpack intrinsics: when every operand element that
+; the unpack reads is zero, computeKnownBitsForTargetNode() lets DAGCombiner
+; fold the unpack + 'and' down to a zero vector (vgbm 0). When the read
+; elements are nonzero, the unpack instruction remains and its constant
+; operand is loaded from the literal pool (larl + vl).
+
+; VUPLHB (used operand elements are 1)
+define <8 x i16> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplhb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+declare <4 x i32> @llvm.s390.vuphh(<8 x i16>)
+declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>)
+
+; VUPHH (used operand elements are 0)
+define <4 x i32> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPHH (used operand elements are 1)
+define <4 x i32> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuphh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLHH (used operand elements are 0)
+define <4 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhh(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLHH (used operand elements are 1)
+define <4 x i32> @f7() {
+; CHECK-LABEL: f7:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhh(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPHF/VUPLHF (word -> doubleword, high half) and VUPLB/VUPLLB (byte ->
+; halfword, low half) variants of the same KnownBits tests: an all-zero
+; result folds to vgbm 0, otherwise the unpack survives with its operand
+; loaded from the literal pool.
+
+declare <2 x i64> @llvm.s390.vuphf(<4 x i32>)
+declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>)
+
+; VUPHF (used operand elements are 0)
+define <2 x i64> @f8() {
+; CHECK-LABEL: f8:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPHF (used operand elements are 1)
+define <2 x i64> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuphf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLHF (used operand elements are 0)
+define <2 x i64> @f10() {
+; CHECK-LABEL: f10:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLHF (used operand elements are 1)
+define <2 x i64> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+declare <8 x i16> @llvm.s390.vuplb(<16 x i8>)
+declare <8 x i16> @llvm.s390.vupllb(<16 x i8>)
+
+; VUPLB (used operand elements are 0)
+define <8 x i16> @f12() {
+; CHECK-LABEL: f12:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLB (used operand elements are 1)
+define <8 x i16> @f13() {
+; CHECK-LABEL: f13:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLLB (used operand elements are 0)
+define <8 x i16> @f14() {
+; CHECK-LABEL: f14:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vupllb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLLB (used operand elements are 1)
+define <8 x i16> @f15() {
+; CHECK-LABEL: f15:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vupllb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vupllb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLHW/VUPLLH (halfword -> word) and VUPLF/VUPLLF (word -> doubleword)
+; low-half variants: as in the tests above, a known-zero result folds to
+; vgbm 0, otherwise the unpack is emitted with a literal-pool operand.
+
+declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>)
+declare <4 x i32> @llvm.s390.vupllh(<8 x i16>)
+
+; VUPLHW (used operand elements are 0)
+define <4 x i32> @f16() {
+; CHECK-LABEL: f16:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLHW (used operand elements are 1)
+define <4 x i32> @f17() {
+; CHECK-LABEL: f17:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhw %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLLH (used operand elements are 0)
+define <4 x i32> @f18() {
+; CHECK-LABEL: f18:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vupllh(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLLH (used operand elements are 1)
+define <4 x i32> @f19() {
+; CHECK-LABEL: f19:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vupllh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vupllh(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+declare <2 x i64> @llvm.s390.vuplf(<4 x i32>)
+declare <2 x i64> @llvm.s390.vupllf(<4 x i32>)
+
+; VUPLF (used operand elements are 0)
+define <2 x i64> @f20() {
+; CHECK-LABEL: f20:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLF (used operand elements are 1)
+define <2 x i64> @f21() {
+; CHECK-LABEL: f21:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLLF (used operand elements are 0)
+define <2 x i64> @f22() {
+; CHECK-LABEL: f22:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vupllf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLLF (used operand elements are 1)
+define <2 x i64> @f23() {
+; CHECK-LABEL: f23:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vupllf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vupllf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; High-part tests: the 'and' mask keeps only the high 32 bits of each
+; doubleword. Sign unpack of known-positive elements gives known-zero high
+; bits (result folds to vgbm 0); of known-negative elements, known-one high
+; bits (vgbm 61680 = 0xf0f0 sets exactly those byte positions); logical
+; unpack always gives known-zero high bits.
+
+; Test that signed unpacking of positive elements gives known zeros in high part.
+define <2 x i64> @f24() {
+; CHECK-LABEL: f24:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
+ i64 -4294967296>
+ ret <2 x i64> %and
+}
+
+; Test that signed unpacking of negative elements gives known ones in high part.
+define <2 x i64> @f25() {
+; CHECK-LABEL: f25:
+; CHECK-LABEL: # %bb.0:
+; 61680 = 0xf0f0
+; CHECK-NEXT: vgbm %v24, 61680
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 -1, i32 -1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
+ i64 -4294967296>
+ ret <2 x i64> %and
+}
+
+; Test that logical unpacking of negative elements gives known zeros in high part.
+define <2 x i64> @f26() {
+; CHECK-LABEL: f26:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 -1, i32 -1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
+ i64 -4294967296>
+ ret <2 x i64> %and
+}
--- /dev/null
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode().
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+; SystemZISD::REPLICATE
+define i32 @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vlgvf
+; The CHECK-NOTs verify that no scalar compare / conditional-load sequence
+; is emitted to recompute what is already known about the extracted element.
+; CHECK-NOT: lhi %r2, 0
+; CHECK-NOT: chi %r0, 0
+; CHECK-NOT: lochilh %r2, 1
+; CHECK: br %r14
+ %cmp0 = icmp ne <4 x i32> undef, zeroinitializer
+ %zxt0 = zext <4 x i1> %cmp0 to <4 x i32>
+ %ext0 = extractelement <4 x i32> %zxt0, i32 3
+ br label %exit
+
+exit:
+; The vector icmp+zext involves a REPLICATE of 1's. If KnownBits reflects
+; this, DAGCombiner can see that the i32 icmp and zext here are not needed.
+ %cmp1 = icmp ne i32 %ext0, 0
+ %zxt1 = zext i1 %cmp1 to i32
+ ret i32 %zxt1
+}
+
+; SystemZISD::JOIN_DWORDS (and REPLICATE)
+define void @f1() {
+; The DAG XOR has JOIN_DWORDS and REPLICATE operands. With KnownBits properly set
+; for both these nodes, ICMP is used instead of TM during lowering because
+; adjustForRedundantAnd() succeeds.
+; NOTE(review): the load from a null pointer looks like a reduced test case;
+; only the compare/branch selection is of interest here.
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NOT: tmll
+; CHECK-NOT: jne
+; CHECK: cijlh
+ %1 = load i16, i16* null, align 2
+ %2 = icmp eq i16 %1, 0
+ %3 = insertelement <2 x i1> undef, i1 %2, i32 0
+ %4 = insertelement <2 x i1> %3, i1 true, i32 1
+ %5 = xor <2 x i1> %4, <i1 true, i1 true>
+ %6 = extractelement <2 x i1> %5, i32 0
+ %7 = or i1 %6, undef
+ br i1 %7, label %9, label %8
+
+; <label>:8: ; preds = %0
+ unreachable
+
+; <label>:9: ; preds = %0
+ unreachable
+}
--- /dev/null
+; Test that DAGCombiner gets helped by ComputeNumSignBitsForTargetNode() with
+; vector intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>)
+
+; In each test the pack result must be reported with enough sign bits that
+; the trunc + sext pair folds away, leaving only the pack instruction
+; (verified by CHECK-NEXT: br %r14 immediately after it).
+
+; PACKS_CC: i64 -> i32
+define <4 x i32> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksgs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 0, i64 1>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %trunc = trunc <4 x i32> %extr to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKS_CC: i32 -> i16
+define <8 x i16> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %trunc = trunc <8 x i16> %extr to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKS_CC: i16 -> i8
+define <16 x i8> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpkshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %trunc = trunc <16 x i8> %extr to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>)
+
+; Logical saturating packs with CC result (PACKLS_CC): the trunc + sext
+; pairs should fold away, leaving only the pack instruction.
+
+; PACKLS_CC: i64 -> i32
+define <4 x i32> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsgs %v24, %v1, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %trunc = trunc <4 x i32> %extr to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKLS_CC: i32 -> i16
+define <8 x i16> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %trunc = trunc <8 x i16> %extr to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKLS_CC: i16 -> i8
+define <16 x i8> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %trunc = trunc <16 x i8> %extr to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>)
+
+; Saturating packs without CC result (PACKS): the trunc + sext pairs should
+; fold away, leaving only the pack instruction.
+
+; PACKS: i64 -> i32
+define <4 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksg %v24, %v1, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
+ %trunc = trunc <4 x i32> %call to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKS: i32 -> i16
+define <8 x i16> @f7() {
+; CHECK-LABEL: f7:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %trunc = trunc <8 x i16> %call to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKS: i16 -> i8
+define <16 x i8> @f8() {
+; CHECK-LABEL: f8:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpksh(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %trunc = trunc <16 x i8> %call to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>)
+
+; Logical packs without CC result (PACKLS): the trunc + sext pairs should
+; fold away, leaving only the pack instruction.
+
+; PACKLS: i64 -> i32
+define <4 x i32> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsg %v24, %v1, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
+ %trunc = trunc <4 x i32> %call to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKLS: i32 -> i16
+define <8 x i16> @f10() {
+; CHECK-LABEL: f10:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %trunc = trunc <8 x i16> %call to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKLS: i16 -> i8
+define <16 x i8> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpklsh(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %trunc = trunc <16 x i8> %call to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+; Element-rearranging operations (VPDI / VSLDB / VPERM): sign-bit counts of
+; the source elements must be tracked through the shuffle, so the trunc +
+; sext pairs fold away here as well.
+
+declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32)
+
+; VPDI:
+define <2 x i64> @f12() {
+; CHECK-LABEL: f12:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpdi %v24, %v1, %v0, 0
+; CHECK-NEXT: br %r14
+ %perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 0, i64 1>,
+ <2 x i64> <i64 1, i64 0>, i32 0)
+ %trunc = trunc <2 x i64> %perm to <2 x i32>
+ %ret = sext <2 x i32> %trunc to <2 x i64>
+ ret <2 x i64> %ret
+}
+
+declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32)
+
+; VSLDB:
+define <16 x i8> @f13() {
+; CHECK-LABEL: f13:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vsldb %v24, %v0, %v0, 1
+; CHECK-NEXT: br %r14
+ %shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
+ <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>, <16 x i8>
+ <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
+ i32 1)
+ %trunc = trunc <16 x i8> %shfd to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>)
+
+; Test VPERM:
+define <16 x i8> @f14() {
+; CHECK-LABEL: f14:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vperm %v24, %v0, %v0, %v0
+; CHECK-NEXT: br %r14
+ %perm = call <16 x i8> @llvm.s390.vperm(
+ <16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>)
+ %trunc = trunc <16 x i8> %perm to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
--- /dev/null
+; Test that DAGCombiner gets helped by ComputeNumSignBitsForTargetNode() with
+; vector intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+; The unpacked source elements are 0 or 1, so each trunc + sext pair should
+; fold away, leaving only the unpack instruction (checked by CHECK-NEXT:
+; br %r14 right after it).
+
+declare <8 x i16> @llvm.s390.vuphb(<16 x i8>)
+
+; VUPHB
+define <8 x i16> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuphb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
+ <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1,
+ i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %trunc = trunc <8 x i16> %unp to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+declare <4 x i32> @llvm.s390.vuphh(<8 x i16>)
+
+; VUPHH
+define <4 x i32> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuphh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
+ <i16 0, i16 1, i16 0, i16 1,
+ i16 0, i16 1, i16 0, i16 1>)
+ %trunc = trunc <4 x i32> %unp to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+declare <2 x i64> @llvm.s390.vuphf(<4 x i32>)
+
+; VUPHF
+define <2 x i64> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuphf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 0, i32 1, i32 0, i32 1>)
+ %trunc = trunc <2 x i64> %unp to <2 x i32>
+ %ret = sext <2 x i32> %trunc to <2 x i64>
+ ret <2 x i64> %ret
+}
+
+; Low-half unpack variants (VUPLB / VUPLHW / VUPLF): the trunc + sext pairs
+; should fold away here as well.
+
+declare <8 x i16> @llvm.s390.vuplb(<16 x i8>)
+
+; VUPLB
+define <8 x i16> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuplb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
+ <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1,
+ i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %trunc = trunc <8 x i16> %unp to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>)
+
+; VUPLHW
+define <4 x i32> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuplhw %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
+ <i16 1, i16 0, i16 1, i16 0,
+ i16 1, i16 0, i16 1, i16 0>)
+ %trunc = trunc <4 x i32> %unp to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+declare <2 x i64> @llvm.s390.vuplf(<4 x i32>)
+
+; VUPLF
+define <2 x i64> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuplf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 1, i32 0, i32 1, i32 0>)
+ %trunc = trunc <2 x i64> %unp to <2 x i32>
+ %ret = sext <2 x i32> %trunc to <2 x i64>
+ ret <2 x i64> %ret
+}
+
--- /dev/null
+; Test that ComputeNumSignBitsForTargetNode() (SELECT_CCMASK) will help
+; DAGCombiner so that it knows that %sel0 is already sign extended.
+;
+; -debug-only=isel is only available in builds with assertions enabled, so
+; this test must not run against a release (NDEBUG) llc.
+; REQUIRES: asserts
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -debug-only=isel < %s 2>&1 | FileCheck %s
+
+%0 = type <{ %1*, i16, [6 x i8] }>
+%1 = type { i32 (...)** }
+
+define signext i16 @fun(%0* %Arg0, i16 signext %Arg1) {
+entry:
+ br i1 undef, label %lab0, label %lab1
+
+lab0:
+ %icmp0 = icmp eq i32 undef, 0
+ %sel0 = select i1 %icmp0, i16 %Arg1, i16 1
+ br label %lab1
+
+lab1:
+; With the select (SELECT_CCMASK) known to be sign extended, no LHR (sign
+; extension of %phi0) should appear in the dumped pre-emit MIR for this block.
+; CHECK: *** MachineFunction at end of ISel ***
+; CHECK-LABEL: bb.2.lab1:
+; CHECK-NOT: LHR
+; CHECK: BRC
+ %phi0 = phi i16 [ 2, %entry ], [ %sel0, %lab0 ]
+ %sext0 = sext i16 %phi0 to i32
+ br i1 undef, label %lab2, label %lab3
+
+lab2:
+ %and0 = and i32 %sext0, 8
+ %icmp1 = icmp eq i32 %and0, 0
+ %sel1 = select i1 %icmp1, i16 %phi0, i16 4
+ ret i16 %sel1
+
+lab3:
+ ret i16 8
+}
+
; CHECK-NEXT: vlvgf [[REG2]], [[REG3]], 2
; CHECK-NEXT: vn [[REG2]], [[REG2]], [[REG0]]
; CHECK-NEXT: vlgvf [[REG4:%r[0-9]]], [[REG2]], 3
-; CHECK-NEXT: tmll [[REG4]], 1
-; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: cijlh [[REG4]], 0, .LBB0_1
; CHECK-NEXT: # %bb.2: # %CF36
; CHECK-NEXT: br %r14
BB: