const SDLoc &DL, EVT VT, uint64_t Value,
unsigned BitsPerElement) {
// Signed 16-bit values can be replicated using VREPI.
+ // Mark the constants as opaque or DAGCombiner will convert back to
+ // BUILD_VECTOR.
int64_t SignedValue = SignExtend64(Value, BitsPerElement);
if (isInt<16>(SignedValue)) {
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
- DAG.getConstant(SignedValue, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::REPLICATE, DL, VecVT,
+ DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// See whether rotating the constant left some N places gives a value that
End -= 64 - BitsPerElement;
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
- DAG.getConstant(Start, DL, MVT::i32),
- DAG.getConstant(End, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::ROTATE_MASK, DL, VecVT,
+ DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
+ DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
// priority over other methods below.
uint64_t Mask = 0;
if (tryBuildVectorByteMask(BVN, Mask)) {
- SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(Mask, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+ DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
}
+// Return the demanded elements for the OpNo source operand of Op. DemandedElts
+// are for Op.
+static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
+ unsigned OpNo) {
+ EVT VT = Op.getValueType();
+ unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
+ APInt SrcDemE;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ // VECTOR PACK truncates the elements of two source vectors into one.
+ SrcDemE = DemandedElts;
+ if (OpNo == 2)
+ SrcDemE.lshrInPlace(NumElts / 2);
+ SrcDemE = SrcDemE.trunc(NumElts / 2);
+ break;
+ // VECTOR UNPACK extends half the elements of the source vector.
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, 0);
+ break;
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, NumElts);
+ break;
+ case Intrinsic::s390_vpdi: {
+ // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
+ SrcDemE = APInt(NumElts, 0);
+ if (!DemandedElts[OpNo - 1])
+ break;
+ unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
+ // Demand input element 0 or 1, given by the mask bit value.
+ SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
+ break;
+ }
+ case Intrinsic::s390_vsldb: {
+ // VECTOR SHIFT LEFT DOUBLE BY BYTE
+ assert(VT == MVT::v16i8 && "Unexpected type.");
+ unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
+ unsigned NumSrc0Els = 16 - FirstIdx;
+ SrcDemE = APInt(NumElts, 0);
+ if (OpNo == 1) {
+ APInt DemEls = DemandedElts.trunc(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, FirstIdx);
+ } else {
+ APInt DemEls = DemandedElts.lshr(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, 0);
+ }
+ break;
+ }
+ case Intrinsic::s390_vperm:
+ // VECTOR PERMUTE may select any element of either source vector per
+ // result byte, so all source elements must be demanded. (Demanding
+ // only element 0, as APInt(NumElts, 1) would, could claim bits known
+ // in element 0 alone that other selectable elements refute.)
+ SrcDemE = APInt::getAllOnesValue(NumElts);
+ break;
+ default:
+ llvm_unreachable("Unhandled intrinsic.");
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ // Scalar operand.
+ SrcDemE = APInt(1, 1);
+ break;
+ case SystemZISD::SELECT_CCMASK:
+ // Both select operands have the same element layout as the result.
+ SrcDemE = DemandedElts;
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode.");
+ break;
+ }
+ }
+ return SrcDemE;
+}
+
+// Compute the known bits of Op from its two source operands (operands OpNo
+// and OpNo + 1), each queried with the demanded elements mapped onto that
+// source. A result bit is known only if it is known to the same value in
+// both sources.
+static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ KnownBits LHSKnown(SrcBitWidth), RHSKnown(SrcBitWidth);
+ DAG.computeKnownBits(Op.getOperand(OpNo), LHSKnown, Src0DemE, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(OpNo + 1), RHSKnown, Src1DemE, Depth + 1);
+ // Intersect: keep only bits known identically in both sources.
+ Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
+ Known.One = LHSKnown.One & RHSKnown.One;
+}
+
void
SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned BitWidth = Known.getBitWidth();
-
Known.resetAll();
- switch (Op.getOpcode()) {
- case SystemZISD::SELECT_CCMASK: {
- KnownBits TrueKnown(BitWidth), FalseKnown(BitWidth);
- DAG.computeKnownBits(Op.getOperand(0), TrueKnown, Depth + 1);
- DAG.computeKnownBits(Op.getOperand(1), FalseKnown, Depth + 1);
- Known.Zero = TrueKnown.Zero & FalseKnown.Zero;
- Known.One = TrueKnown.One & FalseKnown.One;
- break;
+
+ // Intrinsic CC result is returned in the two low bits.
+ unsigned tmp0, tmp1; // not used
+ if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
+ Known.Zero.setBitsFrom(2);
+ return;
+ }
+ EVT VT = Op.getValueType();
+ // Only the first, typed result is analyzed below.
+ if (Op.getResNo() != 0 || VT == MVT::Untyped)
+ return;
+ assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
+ "KnownBits does not match VT in bitwidth");
+ assert ((!VT.isVector() ||
+ (DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
+ "DemandedElts does not match VT number of elements");
+ unsigned BitWidth = Known.getBitWidth();
+ unsigned Opcode = Op.getOpcode();
+ // For the vector intrinsics, map the demanded result elements onto the
+ // source operands.
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ bool IsLogical = false;
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
+ break;
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ IsLogical = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue SrcOp = Op.getOperand(1);
+ unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits();
+ Known = KnownBits(SrcBitWidth);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
+ DAG.computeKnownBits(SrcOp, Known, SrcDemE, Depth + 1);
+ // Logical unpack zero-extends the source elements; arithmetic unpack
+ // sign-extends them.
+ if (IsLogical) {
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ } else
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ case SystemZISD::SELECT_CCMASK:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
+ break;
+ case SystemZISD::REPLICATE: {
+ SDValue SrcOp = Op.getOperand(0);
+ DAG.computeKnownBits(SrcOp, Known, Depth + 1);
+ if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
+ Known = Known.sext(BitWidth); // VREPI sign extends the immediate.
+ break;
+ }
+ default:
+ break;
+ }
}
- default:
- break;
+ // Known has the width of the source operand(s). Adjust if needed to match
+ // the passed bitwidth.
+ if (Known.getBitWidth() != BitWidth)
+ Known = Known.zextOrTrunc(BitWidth);
+}
+
+// Compute the number of known sign bits of Op as the minimum over its two
+// source operands (operands OpNo and OpNo + 1), each queried with the
+// demanded elements mapped onto that source. If the sources are wider than
+// the result (vector PACK), the truncated bits are subtracted.
+static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ unsigned LHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
+ if (LHS == 1) return 1; // Early out.
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned RHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
+ if (RHS == 1) return 1; // Early out.
+ unsigned Common = std::min(LHS, RHS);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ if (SrcBitWidth > VTBits) { // PACK
+ unsigned SrcExtraBits = SrcBitWidth - VTBits;
+ if (Common > SrcExtraBits)
+ return (Common - SrcExtraBits);
+ // All of the common sign bits may have been truncated away.
+ return 1;
+ }
+ assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
+ return Common;
+}
+
+// Determine the number of known sign bits for a SystemZ-specific node.
+unsigned
+SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ // Only the first result is analyzed; be conservative for any other.
+ if (Op.getResNo() != 0)
+ return 1;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue PackedOp = Op.getOperand(1);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
+ unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ // Sign extension of the source elements adds the extension bits.
+ Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
+ return Tmp;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::SELECT_CCMASK:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
+ default:
+ break;
+ }
}
+
+ return 1;
}
//===----------------------------------------------------------------------===//
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::ANY_EXTEND;
}
--- /dev/null
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode() with
+; the vector pack, permute-dword, shift-double and permute intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>)
+
+; PACKS_CC (operand elements are 0): i64 -> i32
+define <4 x i32> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS_CC (operand elements are 1): i64 -> i32
+; NOTE: The vector AND is optimized away, but vrepig+vpksgs is used instead
+; of vrepif. Similarly for more test cases below.
+define <4 x i32> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpksgs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS_CC (operand elements are 0): i32 -> i16
+define <8 x i16> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS_CC (operand elements are 1): i32 -> i16
+define <8 x i16> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpksfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS_CC (operand elements are 0): i16 -> i8
+define <16 x i8> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKS_CC (operand elements are 1): i16 -> i8
+define <16 x i8> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpkshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>)
+
+; PACKLS_CC (operand elements are 0): i64 -> i32
+define <4 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS_CC (operand elements are 1): i64 -> i32
+define <4 x i32> @f7() {
+; CHECK-LABEL: f7:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpklsgs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS_CC (operand elements are 0): i32 -> i16
+define <8 x i16> @f8() {
+; CHECK-LABEL: f8:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS_CC (operand elements are 1): i32 -> i16
+define <8 x i16> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpklsfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS_CC (operand elements are 0): i16 -> i8
+define <16 x i8> @f10() {
+; CHECK-LABEL: f10:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKLS_CC (operand elements are 1): i16 -> i8
+define <16 x i8> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpklshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>)
+
+; PACKS (operand elements are 0): i64 -> i32
+define <4 x i32> @f12() {
+; CHECK-LABEL: f12:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS (operand elements are 1): i64 -> i32
+define <4 x i32> @f13() {
+; CHECK-LABEL: f13:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpksg %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKS (operand elements are 0): i32 -> i16
+define <8 x i16> @f14() {
+; CHECK-LABEL: f14:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS (operand elements are 1): i32 -> i16
+define <8 x i16> @f15() {
+; CHECK-LABEL: f15:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpksf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKS (operand elements are 0): i16 -> i8
+define <16 x i8> @f16() {
+; CHECK-LABEL: f16:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpksh(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKS (operand elements are 1): i16 -> i8
+define <16 x i8> @f17() {
+; CHECK-LABEL: f17:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpksh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpksh(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>)
+
+; PACKLS (operand elements are 0): i64 -> i32
+define <4 x i32> @f18() {
+; CHECK-LABEL: f18:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS (operand elements are 1): i64 -> i32
+define <4 x i32> @f19() {
+; CHECK-LABEL: f19:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpklsg %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
+ %and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; PACKLS (operand elements are 0): i32 -> i16
+define <8 x i16> @f20() {
+; CHECK-LABEL: f20:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS (operand elements are 1): i32 -> i16
+define <8 x i16> @f21() {
+; CHECK-LABEL: f21:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepif %v0, 1
+; CHECK-NEXT: vpklsf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; PACKLS (operand elements are 0): i16 -> i8
+define <16 x i8> @f22() {
+; CHECK-LABEL: f22:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpklsh(
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
+ <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+; PACKLS (operand elements are 1): i16 -> i8
+define <16 x i8> @f23() {
+; CHECK-LABEL: f23:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepih %v0, 1
+; CHECK-NEXT: vpklsh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpklsh(
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
+ <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
+declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32)
+
+; VPDI (operand elements are 0):
+define <2 x i64> @f24() {
+; CHECK-LABEL: f24:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 0, i64 0>,
+ <2 x i64> <i64 0, i64 0>, i32 0)
+ %res = and <2 x i64> %perm, <i64 1, i64 1>
+ ret <2 x i64> %res
+}
+
+; VPDI (operand elements are 1):
+define <2 x i64> @f25() {
+; CHECK-LABEL: f25:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepig %v0, 1
+; CHECK-NEXT: vpdi %v24, %v0, %v0, 0
+; CHECK-NEXT: br %r14
+ %perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 1, i64 1>,
+ <2 x i64> <i64 1, i64 1>, i32 0)
+ %res = and <2 x i64> %perm, <i64 1, i64 1>
+ ret <2 x i64> %res
+}
+
+declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32)
+
+; VSLDB (operand elements are 0):
+define <16 x i8> @f26() {
+; CHECK-LABEL: f26:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
+ i32 1)
+
+ %res = and <16 x i8> %shfd, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
+
+; VSLDB (operand elements are 1):
+define <16 x i8> @f27() {
+; CHECK-LABEL: f27:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vrepib %v0, 1
+; CHECK-NEXT: vsldb %v24, %v0, %v0, 1
+; CHECK-NEXT: br %r14
+ %shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
+ i32 1)
+
+ %res = and <16 x i8> %shfd, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
+
+; Test that intrinsic CC result is recognized.
+define i32 @f28(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: f28:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: lhi %r2, 0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b)
+ %cc = extractvalue {<8 x i16>, i32} %call, 1
+ %res = and i32 %cc, -4
+ ret i32 %res
+}
+
+declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>)
+
+; Test VPERM (operand elements are 0):
+define <16 x i8> @f29() {
+; CHECK-LABEL: f29:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %perm = call <16 x i8> @llvm.s390.vperm(
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res = and <16 x i8> %perm, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
+
+; Test VPERM (operand elements are 1):
+define <16 x i8> @f30() {
+; CHECK-LABEL: f30:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v0, 0
+; CHECK-NEXT: vrepib %v1, 1
+; CHECK-NEXT: vperm %v24, %v1, %v1, %v0
+; CHECK-NEXT: br %r14
+ %perm = call <16 x i8> @llvm.s390.vperm(
+ <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res = and <16 x i8> %perm, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %res
+}
--- /dev/null
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode() with
+; the vector unpack intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+declare <8 x i16> @llvm.s390.vuphb(<16 x i8>)
+declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>)
+
+; VUPHB (used operand elements are 0)
+define <8 x i16> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPHB (used operand elements are 1)
+; NOTE: The AND is optimized away, but instead of replicating '1' into <8 x
+; i16>, the original vector constant is put in the constant pool and then
+; unpacked (repeated in more test cases below).
+define <8 x i16> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuphb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLHB (used operand elements are 0)
+define <8 x i16> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplhb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; KnownBits tests for the unpack intrinsics: when every operand element that
+; the unpack reads is zero, computeKnownBitsForTargetNode() lets DAGCombiner
+; fold the unpack + 'and' down to a zero vector (vgbm 0). When the read
+; elements are nonzero, the unpack instruction remains and its constant
+; operand is loaded from the literal pool (larl + vl).
+
+; VUPLHB (used operand elements are 1)
+define <8 x i16> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplhb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+declare <4 x i32> @llvm.s390.vuphh(<8 x i16>)
+declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>)
+
+; VUPHH (used operand elements are 0)
+define <4 x i32> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPHH (used operand elements are 1)
+define <4 x i32> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuphh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLHH (used operand elements are 0)
+define <4 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhh(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLHH (used operand elements are 1)
+define <4 x i32> @f7() {
+; CHECK-LABEL: f7:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhh(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPHF/VUPLHF (word -> doubleword, high half) and VUPLB/VUPLLB (byte ->
+; halfword, low half) variants of the same KnownBits tests: an all-zero
+; result folds to vgbm 0, otherwise the unpack survives with its operand
+; loaded from the literal pool.
+
+declare <2 x i64> @llvm.s390.vuphf(<4 x i32>)
+declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>)
+
+; VUPHF (used operand elements are 0)
+define <2 x i64> @f8() {
+; CHECK-LABEL: f8:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPHF (used operand elements are 1)
+define <2 x i64> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuphf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLHF (used operand elements are 0)
+define <2 x i64> @f10() {
+; CHECK-LABEL: f10:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLHF (used operand elements are 1)
+define <2 x i64> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+declare <8 x i16> @llvm.s390.vuplb(<16 x i8>)
+declare <8 x i16> @llvm.s390.vupllb(<16 x i8>)
+
+; VUPLB (used operand elements are 0)
+define <8 x i16> @f12() {
+; CHECK-LABEL: f12:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLB (used operand elements are 1)
+define <8 x i16> @f13() {
+; CHECK-LABEL: f13:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLLB (used operand elements are 0)
+define <8 x i16> @f14() {
+; CHECK-LABEL: f14:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vupllb(<16 x i8>
+ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLLB (used operand elements are 1)
+define <8 x i16> @f15() {
+; CHECK-LABEL: f15:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vupllb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vupllb(<16 x i8>
+ <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %and
+}
+
+; VUPLHW/VUPLLH (halfword -> word) and VUPLF/VUPLLF (word -> doubleword)
+; low-half variants: as in the tests above, a known-zero result folds to
+; vgbm 0, otherwise the unpack is emitted with a literal-pool operand.
+
+declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>)
+declare <4 x i32> @llvm.s390.vupllh(<8 x i16>)
+
+; VUPLHW (used operand elements are 0)
+define <4 x i32> @f16() {
+; CHECK-LABEL: f16:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLHW (used operand elements are 1)
+define <4 x i32> @f17() {
+; CHECK-LABEL: f17:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplhw %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLLH (used operand elements are 0)
+define <4 x i32> @f18() {
+; CHECK-LABEL: f18:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vupllh(<8 x i16>
+ <i16 1, i16 1, i16 1, i16 1,
+ i16 0, i16 0, i16 0, i16 0>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+; VUPLLH (used operand elements are 1)
+define <4 x i32> @f19() {
+; CHECK-LABEL: f19:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vupllh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vupllh(<8 x i16>
+ <i16 0, i16 0, i16 0, i16 0,
+ i16 1, i16 1, i16 1, i16 1>)
+ %and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
+declare <2 x i64> @llvm.s390.vuplf(<4 x i32>)
+declare <2 x i64> @llvm.s390.vupllf(<4 x i32>)
+
+; VUPLF (used operand elements are 0)
+define <2 x i64> @f20() {
+; CHECK-LABEL: f20:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLF (used operand elements are 1)
+define <2 x i64> @f21() {
+; CHECK-LABEL: f21:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vuplf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLLF (used operand elements are 0)
+define <2 x i64> @f22() {
+; CHECK-LABEL: f22:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vupllf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; VUPLLF (used operand elements are 1)
+define <2 x i64> @f23() {
+; CHECK-LABEL: f23:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI
+; CHECK-NEXT: vl %v0, 0(%r1)
+; CHECK-NEXT: vupllf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vupllf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ %and = and <2 x i64> %unp, <i64 1, i64 1>
+ ret <2 x i64> %and
+}
+
+; High-part tests: the 'and' mask keeps only the high 32 bits of each
+; doubleword. Sign unpack of known-positive elements gives known-zero high
+; bits (result folds to vgbm 0); of known-negative elements, known-one high
+; bits (vgbm 61680 = 0xf0f0 sets exactly those byte positions); logical
+; unpack always gives known-zero high bits.
+
+; Test that signed unpacking of positive elements gives known zeros in high part.
+define <2 x i64> @f24() {
+; CHECK-LABEL: f24:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
+ i64 -4294967296>
+ ret <2 x i64> %and
+}
+
+; Test that signed unpacking of negative elements gives known ones in high part.
+define <2 x i64> @f25() {
+; CHECK-LABEL: f25:
+; CHECK-LABEL: # %bb.0:
+; 61680 = 0xf0f0
+; CHECK-NEXT: vgbm %v24, 61680
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 -1, i32 -1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
+ i64 -4294967296>
+ ret <2 x i64> %and
+}
+
+; Test that logical unpacking of negative elements gives known zeros in high part.
+define <2 x i64> @f26() {
+; CHECK-LABEL: f26:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 -1, i32 -1, i32 0, i32 0>)
+ %and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
+ i64 -4294967296>
+ ret <2 x i64> %and
+}
--- /dev/null
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode().
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+; SystemZISD::REPLICATE
+define i32 @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vlgvf
+; The CHECK-NOTs verify that no scalar compare / conditional-load sequence
+; is emitted to recompute what is already known about the extracted element.
+; CHECK-NOT: lhi %r2, 0
+; CHECK-NOT: chi %r0, 0
+; CHECK-NOT: lochilh %r2, 1
+; CHECK: br %r14
+ %cmp0 = icmp ne <4 x i32> undef, zeroinitializer
+ %zxt0 = zext <4 x i1> %cmp0 to <4 x i32>
+ %ext0 = extractelement <4 x i32> %zxt0, i32 3
+ br label %exit
+
+exit:
+; The vector icmp+zext involves a REPLICATE of 1's. If KnownBits reflects
+; this, DAGCombiner can see that the i32 icmp and zext here are not needed.
+ %cmp1 = icmp ne i32 %ext0, 0
+ %zxt1 = zext i1 %cmp1 to i32
+ ret i32 %zxt1
+}
+
+; SystemZISD::JOIN_DWORDS (and REPLICATE)
+define void @f1() {
+; The DAG XOR has JOIN_DWORDS and REPLICATE operands. With KnownBits properly set
+; for both these nodes, ICMP is used instead of TM during lowering because
+; adjustForRedundantAnd() succeeds.
+; NOTE(review): the load from a null pointer looks like a reduced test case;
+; only the compare/branch selection is of interest here.
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NOT: tmll
+; CHECK-NOT: jne
+; CHECK: cijlh
+ %1 = load i16, i16* null, align 2
+ %2 = icmp eq i16 %1, 0
+ %3 = insertelement <2 x i1> undef, i1 %2, i32 0
+ %4 = insertelement <2 x i1> %3, i1 true, i32 1
+ %5 = xor <2 x i1> %4, <i1 true, i1 true>
+ %6 = extractelement <2 x i1> %5, i32 0
+ %7 = or i1 %6, undef
+ br i1 %7, label %9, label %8
+
+; <label>:8: ; preds = %0
+ unreachable
+
+; <label>:9: ; preds = %0
+ unreachable
+}
--- /dev/null
+; Test that DAGCombiner gets helped by ComputeNumSignBitsForTargetNode() with
+; vector intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>)
+
+; In each test the pack result must be reported with enough sign bits that
+; the trunc + sext pair folds away, leaving only the pack instruction
+; (verified by CHECK-NEXT: br %r14 immediately after it).
+
+; PACKS_CC: i64 -> i32
+define <4 x i32> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksgs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 0, i64 1>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %trunc = trunc <4 x i32> %extr to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKS_CC: i32 -> i16
+define <8 x i16> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %trunc = trunc <8 x i16> %extr to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKS_CC: i16 -> i8
+define <16 x i8> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpkshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %trunc = trunc <16 x i8> %extr to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>)
+declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>)
+declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>)
+
+; Logical saturating packs with CC result (PACKLS_CC): the trunc + sext
+; pairs should fold away, leaving only the pack instruction.
+
+; PACKLS_CC: i64 -> i32
+define <4 x i32> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsgs %v24, %v1, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
+ %extr = extractvalue {<4 x i32>, i32} %call, 0
+ %trunc = trunc <4 x i32> %extr to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKLS_CC: i32 -> i16
+define <8 x i16> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsfs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %extr = extractvalue {<8 x i16>, i32} %call, 0
+ %trunc = trunc <8 x i16> %extr to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKLS_CC: i16 -> i8
+define <16 x i8> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklshs %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %extr = extractvalue {<16 x i8>, i32} %call, 0
+ %trunc = trunc <16 x i8> %extr to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>)
+
+; Saturating packs without CC result (PACKS): the trunc + sext pairs should
+; fold away, leaving only the pack instruction.
+
+; PACKS: i64 -> i32
+define <4 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksg %v24, %v1, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
+ %trunc = trunc <4 x i32> %call to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKS: i32 -> i16
+define <8 x i16> @f7() {
+; CHECK-LABEL: f7:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %trunc = trunc <8 x i16> %call to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKS: i16 -> i8
+define <16 x i8> @f8() {
+; CHECK-LABEL: f8:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpksh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpksh(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %trunc = trunc <16 x i8> %call to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>)
+
+; Logical packs without CC result (PACKLS): the trunc + sext pairs should
+; fold away, leaving only the pack instruction.
+
+; PACKLS: i64 -> i32
+define <4 x i32> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsg %v24, %v1, %v0
+; CHECK-NEXT: br %r14
+ %call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
+ %trunc = trunc <4 x i32> %call to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; PACKLS: i32 -> i16
+define <8 x i16> @f10() {
+; CHECK-LABEL: f10:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsf %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
+ <4 x i32> <i32 0, i32 1, i32 1, i32 0>)
+ %trunc = trunc <8 x i16> %call to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+; PACKLS: i16 -> i8
+define <16 x i8> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpklsh %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+ %call = call <16 x i8> @llvm.s390.vpklsh(
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
+ <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
+ %trunc = trunc <16 x i8> %call to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+; Element-rearranging operations (VPDI / VSLDB / VPERM): sign-bit counts of
+; the source elements must be tracked through the shuffle, so the trunc +
+; sext pairs fold away here as well.
+
+declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32)
+
+; VPDI:
+define <2 x i64> @f12() {
+; CHECK-LABEL: f12:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vpdi %v24, %v1, %v0, 0
+; CHECK-NEXT: br %r14
+ %perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 0, i64 1>,
+ <2 x i64> <i64 1, i64 0>, i32 0)
+ %trunc = trunc <2 x i64> %perm to <2 x i32>
+ %ret = sext <2 x i32> %trunc to <2 x i64>
+ ret <2 x i64> %ret
+}
+
+declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32)
+
+; VSLDB:
+define <16 x i8> @f13() {
+; CHECK-LABEL: f13:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vsldb %v24, %v0, %v0, 1
+; CHECK-NEXT: br %r14
+ %shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
+ <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>, <16 x i8>
+ <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
+ i32 1)
+ %trunc = trunc <16 x i8> %shfd to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
+
+declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>)
+
+; Test VPERM:
+define <16 x i8> @f14() {
+; CHECK-LABEL: f14:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vperm %v24, %v0, %v0, %v0
+; CHECK-NEXT: br %r14
+ %perm = call <16 x i8> @llvm.s390.vperm(
+ <16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
+ <16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
+ i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>)
+ %trunc = trunc <16 x i8> %perm to <16 x i4>
+ %ret = sext <16 x i4> %trunc to <16 x i8>
+ ret <16 x i8> %ret
+}
--- /dev/null
+; Test that DAGCombiner gets helped by ComputeNumSignBitsForTargetNode() with
+; vector intrinsics.
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
+
+; The unpacked source elements are 0 or 1, so each trunc + sext pair should
+; fold away, leaving only the unpack instruction (checked by CHECK-NEXT:
+; br %r14 right after it).
+
+declare <8 x i16> @llvm.s390.vuphb(<16 x i8>)
+
+; VUPHB
+define <8 x i16> @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuphb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
+ <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1,
+ i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %trunc = trunc <8 x i16> %unp to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+declare <4 x i32> @llvm.s390.vuphh(<8 x i16>)
+
+; VUPHH
+define <4 x i32> @f1() {
+; CHECK-LABEL: f1:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuphh %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
+ <i16 0, i16 1, i16 0, i16 1,
+ i16 0, i16 1, i16 0, i16 1>)
+ %trunc = trunc <4 x i32> %unp to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+declare <2 x i64> @llvm.s390.vuphf(<4 x i32>)
+
+; VUPHF
+define <2 x i64> @f2() {
+; CHECK-LABEL: f2:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuphf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 0, i32 1, i32 0, i32 1>)
+ %trunc = trunc <2 x i64> %unp to <2 x i32>
+ %ret = sext <2 x i32> %trunc to <2 x i64>
+ ret <2 x i64> %ret
+}
+
+; Low-half unpack variants (VUPLB / VUPLHW / VUPLF): the trunc + sext pairs
+; should fold away here as well.
+
+declare <8 x i16> @llvm.s390.vuplb(<16 x i8>)
+
+; VUPLB
+define <8 x i16> @f3() {
+; CHECK-LABEL: f3:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuplb %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
+ <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1,
+ i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %trunc = trunc <8 x i16> %unp to <8 x i8>
+ %ret = sext <8 x i8> %trunc to <8 x i16>
+ ret <8 x i16> %ret
+}
+
+declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>)
+
+; VUPLHW
+define <4 x i32> @f4() {
+; CHECK-LABEL: f4:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuplhw %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
+ <i16 1, i16 0, i16 1, i16 0,
+ i16 1, i16 0, i16 1, i16 0>)
+ %trunc = trunc <4 x i32> %unp to <4 x i16>
+ %ret = sext <4 x i16> %trunc to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+declare <2 x i64> @llvm.s390.vuplf(<4 x i32>)
+
+; VUPLF
+define <2 x i64> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-LABEL: # %bb.0:
+; CHECK: vuplf %v24, %v0
+; CHECK-NEXT: br %r14
+ %unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 1, i32 0, i32 1, i32 0>)
+ %trunc = trunc <2 x i64> %unp to <2 x i32>
+ %ret = sext <2 x i32> %trunc to <2 x i64>
+ ret <2 x i64> %ret
+}
+
--- /dev/null
+; Test that ComputeNumSignBitsForTargetNode() (SELECT_CCMASK) will help
+; DAGCombiner so that it knows that %sel0 is already sign extended.
+;
+; -debug-only=isel is only available in builds with assertions enabled, so
+; this test must not run against a release (NDEBUG) llc.
+; REQUIRES: asserts
+;
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -debug-only=isel < %s 2>&1 | FileCheck %s
+
+%0 = type <{ %1*, i16, [6 x i8] }>
+%1 = type { i32 (...)** }
+
+define signext i16 @fun(%0* %Arg0, i16 signext %Arg1) {
+entry:
+ br i1 undef, label %lab0, label %lab1
+
+lab0:
+ %icmp0 = icmp eq i32 undef, 0
+ %sel0 = select i1 %icmp0, i16 %Arg1, i16 1
+ br label %lab1
+
+lab1:
+; With the select (SELECT_CCMASK) known to be sign extended, no LHR (sign
+; extension of %phi0) should appear in the dumped pre-emit MIR for this block.
+; CHECK: *** MachineFunction at end of ISel ***
+; CHECK-LABEL: bb.2.lab1:
+; CHECK-NOT: LHR
+; CHECK: BRC
+ %phi0 = phi i16 [ 2, %entry ], [ %sel0, %lab0 ]
+ %sext0 = sext i16 %phi0 to i32
+ br i1 undef, label %lab2, label %lab3
+
+lab2:
+ %and0 = and i32 %sext0, 8
+ %icmp1 = icmp eq i32 %and0, 0
+ %sel1 = select i1 %icmp1, i16 %phi0, i16 4
+ ret i16 %sel1
+
+lab3:
+ ret i16 8
+}
+
; CHECK-NEXT: vlvgf [[REG2]], [[REG3]], 2
; CHECK-NEXT: vn [[REG2]], [[REG2]], [[REG0]]
; CHECK-NEXT: vlgvf [[REG4:%r[0-9]]], [[REG2]], 3
-; CHECK-NEXT: tmll [[REG4]], 1
-; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: cijlh [[REG4]], 0, .LBB0_1
; CHECK-NEXT: # %bb.2: # %CF36
; CHECK-NEXT: br %r14
BB: