[X86] Rewrite LowerAVXCONCAT_VECTORS similar to how we handle vXi1 concats.

author Craig Topper <craig.topper@intel.com>

Tue, 13 Mar 2018 22:05:25 +0000 (22:05 +0000)

committer Craig Topper <craig.topper@intel.com>

Tue, 13 Mar 2018 22:05:25 +0000 (22:05 +0000)
author Craig Topper <craig.topper@intel.com>
Tue, 13 Mar 2018 22:05:25 +0000 (22:05 +0000)
committer Craig Topper <craig.topper@intel.com>
Tue, 13 Mar 2018 22:05:25 +0000 (22:05 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index df9d5bc..e316b7b 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5075,12 +5075,6 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
    return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
  }
  
-static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
-                                  SelectionDAG &DAG, const SDLoc &dl) {
-  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
-  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
-}
-
  /// Widen a vector to a larger size with the same scalar type, with the new
  /// elements either zero or undef.
  static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
@@ -5291,24 +5285,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }
  
-/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
-/// instructions. This is used because creating CONCAT_VECTOR nodes of
-/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
-/// large BUILD_VECTORS.
-static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
-                                   unsigned NumElems, SelectionDAG &DAG,
-                                   const SDLoc &dl) {
-  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
-  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
-}
-
-static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
-                                   unsigned NumElems, SelectionDAG &DAG,
-                                   const SDLoc &dl) {
-  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
-  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
-}
-
  static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
                                  unsigned NumElems, SelectionDAG &DAG,
                                  const SDLoc &dl, unsigned VectorWidth) {
@@ -8609,30 +8585,63 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
  // 256-bit AVX can use the vinsertf128 instruction
  // to create 256-bit vectors from two other 128-bit ones.
-static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
    SDLoc dl(Op);
    MVT ResVT = Op.getSimpleValueType();
  
    assert((ResVT.is256BitVector() ||
            ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
  
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  unsigned NumElems = ResVT.getVectorNumElements();
-  if (ResVT.is256BitVector())
-    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+  unsigned NumOperands = Op.getNumOperands();
+  unsigned NumZero = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  for (unsigned i = 0; i != NumOperands; ++i) {
+    SDValue SubVec = Op.getOperand(i);
+    if (SubVec.isUndef())
+      continue;
+    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+      ++NumZero;
+    else {
+      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+      NonZeros |= 1 << i;
+      ++NumNonZero;
+    }
+  }
+
+  // If there are zero or one non-zeros we can handle this very simply.
+  if (NumNonZero <= 1) {
+    SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
+                          : DAG.getUNDEF(ResVT);
+    if (!NumNonZero)
+      return Vec;
+    unsigned Idx = countTrailingZeros(NonZeros);
+    SDValue SubVec = Op.getOperand(Idx);
+    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
+                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
+  }
  
-  if (Op.getNumOperands() == 4) {
+  if (NumOperands > 2) {
      MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                    ResVT.getVectorNumElements()/2);
-    SDValue V3 = Op.getOperand(2);
-    SDValue V4 = Op.getOperand(3);
-    return concat256BitVectors(
-        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
-        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
-        NumElems, DAG, dl);
+    ArrayRef<SDUse> Ops = Op->ops();
+    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+                             Ops.slice(0, NumOperands/2));
+    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+                             Ops.slice(NumOperands/2));
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
    }
-  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+
+  assert(NumNonZero == 2 && "Simple cases not handled?");
+
+  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
+                            DAG.getUNDEF(ResVT), Op.getOperand(0),
+                            DAG.getIntPtrConstant(0, dl));
+  unsigned NumElems = ResVT.getVectorNumElements();
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
+                     DAG.getIntPtrConstant(NumElems/2, dl));
  }
  
  // Return true if all the operands of the given CONCAT_VECTORS node are zeros
@@ -8689,6 +8698,7 @@ static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
    return SDValue();
  }
  
+// TODO: Merge this with LowerAVXCONCAT_VECTORS?
  static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG & DAG) {
@@ -8775,7 +8785,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
    // from two other 128-bit ones.
  
    // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
-  return LowerAVXCONCAT_VECTORS(Op, DAG);
+  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
  }
  
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td

index 42da15e..663f2d1 100644 (file)
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -3871,9 +3871,7 @@ multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
  
  def : Pat<(masked_store addr:$dst, Mask,
               (_.info512.VT (insert_subvector undef,
-                               (_.info256.VT (insert_subvector undef,
-                                                 (_.info128.VT _.info128.RC:$src),
-                                                 (iPTR 0))),
+                               (_.info128.VT _.info128.RC:$src),
                                 (iPTR 0)))),
            (!cast<Instruction>(InstrStr#mrk) addr:$dst,
                        (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
@@ -3888,9 +3886,7 @@ multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
  
  def : Pat<(masked_store addr:$dst, Mask,
               (_.info512.VT (insert_subvector undef,
-                               (_.info256.VT (insert_subvector undef,
-                                                 (_.info128.VT _.info128.RC:$src),
-                                                 (iPTR 0))),
+                               (_.info128.VT _.info128.RC:$src),
                                 (iPTR 0)))),
            (!cast<Instruction>(InstrStr#mrk) addr:$dst,
                        (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -3913,9 +3909,7 @@ def : Pat<(_.info128.VT (extract_subvector
  def : Pat<(_.info128.VT (extract_subvector
                  (_.info512.VT (masked_load addr:$srcAddr, Mask,
                        (_.info512.VT (insert_subvector undef,
-                            (_.info256.VT (insert_subvector undef,
-                                  (_.info128.VT (X86vzmovl _.info128.RC:$src)),
-                                  (iPTR 0))),
+                            (_.info128.VT (X86vzmovl _.info128.RC:$src)),
                              (iPTR 0))))),
                  (iPTR 0))),
            (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
@@ -3941,9 +3935,7 @@ def : Pat<(_.info128.VT (extract_subvector
  def : Pat<(_.info128.VT (extract_subvector
                  (_.info512.VT (masked_load addr:$srcAddr, Mask,
                        (_.info512.VT (insert_subvector undef,
-                            (_.info256.VT (insert_subvector undef,
-                                  (_.info128.VT (X86vzmovl _.info128.RC:$src)),
-                                  (iPTR 0))),
+                            (_.info128.VT (X86vzmovl _.info128.RC:$src)),
                              (iPTR 0))))),
                  (iPTR 0))),
            (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll

index 2758353..e0715b5 100644 (file)
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1764,15 +1764,11 @@ define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
  ; X32-LABEL: test_mm512_zextps128_ps512:
  ; X32:       # %bb.0:
  ; X32-NEXT:    vmovaps %xmm0, %xmm0
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: test_mm512_zextps128_ps512:
  ; X64:       # %bb.0:
  ; X64-NEXT:    vmovaps %xmm0, %xmm0
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
  ; X64-NEXT:    retq
    %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
    ret <16 x float> %res
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

index 8cff13a..a9aff5b 100644 (file)
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -3054,7 +3054,6 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x fl
  define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
  ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
  ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
  ; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
  ; CHECK-NEXT:    kmovw %edi, %k1
  ; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
@@ -3075,7 +3074,6 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>,
  define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
  ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
  ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
  ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3
  ; CHECK-NEXT:    kmovw %edi, %k1
  ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
author	Craig Topper <craig.topper@intel.com>
	Tue, 13 Mar 2018 22:05:25 +0000 (22:05 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Tue, 13 Mar 2018 22:05:25 +0000 (22:05 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86InstrAVX512.td		patch \| blob \| history
test/CodeGen/X86/avx512-intrinsics-fast-isel.ll		patch \| blob \| history
test/CodeGen/X86/avx512-intrinsics-upgrade.ll		patch \| blob \| history