[X86][AVX2] Add support for target shuffle combining to BROADCAST

author Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 5 Jul 2016 20:11:29 +0000 (20:11 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 5 Jul 2016 20:11:29 +0000 (20:11 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 5 Jul 2016 20:11:29 +0000 (20:11 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 5 Jul 2016 20:11:29 +0000 (20:11 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d6a3690..4dad0c1 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -24717,13 +24717,10 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
      return true;
    }
  
-  if (!FloatDomain)
-    return false;
-
    // Check if we have SSE3 which will let us use MOVDDUP etc. The
    // instructions are no slower than UNPCKLPD but has the option to
    // fold the input operand into even an unaligned memory load.
-  if (SrcVT.is128BitVector() && Subtarget.hasSSE3()) {
+  if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
      if (isTargetShuffleEquivalent(Mask, {0, 0})) {
        Shuffle = X86ISD::MOVDDUP;
        ShuffleVT = MVT::v2f64;
@@ -24741,7 +24738,7 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
      }
    }
  
-  if (SrcVT.is256BitVector()) {
+  if (SrcVT.is256BitVector() && FloatDomain) {
      assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
      if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
        Shuffle = X86ISD::MOVDDUP;
@@ -24760,7 +24757,7 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
      }
    }
  
-  if (SrcVT.is512BitVector()) {
+  if (SrcVT.is512BitVector() && FloatDomain) {
      assert(Subtarget.hasAVX512() &&
             "AVX512 required for 512-bit vector shuffles");
      if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -24782,6 +24779,23 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
      }
    }
  
+  // Attempt to match against broadcast-from-vector.
+  if (Subtarget.hasAVX2()) {
+    for (MVT SVT :
+         {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64}) {
+      if (FloatDomain != SVT.isFloatingPoint())
+        continue;
+
+      unsigned NumElts = SrcVT.getSizeInBits() / SVT.getSizeInBits();
+      SmallVector<int, 64> BroadcastMask(NumElts, 0);
+      if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+        Shuffle = X86ISD::VBROADCAST;
+        ShuffleVT = MVT::getVectorVT(SVT, NumElts);
+        return true;
+      }
+    }
+  }
+
    return false;
  }
  
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

index b6e66c9..8324c58 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -3,6 +3,7 @@
  
  declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
  declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
  declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
  
  define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
@@ -59,3 +60,126 @@ define <4 x i64> @combine_permq_pshufb(<4 x i64> %a0) {
    %4 = bitcast <32 x i8> %3 to <4 x i64>
    ret <4 x i64> %4
  }
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = bitcast <4 x i64> %1 to <32 x i8>
+  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
+  %4 = bitcast <32 x i8> %3 to <8 x i32>
+  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+  %6 = bitcast <8 x i32> %5 to <32 x i8>
+  ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = bitcast <4 x i64> %1 to <32 x i8>
+  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+  %4 = bitcast <32 x i8> %3 to <8 x i32>
+  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+  %6 = bitcast <8 x i32> %5 to <32 x i8>
+  ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastd128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm0
+; CHECK-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
+  ret <16 x i8> %2
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastd256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
+  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i32> %3
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %1
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastq256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i32> %3
+}
+
+define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = bitcast <4 x float> %a to <16 x i8>
+  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+  %3 = bitcast <16 x i8> %2 to <4 x float>
+  ret <4 x float> %3
+}
+
+define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastss256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
+  ret <8 x float> %2
+}
+
+define <4 x double> @combine_permd_as_vpbroadcastsd256(<2 x double> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastsd256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = bitcast <4 x double> %1 to <8 x float>
+  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+  %4 = bitcast <8 x float> %3 to <4 x double>
+  ret <4 x double> %4
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

index 4a80663..751ee52 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -5,6 +5,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x
  
  declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
  declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
  
  declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
  declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
@@ -369,3 +371,31 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
    %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
    ret <64 x i8> %res1
  }
+
+define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastw512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %1
+}
+
+define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1)
+  ret <8 x i64> %1
+}
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 5 Jul 2016 20:11:29 +0000 (20:11 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 5 Jul 2016 20:11:29 +0000 (20:11 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/vector-shuffle-combining-avx2.ll		patch | blob | history
test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll		patch | blob | history