[X86][SSE] Add faux shuffle combining support for PACKUS

author Simon Pilgrim <llvm-dev@redking.me.uk>

Sun, 1 Oct 2017 18:43:48 +0000 (18:43 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Sun, 1 Oct 2017 18:43:48 +0000 (18:43 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Sun, 1 Oct 2017 18:43:48 +0000 (18:43 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Sun, 1 Oct 2017 18:43:48 +0000 (18:43 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index e68a6ed..6abf46f 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5931,7 +5931,8 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
        Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
      return true;
    }
-  case X86ISD::PACKSS: {
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS: {
      SDValue N0 = N.getOperand(0);
      SDValue N1 = N.getOperand(1);
      assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
@@ -5940,9 +5941,19 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
  
      // If we know input saturation won't happen we can treat this
      // as a truncation shuffle.
-    if (DAG.ComputeNumSignBits(N0) <= NumBitsPerElt ||
-        DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)
-      return false;
+    if (Opcode == X86ISD::PACKSS) {
+      if (DAG.ComputeNumSignBits(N0) <= NumBitsPerElt ||
+          DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)
+        return false;
+    } else {
+      KnownBits Known0, Known1;
+      DAG.computeKnownBits(N0, Known0);
+      if (Known0.countMinLeadingZeros() < NumBitsPerElt)
+        return false;
+      DAG.computeKnownBits(N1, Known1);
+      if (Known1.countMinLeadingZeros() < NumBitsPerElt)
+        return false;
+    }
  
      bool IsUnary = (N0 == N1);
      unsigned Offset = IsUnary ? 0 : NumElts;
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

index 69a6eb7..f0c7ae3 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -846,16 +846,12 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn
  define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
  ; X32-LABEL: shuffle_combine_packusdw_pshufb:
  ; X32:       # BB#0:
-; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X32-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: shuffle_combine_packusdw_pshufb:
  ; X64:       # BB#0:
-; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X64-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
  ; X64-NEXT:    retq
    %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
@@ -867,18 +863,12 @@ declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readno
  define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
  ; X32-LABEL: shuffle_combine_packuswb_pshufb:
  ; X32:       # BB#0:
-; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; X32-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; X32-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,23,22,21,20,19,18,17,16,23,22,21,20,19,18,17,16]
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: shuffle_combine_packuswb_pshufb:
  ; X64:       # BB#0:
-; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; X64-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; X64-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,23,22,21,20,19,18,17,16,23,22,21,20,19,18,17,16]
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
  ; X64-NEXT:    retq
    %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll

index 874d090..3abf057 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -683,18 +683,12 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
  define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
  ; SSE-LABEL: shuffle_combine_packuswb_pshufb:
  ; SSE:       # BB#0:
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm1
-; SSE-NEXT:    packuswb %xmm1, %xmm0
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0]
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: shuffle_combine_packuswb_pshufb:
  ; AVX:       # BB#0:
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
  ; AVX-NEXT:    retq
    %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sun, 1 Oct 2017 18:43:48 +0000 (18:43 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sun, 1 Oct 2017 18:43:48 +0000 (18:43 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-avx2.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-ssse3.ll		patch \| blob \| history