[X86][SSE] Avoid duplicate shuffle input sources in combineX86ShufflesRecursively

author Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 14 Aug 2018 17:22:37 +0000 (17:22 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 14 Aug 2018 17:22:37 +0000 (17:22 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 14 Aug 2018 17:22:37 +0000 (17:22 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 14 Aug 2018 17:22:37 +0000 (17:22 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 0c7d4bc..cff91d6 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -30279,23 +30279,26 @@ static SDValue combineX86ShufflesRecursively(
    // Add the inputs to the Ops list, avoiding duplicates.
    SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
  
-  int InputIdx0 = -1, InputIdx1 = -1;
-  for (int i = 0, e = Ops.size(); i < e; ++i) {
-    SDValue BC = peekThroughBitcasts(Ops[i]);
-    if (Input0 && BC == peekThroughBitcasts(Input0))
-      InputIdx0 = i;
-    if (Input1 && BC == peekThroughBitcasts(Input1))
-      InputIdx1 = i;
-  }
+  auto AddOp = [&Ops](SDValue Input, int InsertionPoint = -1) -> int {
+    if (!Input)
+      return -1;
+    // Attempt to find an existing match.
+    SDValue InputBC = peekThroughBitcasts(Input);
+    for (int i = 0, e = Ops.size(); i < e; ++i)
+      if (InputBC == peekThroughBitcasts(Ops[i]))
+        return i;
+    // Match failed - should we replace an existing Op?
+    if (InsertionPoint >= 0) {
+      Ops[InsertionPoint] = Input;
+      return InsertionPoint;
+    }
+    // Add to the end of the Ops list.
+    Ops.push_back(Input);
+    return Ops.size() - 1;
+  };
  
-  if (Input0 && InputIdx0 < 0) {
-    InputIdx0 = SrcOpIndex;
-    Ops[SrcOpIndex] = Input0;
-  }
-  if (Input1 && InputIdx1 < 0) {
-    InputIdx1 = Ops.size();
-    Ops.push_back(Input1);
-  }
+  int InputIdx0 = AddOp(Input0, SrcOpIndex);
+  int InputIdx1 = AddOp(Input1);
  
    assert(((RootMask.size() > OpMask.size() &&
             RootMask.size() % OpMask.size() == 0) ||
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

index 05d19f3..122c77a 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -923,18 +923,12 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8>
  define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
  ; X32-LABEL: combine_pshufb_pshufb_or_pshufb:
  ; X32:       # %bb.0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
-; X32-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; X32-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: combine_pshufb_pshufb_or_pshufb:
  ; X64:       # %bb.0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
-; X64-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
  ; X64-NEXT:    retq
    %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
    %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll

index b67f9e2..a1316eb 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -707,20 +707,23 @@ define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8>
  define <16 x i8> @combine_pshufb_pshufb_or_pshufb(<16 x i8> %a0) {
  ; SSE-LABEL: combine_pshufb_pshufb_or_pshufb:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
  ; SSE-NEXT:    retq
  ;
-; AVX-LABEL: combine_pshufb_pshufb_or_pshufb:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vbroadcastss %xmm0, %xmm0
+; AVX512F-NEXT:    retq
    %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
    %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
    %3 = or <16 x i8> %1, %2
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 14 Aug 2018 17:22:37 +0000 (17:22 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 14 Aug 2018 17:22:37 +0000 (17:22 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-avx2.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-ssse3.ll		patch \| blob \| history