[InstCombine][AVX2] Combine VPERMD/VPERMPS intrinsics with constant masks to shufflevector

author Simon Pilgrim <llvm-dev@redking.me.uk>

Sun, 1 May 2016 16:41:22 +0000 (16:41 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Sun, 1 May 2016 16:41:22 +0000 (16:41 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Sun, 1 May 2016 16:41:22 +0000 (16:41 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Sun, 1 May 2016 16:41:22 +0000 (16:41 +0000)
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp

index 10be4b9..307bb9f 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -681,6 +681,37 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
    return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
  }
  
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  auto *V = dyn_cast<Constant>(II.getArgOperand(1)); // Operand 1: index mask.
+  if (!V)
+    return nullptr; // Non-constant mask: nothing to fold, caller keeps the call.
+
+  VectorType *VecTy = cast<VectorType>(II.getType());
+  unsigned Size = VecTy->getNumElements();
+  assert(Size == 8 && "Unexpected shuffle mask size");
+
+  // Initialize the resulting shuffle mask to all zeroes.
+  uint32_t Indexes[8] = {0};
+
+  for (unsigned I = 0; I < Size; ++I) {
+    Constant *COp = V->getAggregateElement(I);
+    if (!COp || !isa<ConstantInt>(COp)) // Any lane that is not a ConstantInt
+      return nullptr;                   // aborts the whole fold.
+
+    APInt Index = cast<ConstantInt>(COp)->getValue();
+    Index = Index.getLoBits(3); // Keep only the low 3 bits: index is in [0, 7].
+    Indexes[I] = (uint32_t)Index.getZExtValue();
+  }
+
+  auto ShuffleMask =
+      ConstantDataVector::get(II.getContext(), makeArrayRef(Indexes, Size));
+  auto V1 = II.getArgOperand(0);    // Operand 0: the vector being permuted.
+  auto V2 = UndefValue::get(VecTy); // Never selected: all indices are < 8.
+  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
  /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
  /// source vectors, unless a zero bit is set. If a zero bit is set,
  /// then ignore that half of the mask and clear that half of the vector.
@@ -1751,6 +1782,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
        return replaceInstUsesWith(*II, V);
      break;
  
+  case Intrinsic::x86_avx2_permd:
+  case Intrinsic::x86_avx2_permps:
+    if (Value *V = simplifyX86vpermv(*II, *Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
    case Intrinsic::x86_avx_vperm2f128_pd_256:
    case Intrinsic::x86_avx_vperm2f128_ps_256:
    case Intrinsic::x86_avx_vperm2f128_si_256:
diff --git a/test/Transforms/InstCombine/x86-avx2.ll b/test/Transforms/InstCombine/x86-avx2.ll

index ef6d4e6..8d1fd89 100644 (file)
--- a/test/Transforms/InstCombine/x86-avx2.ll
+++ b/test/Transforms/InstCombine/x86-avx2.ll
@@ -2,12 +2,11 @@
  ; RUN: opt < %s -instcombine -S | FileCheck %s
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  
-; FIXME: Verify that instcombine is able to fold identity shuffles.
+; Verify that instcombine is able to fold identity shuffles.
  
  define <8 x i32> @identity_test_vpermd(<8 x i32> %a0) {
  ; CHECK-LABEL: @identity_test_vpermd(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A:%.*]]0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
-; CHECK-NEXT:    ret <8 x i32> [[A]]
+; CHECK-NEXT:    ret <8 x i32> %a0
  ;
    %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
    ret <8 x i32> %a
@@ -15,20 +14,19 @@ define <8 x i32> @identity_test_vpermd(<8 x i32> %a0) {
  
  define <8 x float> @identity_test_vpermps(<8 x float> %a0) {
  ; CHECK-LABEL: @identity_test_vpermps(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A:%.*]]0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
-; CHECK-NEXT:    ret <8 x float> [[A]]
+; CHECK-NEXT:    ret <8 x float> %a0
  ;
    %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
    ret <8 x float> %a
  }
  
-; FIXME: Instcombine should be able to fold the following shuffle to a builtin shufflevector
+; Instcombine should be able to fold the following shuffle to a builtin shufflevector
  ; with a shuffle mask of all zeroes.
  
  define <8 x i32> @zero_test_vpermd(<8 x i32> %a0) {
  ; CHECK-LABEL: @zero_test_vpermd(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A:%.*]]0, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
  ;
    %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer)
    ret <8 x i32> %a
@@ -36,19 +34,19 @@ define <8 x i32> @zero_test_vpermd(<8 x i32> %a0) {
  
  define <8 x float> @zero_test_vpermps(<8 x float> %a0) {
  ; CHECK-LABEL: @zero_test_vpermps(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A:%.*]]0, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x float> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
  ;
    %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
    ret <8 x float> %a
  }
  
-; FIXME: Verify that instcombine is able to fold constant shuffles.
+; Verify that instcombine is able to fold constant shuffles.
  
  define <8 x i32> @shuffle_test_vpermd(<8 x i32> %a0) {
  ; CHECK-LABEL: @shuffle_test_vpermd(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A:%.*]]0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
-; CHECK-NEXT:    ret <8 x i32> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
  ;
    %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
    ret <8 x i32> %a
@@ -56,8 +54,8 @@ define <8 x i32> @shuffle_test_vpermd(<8 x i32> %a0) {
  
  define <8 x float> @shuffle_test_vpermps(<8 x float> %a0) {
  ; CHECK-LABEL: @shuffle_test_vpermps(
-; CHECK-NEXT:    [[A:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A:%.*]]0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
-; CHECK-NEXT:    ret <8 x float> [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
  ;
    %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
    ret <8 x float> %a
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sun, 1 May 2016 16:41:22 +0000 (16:41 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sun, 1 May 2016 16:41:22 +0000 (16:41 +0000)
lib/Transforms/InstCombine/InstCombineCalls.cpp		patch \| blob \| history
test/Transforms/InstCombine/x86-avx2.ll		patch \| blob \| history