/// modes that operate across loop iterations.
bool shouldFavorBackedgeIndex(const Loop *L) const;
- /// Return true if the target supports masked load/store
- /// AVX2 and AVX-512 targets allow masks for consecutive load and store
+ /// Return true if the target supports masked store.
bool isLegalMaskedStore(Type *DataType) const;
+ /// Return true if the target supports masked load.
bool isLegalMaskedLoad(Type *DataType) const;
- /// Return true if the target supports masked gather/scatter
- /// AVX-512 fully supports gather and scatter for vectors with 32 and 64
- /// bits scalar type.
+ /// Return true if the target supports masked scatter.
bool isLegalMaskedScatter(Type *DataType) const;
+ /// Return true if the target supports masked gather.
bool isLegalMaskedGather(Type *DataType) const;
+ /// Return true if the target supports masked compress store.
+ bool isLegalMaskedCompressStore(Type *DataType) const;
+ /// Return true if the target supports masked expand load.
+ bool isLegalMaskedExpandLoad(Type *DataType) const;
+
/// Return true if the target has a unified operation to calculate division
/// and remainder. If so, the additional implicit multiplication and
/// subtraction required to calculate a remainder from division are free. This
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
virtual bool isLegalMaskedGather(Type *DataType) = 0;
+ virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
+ virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
virtual bool prefersVectorizedAddressing() = 0;
bool isLegalMaskedGather(Type *DataType) override {
return Impl.isLegalMaskedGather(DataType);
}
+ bool isLegalMaskedCompressStore(Type *DataType) override {
+ return Impl.isLegalMaskedCompressStore(DataType);
+ }
+ bool isLegalMaskedExpandLoad(Type *DataType) override {
+ return Impl.isLegalMaskedExpandLoad(DataType);
+ }
bool hasDivRemOp(Type *DataType, bool IsSigned) override {
return Impl.hasDivRemOp(DataType, IsSigned);
}
bool isLegalMaskedGather(Type *DataType) { return false; }
+ bool isLegalMaskedCompressStore(Type *DataType) { return false; }
+
+ bool isLegalMaskedExpandLoad(Type *DataType) { return false; }
+
bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; }
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; }
return TTIImpl->isLegalMaskedScatter(DataType);
}
+bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
+ return TTIImpl->isLegalMaskedCompressStore(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
+ return TTIImpl->isLegalMaskedExpandLoad(DataType);
+}
+
bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
return TTIImpl->hasDivRemOp(DataType, IsSigned);
}
ModifiedDT = true;
}
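+
+// Translate llvm.masked.expandload into a chain of per-lane conditional
+// blocks: when a lane's mask bit is set, load one scalar from the current
+// pointer, insert it into the result vector, and advance the pointer by one
+// element; when the bit is clear, the lane keeps its pass-through value and
+// the pointer is left unchanged.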
+static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Mask = CI->getArgOperand(1);
+ Value *PassThru = CI->getArgOperand(2);
+
+ VectorType *VecType = cast<VectorType>(CI->getType());
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // The result vector
+ Value *VResult = PassThru;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // br i1 %mask_1, label %cond.load, label %else
+ //
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32, i32* %1, i32 0
+ // %Elt = load i32, i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+ "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, 1);
+ Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
+
+ // Move the pointer if there are more blocks to come.
+ Value *NewPtr;
+ if ((Idx + 1) != VectorWidth)
+ NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Create the phi to join the new and previous value.
+ PHINode *ResultPhi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ ResultPhi->addIncoming(NewVResult, CondBlock);
+ ResultPhi->addIncoming(VResult, PrevIfBlock);
+ VResult = ResultPhi;
+
+ // Add a PHI for the pointer if this isn't the last iteration.
+ if ((Idx + 1) != VectorWidth) {
+ PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+ PtrPhi->addIncoming(NewPtr, CondBlock);
+ PtrPhi->addIncoming(Ptr, PrevIfBlock);
+ Ptr = PtrPhi;
+ }
+ }
+
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
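+// Translate llvm.masked.compressstore into a chain of per-lane conditional
+// blocks: when a lane's mask bit is set, extract that element, store it to
+// the current pointer, and advance the pointer by one element; lanes with a
+// clear mask bit store nothing and leave the pointer unchanged.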
+static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+
+ VectorType *VecType = cast<VectorType>(Src->getType());
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Type *EltTy = VecType->getVectorElementType();
+
+ unsigned VectorWidth = VecType->getNumElements();
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // br i1 %mask_1, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32, i32* %1, i32 0
+ // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+ Builder.CreateAlignedStore(OneElt, Ptr, 1);
+
+ // Move the pointer if there are more blocks to come.
+ Value *NewPtr;
+ if ((Idx + 1) != VectorWidth)
+ NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Add a PHI for the pointer if this isn't the last iteration.
+ if ((Idx + 1) != VectorWidth) {
+ PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+ PtrPhi->addIncoming(NewPtr, CondBlock);
+ PtrPhi->addIncoming(Ptr, PrevIfBlock);
+ Ptr = PtrPhi;
+ }
+ }
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
bool EverMadeChange = false;
return false;
scalarizeMaskedScatter(CI, ModifiedDT);
return true;
+ case Intrinsic::masked_expandload:
+ if (TTI->isLegalMaskedExpandLoad(CI->getType()))
+ return false;
+ scalarizeMaskedExpandLoad(CI, ModifiedDT);
+ return true;
+ case Intrinsic::masked_compressstore:
+ if (TTI->isLegalMaskedCompressStore(CI->getArgOperand(0)->getType()))
+ return false;
+ scalarizeMaskedCompressStore(CI, ModifiedDT);
+ return true;
}
}
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+ if (!isa<VectorType>(DataTy))
+ return false;
+
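+ // Expand load and compress store are only lowered to the AVX-512
+ // VEXPAND*/VCOMPRESS* instructions; byte and word elements additionally
+ // need the VBMI2 forms (VPEXPANDB/W, VPCOMPRESSB/W), checked below.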
+ if (!ST->hasAVX512())
+ return false;
+
+ // The backend can't handle a single element vector.
+ if (DataTy->getVectorNumElements() == 1)
+ return false;
+
+ Type *ScalarTy = DataTy->getVectorElementType();
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+ return isLegalMaskedExpandLoad(DataTy);
+}
+
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
+ bool isLegalMaskedExpandLoad(Type *DataType);
+ bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
+
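+; With the constant <i1 false, i1 true> mask, the scalarized expandload folds
+; down to a single element load inserted into lane 1 of the pass-through.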
+define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %else
+; CHECK-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
+ ret <2 x i64>%res
+}
+declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
+
+define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
+; CHECK-LABEL: test11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpextrb $0, %xmm1, %eax
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_2
+; CHECK-NEXT: # %bb.1: # %cond.store
+; CHECK-NEXT: vmovq %xmm0, (%rdi)
+; CHECK-NEXT: addq $8, %rdi
+; CHECK-NEXT: .LBB1_2: # %else
+; CHECK-NEXT: vpextrb $8, %xmm1, %eax
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_4
+; CHECK-NEXT: # %bb.3: # %cond.store1
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: .LBB1_4: # %else2
+; CHECK-NEXT: retq
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64* , <2 x i1>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s
+
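+; Without AVX-512 the compressstore is scalarized; the constant alternating
+; mask leaves eight unconditional stores of the even-numbered byte elements.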
+define <8 x i8> @foo(<16 x i8> %a) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %cond.store
+; CHECK-NEXT: pextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $2, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $4, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $6, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $8, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: retq
+ %v = alloca i8, i32 8, align 16
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
+ %ptr = bitcast i8* %v to <8 x i8>*
+ %out = load <8 x i8>, <8 x i8>* %ptr
+ ret <8 x i8> %out
+}
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>) #0
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP2]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT: br i1 [[TMP4]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP5]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT: br i1 true, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+define void @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT: br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 false, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 false>)
+ ret void
+}
+
+define void @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT: br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64>, i64*, <2 x i1>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define <2 x i64> @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP3]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP4]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT: br i1 [[TMP5]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP6]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP7]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT: br i1 true, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT: br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: br i1 false, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT: br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+declare <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64*, <2 x i1>, <2 x i64>)