/// modes that operate across loop iterations.
bool shouldFavorBackedgeIndex(const Loop *L) const;
- /// Return true if the target supports masked load/store
- /// AVX2 and AVX-512 targets allow masks for consecutive load and store
+ /// Return true if the target supports masked store.
bool isLegalMaskedStore(Type *DataType) const;
+ /// Return true if the target supports masked load.
bool isLegalMaskedLoad(Type *DataType) const;
- /// Return true if the target supports masked gather/scatter
- /// AVX-512 fully supports gather and scatter for vectors with 32 and 64
- /// bits scalar type.
+ /// Return true if the target supports masked scatter.
bool isLegalMaskedScatter(Type *DataType) const;
+ /// Return true if the target supports masked gather.
bool isLegalMaskedGather(Type *DataType) const;
+ /// Return true if the target supports masked compress store.
+ bool isLegalMaskedCompressStore(Type *DataType) const;
+ /// Return true if the target supports masked expand load.
+ bool isLegalMaskedExpandLoad(Type *DataType) const;
+
/// Return true if the target has a unified operation to calculate division
/// and remainder. If so, the additional implicit multiplication and
/// subtraction required to calculate a remainder from division are free. This
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
virtual bool isLegalMaskedGather(Type *DataType) = 0;
+ virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
+ virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
virtual bool prefersVectorizedAddressing() = 0;
bool isLegalMaskedGather(Type *DataType) override {
return Impl.isLegalMaskedGather(DataType);
}
+ bool isLegalMaskedCompressStore(Type *DataType) override {
+ return Impl.isLegalMaskedCompressStore(DataType);
+ }
+ bool isLegalMaskedExpandLoad(Type *DataType) override {
+ return Impl.isLegalMaskedExpandLoad(DataType);
+ }
bool hasDivRemOp(Type *DataType, bool IsSigned) override {
return Impl.hasDivRemOp(DataType, IsSigned);
}
bool isLegalMaskedGather(Type *DataType) { return false; }
+ bool isLegalMaskedCompressStore(Type *DataType) { return false; }
+
+ bool isLegalMaskedExpandLoad(Type *DataType) { return false; }
+
bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; }
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; }
return TTIImpl->isLegalMaskedScatter(DataType);
}
+bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
+ return TTIImpl->isLegalMaskedCompressStore(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
+ return TTIImpl->isLegalMaskedExpandLoad(DataType);
+}
+
bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
return TTIImpl->hasDivRemOp(DataType, IsSigned);
}
ModifiedDT = true;
}
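+
+// Translate llvm.masked.expandload into a chain of per-lane conditional
+// blocks: when a lane's mask bit is set, load one scalar from the current
+// pointer, insert it into the result vector, and advance the pointer by one
+// element; when the bit is clear, the lane keeps its pass-through value and
+// the pointer is left unchanged.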
+static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Mask = CI->getArgOperand(1);
+ Value *PassThru = CI->getArgOperand(2);
+
+ VectorType *VecType = cast<VectorType>(CI->getType());
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // The result vector
+ Value *VResult = PassThru;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // br i1 %mask_1, label %cond.load, label %else
+ //
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32, i32* %1, i32 0
+ // %Elt = load i32, i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+ "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, 1);
+ Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
+
+ // Move the pointer if there are more blocks to come.
+ Value *NewPtr;
+ if ((Idx + 1) != VectorWidth)
+ NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Create the phi to join the new and previous value.
+ PHINode *ResultPhi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ ResultPhi->addIncoming(NewVResult, CondBlock);
+ ResultPhi->addIncoming(VResult, PrevIfBlock);
+ VResult = ResultPhi;
+
+ // Add a PHI for the pointer if this isn't the last iteration.
+ if ((Idx + 1) != VectorWidth) {
+ PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+ PtrPhi->addIncoming(NewPtr, CondBlock);
+ PtrPhi->addIncoming(Ptr, PrevIfBlock);
+ Ptr = PtrPhi;
+ }
+ }
+
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
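+// Translate llvm.masked.compressstore into a chain of per-lane conditional
+// blocks: when a lane's mask bit is set, extract that element, store it to
+// the current pointer, and advance the pointer by one element; lanes with a
+// clear mask bit store nothing and leave the pointer unchanged.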
+static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+
+ VectorType *VecType = cast<VectorType>(Src->getType());
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Type *EltTy = VecType->getVectorElementType();
+
+ unsigned VectorWidth = VecType->getNumElements();
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // br i1 %mask_1, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32, i32* %1, i32 0
+ // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+ Builder.CreateAlignedStore(OneElt, Ptr, 1);
+
+ // Move the pointer if there are more blocks to come.
+ Value *NewPtr;
+ if ((Idx + 1) != VectorWidth)
+ NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+ OldBr->eraseFromParent();
+ BasicBlock *PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+
+ // Add a PHI for the pointer if this isn't the last iteration.
+ if ((Idx + 1) != VectorWidth) {
+ PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+ PtrPhi->addIncoming(NewPtr, CondBlock);
+ PtrPhi->addIncoming(Ptr, PrevIfBlock);
+ Ptr = PtrPhi;
+ }
+ }
+ CI->eraseFromParent();
+
+ ModifiedDT = true;
+}
+
bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
bool EverMadeChange = false;
return false;
scalarizeMaskedScatter(CI, ModifiedDT);
return true;
+ case Intrinsic::masked_expandload:
+ if (TTI->isLegalMaskedExpandLoad(CI->getType()))
+ return false;
+ scalarizeMaskedExpandLoad(CI, ModifiedDT);
+ return true;
+ case Intrinsic::masked_compressstore:
+ if (TTI->isLegalMaskedCompressStore(CI->getArgOperand(0)->getType()))
+ return false;
+ scalarizeMaskedCompressStore(CI, ModifiedDT);
+ return true;
}
}
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+ if (!isa<VectorType>(DataTy))
+ return false;
+
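+ // Expand load and compress store are only lowered to the AVX-512
+ // VEXPAND*/VCOMPRESS* instructions; byte and word elements additionally
+ // need the VBMI2 forms (VPEXPANDB/W, VPCOMPRESSB/W), checked below.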
+ if (!ST->hasAVX512())
+ return false;
+
+ // The backend can't handle a single element vector.
+ if (DataTy->getVectorNumElements() == 1)
+ return false;
+
+ Type *ScalarTy = DataTy->getVectorElementType();
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+ return isLegalMaskedExpandLoad(DataTy);
+}
+
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
+ bool isLegalMaskedExpandLoad(Type *DataType);
+ bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
+
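+; With the constant <i1 false, i1 true> mask, the scalarized expandload folds
+; down to a single element load inserted into lane 1 of the pass-through.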
+define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %else
+; CHECK-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
+ ret <2 x i64>%res
+}
+declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
+
+define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
+; CHECK-LABEL: test11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpextrb $0, %xmm1, %eax
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_2
+; CHECK-NEXT: # %bb.1: # %cond.store
+; CHECK-NEXT: vmovq %xmm0, (%rdi)
+; CHECK-NEXT: addq $8, %rdi
+; CHECK-NEXT: .LBB1_2: # %else
+; CHECK-NEXT: vpextrb $8, %xmm1, %eax
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_4
+; CHECK-NEXT: # %bb.3: # %cond.store1
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: .LBB1_4: # %else2
+; CHECK-NEXT: retq
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64* , <2 x i1>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s
+
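+; Without AVX-512 the compressstore is scalarized; the constant alternating
+; mask leaves eight unconditional stores of the even-numbered byte elements.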
+define <8 x i8> @foo(<16 x i8> %a) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %cond.store
+; CHECK-NEXT: pextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $2, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $4, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $6, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $8, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: retq
+ %v = alloca i8, i32 8, align 16
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
+ %ptr = bitcast i8* %v to <8 x i8>*
+ %out = load <8 x i8>, <8 x i8>* %ptr
+ ret <8 x i8> %out
+}
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>) #0
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP2]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT: br i1 [[TMP4]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP5]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT: br i1 true, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+define void @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT: br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 false, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 false>)
+ ret void
+}
+
+define void @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT: br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.store:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.store1:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64>, i64*, <2 x i1>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define <2 x i64> @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP3]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP4]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT: br i1 [[TMP5]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP6]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP7]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT: br i1 true, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT: br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: br i1 false, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT: br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK: cond.load:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT: br label [[ELSE]]
+; CHECK: else:
+; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT: [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT: br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK: cond.load1:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT: br label [[ELSE2]]
+; CHECK: else2:
+; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+ %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
+ ret <2 x i64> %ret
+}
+
+declare <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64*, <2 x i1>, <2 x i64>)