STATISTIC(NumSimplified, "Number of library calls simplified");
-static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
- "unfold-element-atomic-memcpy-max-elements",
- cl::init(16),
- cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
- "allowed to unfold"));
-
static cl::opt<unsigned> GuardWideningWindow(
"instcombine-guard-widening-window",
cl::init(3),
cl::desc("How wide an instruction window to bypass looking for "
"another guard"));
-
/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}
-Instruction *
-InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
- // Try to unfold this intrinsic into sequence of explicit atomic loads and
- // stores.
- // First check that number of elements is compile time constant.
- auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
- if (!LengthCI)
- return nullptr;
-
- // Check that there are not too many elements.
- uint64_t LengthInBytes = LengthCI->getZExtValue();
- uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
- uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
- if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
- return nullptr;
-
- // Only expand if there are elements to copy.
- if (NumElements > 0) {
- // Don't unfold into illegal integers
- uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
- if (!getDataLayout().isLegalInteger(ElementSizeInBits))
- return nullptr;
-
- // Cast source and destination to the correct type. Intrinsic input
- // arguments are usually represented as i8*. Often operands will be
- // explicitly casted to i8* and we can just strip those casts instead of
- // inserting new ones. However it's easier to rely on other InstCombine
- // rules which will cover trivial cases anyway.
- Value *Src = AMI->getRawSource();
- Value *Dst = AMI->getRawDest();
- Type *ElementPointerType =
- Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
- Src->getType()->getPointerAddressSpace());
-
- Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
- "memcpy_unfold.src_casted");
- Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
- "memcpy_unfold.dst_casted");
-
- for (uint64_t i = 0; i < NumElements; ++i) {
- // Get current element addresses
- ConstantInt *ElementIdxCI =
- ConstantInt::get(AMI->getContext(), APInt(64, i));
- Value *SrcElementAddr =
- Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
- Value *DstElementAddr =
- Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
- // Load from the source. Transfer alignment information and mark load as
- // unordered atomic.
- LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
- Load->setOrdering(AtomicOrdering::Unordered);
- // We know alignment of the first element. It is also guaranteed by the
- // verifier that element size is less or equal than first element
- // alignment and both of this values are powers of two. This means that
- // all subsequent accesses are at least element size aligned.
- // TODO: We can infer better alignment but there is no evidence that this
- // will matter.
- Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
- : ElementSizeInBytes);
- Load->setDebugLoc(AMI->getDebugLoc());
-
- // Store loaded value via unordered atomic store.
- StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
- Store->setOrdering(AtomicOrdering::Unordered);
- Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
- : ElementSizeInBytes);
- Store->setDebugLoc(AMI->getDebugLoc());
- }
- }
-
- // Set the number of elements of the copy to 0, it will be deleted on the
- // next iteration.
- AMI->setLength(Constant::getNullValue(LengthCI->getType()));
- return AMI;
-}
-
-Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
unsigned CopyDstAlign = MI->getDestAlignment();
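  // If we can prove a larger alignment for either pointer than the intrinsic
  // currently records, write the better value back and return the modified
  // instruction so the combiner revisits it.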
  if (CopyDstAlign < DstAlign) {
    MI->setDestAlignment(DstAlign);
    return MI;
  }
- auto* MTI = cast<MemTransferInst>(MI);
- unsigned SrcAlign = getKnownAlignment(MTI->getRawSource(), DL, MI, &AC, &DT);
- unsigned CopySrcAlign = MTI->getSourceAlignment();
+ unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ unsigned CopySrcAlign = MI->getSourceAlignment();
if (CopySrcAlign < SrcAlign) {
- MTI->setSourceAlignment(SrcAlign);
+ MI->setSourceAlignment(SrcAlign);
return MI;
}
// If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
// load/store.
- ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
if (!MemOpLength) return nullptr;
  // Source and destination pointer types are always "i8*" for intrinsic. See
  // if the size is something we can handle with a single primitive load/store.
Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
+ LoadInst *L = Builder.CreateLoad(Src);
// Alignment from the mem intrinsic will be better, so use it.
L->setAlignment(CopySrcAlign);
  if (CopyMD)
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
+ StoreInst *S = Builder.CreateStore(L, Dest);
// Alignment from the mem intrinsic will be better, so use it.
S->setAlignment(CopyDstAlign);
  if (CopyMD)
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+ // non-atomics can be volatile
+ L->setVolatile(MT->isVolatile());
+ S->setVolatile(MT->isVolatile());
+ }
+ if (isa<AtomicMemTransferInst>(MI)) {
+ // atomics have to be unordered
+ L->setOrdering(AtomicOrdering::Unordered);
+ S->setOrdering(AtomicOrdering::Unordered);
+ }
+
// Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+ MI->setLength(Constant::getNullValue(MemOpLength->getType()));
return MI;
}
// Intrinsics cannot occur in an invoke, so handle them here instead of in
// visitCallSite.
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
bool Changed = false;
// memmove/cpy/set of zero bytes is a noop.
}
// No other transformations apply to volatile transfers.
- if (MI->isVolatile())
- return nullptr;
+ if (auto *M = dyn_cast<MemIntrinsic>(MI))
+ if (M->isVolatile())
+ return nullptr;
// If we have a memmove and the source operation is a constant global,
// then the source and dest pointers can't alias, so we can change this
// into a call to memcpy.
- if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+ if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {
Module *M = CI.getModule();
- Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ Intrinsic::ID MemCpyID =
+ isa<AtomicMemMoveInst>(MMI)
+ ? Intrinsic::memcpy_element_unordered_atomic
+ : Intrinsic::memcpy;
Type *Tys[3] = { CI.getArgOperand(0)->getType(),
CI.getArgOperand(1)->getType(),
CI.getArgOperand(2)->getType() };
}
}
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
// memmove(x,x,size) -> noop.
if (MTI->getSource() == MTI->getDest())
return eraseInstFromFunction(CI);
// If we can determine a pointer alignment that is bigger than currently
// set, update the alignment.
- if (isa<MemTransferInst>(MI)) {
- if (Instruction *I = SimplifyMemTransfer(MI))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemTransfer(MTI))
return I;
    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
      if (Instruction *I = SimplifyMemSet(MSI))
        return I;
    }

    if (Changed) return II;
  }
- if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
- if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
- if (C->isNullValue())
- return eraseInstFromFunction(*AMI);
-
- if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
- return I;
- }
-
if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
return I;
-;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have
-;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to
-;; verify that inst combine handles these intrinsics properly once they have been
-;; added to that class hierarchy.
-
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -instcombine -S < %s | FileCheck %s
;; ---- memset -----
-; Ensure 0-length memset isn't removed
+; Ensure 0-length memset is removed
define void @test_memset_zero_length(i8* %dest) {
- ; CHECK-LABEL: test_memset_zero_length
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
- ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memset_zero_length(
+; CHECK-NEXT: ret void
+;
call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
ret void
}
-; Ensure that small-sized memsets don't convert to stores
+; Placeholder test. This will change once support for lowering atomic memsets is added to instcombine.
define void @test_memset_to_store(i8* %dest) {
- ; CHECK-LABEL: test_memset_to_store
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
- ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memset_to_store(
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST:%.*]], i8 1, i32 1, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 2, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 4, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 8, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 16, i32 1)
+; CHECK-NEXT: ret void
+;
call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 16, i32 1)
ret void
}
;; =========================================
;; ----- memmove ------
-; memmove from a global constant source does not become memcpy
-@gconst = constant [8 x i8] c"0123456\00"
+
+@gconst = constant [32 x i8] c"0123456789012345678901234567890\00"
+; Check that a memmove from a global constant is converted into a memcpy
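+; (The source alignment also becomes 16 because the known alignment of the
+; constant global is larger than the 1 specified on the original call.)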
define void @test_memmove_to_memcpy(i8* %dest) {
- ; CHECK-LABEL: test_memmove_to_memcpy
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)
- ; CHECK-NEXT: ret void
- call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)
+; CHECK-LABEL: @test_memmove_to_memcpy(
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST:%.*]], i8* align 16 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
ret void
}
define void @test_memmove_zero_length(i8* %dest, i8* %src) {
- ; CHECK-LABEL: test_memmove_zero_length
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
- ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_zero_length(
+; CHECK-NEXT: ret void
+;
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
- ret void
+ ret void
}
; memmove with src==dest is removed
define void @test_memmove_removed(i8* %srcdest, i32 %sz) {
- ; CHECK-LABEL: test_memmove_removed
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
- ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_removed(
+; CHECK-NEXT: ret void
+;
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
; memmove with a small constant length is converted to a load/store pair
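; (Only total lengths of 1, 2, 4, or 8 bytes are lowered to a single unordered
; load/store; the 16-byte call is left untouched.)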
define void @test_memmove_loadstore(i8* %dest, i8* %src) {
- ; CHECK-LABEL: test_memmove_loadstore
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
- ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
- ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_loadstore(
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
+; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
+; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
+; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
+; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
+; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT: ret void
+;
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
+ ret void
+}
+
+define void @test_memmove_loadstore_2(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_2(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
+; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
+; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
+; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
+ ret void
+}
+
+define void @test_memmove_loadstore_4(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_4(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
+; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
+ ret void
+}
+
+define void @test_memmove_loadstore_8(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
+; CHECK-NEXT: store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
+ ret void
+}
+
+define void @test_memmove_loadstore_16(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_16(
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
ret void
}
declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly
+
+;; =========================================
+;; ----- memcpy ------
+
+define void @test_memcpy_zero_length(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_zero_length(
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
+ ret void
+}
+
+; memcpy with src==dest is removed
+define void @test_memcpy_removed(i8* %srcdest, i32 %sz) {
+; CHECK-LABEL: @test_memcpy_removed(
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
+ ret void
+}
+
+; memcpy with a small constant length is converted to a load/store pair
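+; (As with memmove above, only total lengths of 1, 2, 4, or 8 bytes become a
+; single unordered load/store pair; the 16-byte calls are left untouched.)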
+define void @test_memcpy_loadstore(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore(
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
+; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
+; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
+; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
+; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
+; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
+ ret void
+}
+
+define void @test_memcpy_loadstore_2(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_2(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
+; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
+; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
+; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
+ ret void
+}
+
+define void @test_memcpy_loadstore_4(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_4(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
+; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
+ ret void
+}
+
+define void @test_memcpy_loadstore_8(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
+; CHECK-NEXT: store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
+ ret void
+}
+
+define void @test_memcpy_loadstore_16(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_16(
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
+ ret void
+}
+
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly