From b71c6a9b390846b717b601be7b3280f9a7f5d647 Mon Sep 17 00:00:00 2001
From: Sean Fertile
Date: Mon, 18 Dec 2017 15:31:14 +0000
Subject: [PATCH] [Memcpy Loop Lowering] Remove the fixed int8 lowering.

Switch over to the lowering that uses target supplied operand types.

Differential Revision: https://reviews.llvm.org/D41201

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320989 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h        |   6 --
 include/llvm/Transforms/Utils/LowerMemIntrinsics.h |   6 --
 lib/Analysis/TargetTransformInfo.cpp               |   9 --
 lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp          |  24 ++---
 lib/Transforms/Utils/LowerMemIntrinsics.cpp        |  93 +++--------
 test/CodeGen/AMDGPU/lower-mem-intrinsics.ll        |  24 ++---
 test/CodeGen/NVPTX/lower-aggr-copies.ll            | 108 +++++------
 7 files changed, 73 insertions(+), 197 deletions(-)

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 90b71e93947..c20f20cfbe4 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -862,12 +862,6 @@ public:
                                          unsigned SrcAlign,
                                          unsigned DestAlign) const;
 
-  /// \returns True if we want to test the new memcpy lowering functionality in
-  /// Transform/Utils.
-  /// Temporary. Will be removed once we move to the new functionality and
-  /// remove the old.
-  bool useWideIRMemcpyLoopLowering() const;
-
   /// \returns True if the two functions have compatible attributes for inlining
   /// purposes.
   bool areInlineCompatible(const Function *Caller,
diff --git a/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index 4554b5cbc64..2b7d0f67a32 100644
--- a/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -25,12 +25,6 @@ class MemSetInst;
 class TargetTransformInfo;
 class Value;
 
-/// Emit a loop implementing the semantics of llvm.memcpy with the equivalent
-/// arguments at \p InsertBefore.
-void createMemCpyLoop(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
-                      Value *CopyLen, unsigned SrcAlign, unsigned DestAlign,
-                      bool SrcIsVolatile, bool DstIsVolatile);
-
 /// Emit a loop implementing the semantics of llvm.memcpy where the size is not
 /// a compile-time constant. Loop will be insterted at \p InsertBefore.
 void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr,
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 7feb40da271..b744cae51ed 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -26,11 +26,6 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "tti"
 
-static cl::opt<bool> UseWideMemcpyLoopLowering(
-    "use-wide-memcpy-loop-lowering", cl::init(false),
-    cl::desc("Enables the new wide memcpy loop lowering in Transforms/Utils."),
-    cl::Hidden);
-
 static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
                                      cl::Hidden,
                                      cl::desc("Recognize reduction patterns."));
@@ -547,10 +542,6 @@ void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
                                              SrcAlign, DestAlign);
 }
 
-bool TargetTransformInfo::useWideIRMemcpyLoopLowering() const {
-  return UseWideMemcpyLoopLowering;
-}
-
 bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
                                               const Function *Callee) const {
   return TTIImpl->areInlineCompatible(Caller, Callee);
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 989f0a3aba2..52ced266b91 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -111,23 +111,13 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
     ConstantInt *CopyLen =
         ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
 
-    if (!TTI.useWideIRMemcpyLoopLowering()) {
-      createMemCpyLoop(/* ConvertedInst */ SI,
-                       /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
-                       /* CopyLen */ CopyLen,
-                       /* SrcAlign */ LI->getAlignment(),
-                       /* DestAlign */ SI->getAlignment(),
-                       /* SrcIsVolatile */ LI->isVolatile(),
-                       /* DstIsVolatile */ SI->isVolatile());
-    } else {
-      createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
-                                /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
-                                /* CopyLen */ CopyLen,
-                                /* SrcAlign */ LI->getAlignment(),
-                                /* DestAlign */ SI->getAlignment(),
-                                /* SrcIsVolatile */ LI->isVolatile(),
-                                /* DstIsVolatile */ SI->isVolatile(), TTI);
-    }
+    createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
+                              /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+                              /* CopyLen */ CopyLen,
+                              /* SrcAlign */ LI->getAlignment(),
+                              /* DestAlign */ SI->getAlignment(),
+                              /* SrcIsVolatile */ LI->isVolatile(),
+                              /* DstIsVolatile */ SI->isVolatile(), TTI);
 
     SI->eraseFromParent();
     LI->eraseFromParent();
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 6641e8ba496..57dc225e9da 100644
--- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -263,61 +263,6 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
   }
 }
 
-void llvm::createMemCpyLoop(Instruction *InsertBefore,
-                            Value *SrcAddr, Value *DstAddr, Value *CopyLen,
-                            unsigned SrcAlign, unsigned DestAlign,
-                            bool SrcIsVolatile, bool DstIsVolatile) {
-  Type *TypeOfCopyLen = CopyLen->getType();
-
-  BasicBlock *OrigBB = InsertBefore->getParent();
-  Function *F = OrigBB->getParent();
-  BasicBlock *NewBB =
-      InsertBefore->getParent()->splitBasicBlock(InsertBefore, "split");
-  BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
-                                          F, NewBB);
-
-  IRBuilder<> Builder(OrigBB->getTerminator());
-
-  // SrcAddr and DstAddr are expected to be pointer types,
-  // so no check is made here.
-  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
-  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
-  // Cast pointers to (char *)
-  SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
-  DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
-
-  Builder.CreateCondBr(
-      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
-      LoopBB);
-  OrigBB->getTerminator()->eraseFromParent();
-
-  IRBuilder<> LoopBuilder(LoopBB);
-  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
-  // load from SrcAddr+LoopIndex
-  // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
-  // word-sized loads and stores.
-  Value *Element =
-      LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
-                                 LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
-                             SrcIsVolatile);
-  // store at DstAddr+LoopIndex
-  LoopBuilder.CreateStore(Element,
-                          LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
-                                                        DstAddr, LoopIndex),
-                          DstIsVolatile);
-
-  // The value for LoopIndex coming from backedge is (LoopIndex + 1)
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
-  LoopIndex->addIncoming(NewIndex, LoopBB);
-
-  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
-                           NewBB);
-}
-
 // Lower memmove to IR. memmove is required to correctly copy overlapping memory
 // regions; therefore, it has to check the relative positions of the source and
 // destination pointers and choose the copy direction accordingly.
@@ -459,38 +404,26 @@ static void createMemSetLoop(Instruction *InsertBefore,
 
 void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
                               const TargetTransformInfo &TTI) {
-  // Original implementation
-  if (!TTI.useWideIRMemcpyLoopLowering()) {
-    createMemCpyLoop(/* InsertBefore */ Memcpy,
-                     /* SrcAddr */ Memcpy->getRawSource(),
-                     /* DstAddr */ Memcpy->getRawDest(),
-                     /* CopyLen */ Memcpy->getLength(),
-                     /* SrcAlign */ Memcpy->getAlignment(),
-                     /* DestAlign */ Memcpy->getAlignment(),
-                     /* SrcIsVolatile */ Memcpy->isVolatile(),
-                     /* DstIsVolatile */ Memcpy->isVolatile());
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+    createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+                              /* SrcAddr */ Memcpy->getRawSource(),
+                              /* DstAddr */ Memcpy->getRawDest(),
+                              /* CopyLen */ CI,
+                              /* SrcAlign */ Memcpy->getAlignment(),
+                              /* DestAlign */ Memcpy->getAlignment(),
+                              /* SrcIsVolatile */ Memcpy->isVolatile(),
+                              /* DstIsVolatile */ Memcpy->isVolatile(),
+                              /* TargetTransformInfo */ TTI);
   } else {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
-      createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+    createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
                                 /* SrcAddr */ Memcpy->getRawSource(),
                                 /* DstAddr */ Memcpy->getRawDest(),
-                                /* CopyLen */ CI,
+                                /* CopyLen */ Memcpy->getLength(),
                                 /* SrcAlign */ Memcpy->getAlignment(),
                                 /* DestAlign */ Memcpy->getAlignment(),
                                 /* SrcIsVolatile */ Memcpy->isVolatile(),
                                 /* DstIsVolatile */ Memcpy->isVolatile(),
-                                /* TargetTransformInfo */ TTI);
-    } else {
-      createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
-                                  /* SrcAddr */ Memcpy->getRawSource(),
-                                  /* DstAddr */ Memcpy->getRawDest(),
-                                  /* CopyLen */ Memcpy->getLength(),
-                                  /* SrcAlign */ Memcpy->getAlignment(),
-                                  /* DestAlign */ Memcpy->getAlignment(),
-                                  /* SrcIsVolatile */ Memcpy->isVolatile(),
-                                  /* DstIsVolatile */ Memcpy->isVolatile(),
-                                  /* TargetTransfomrInfo */ TTI);
-    }
+                                /* TargetTransfomrInfo */ TTI);
   }
 }
 
diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index e1a2af6c7ef..778467207a0 100644
--- a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
 
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -18,21 +17,14 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)*
 ; Smallest static size which will be expanded
 ; OPT-LABEL: @min_size_large_static_memcpy_caller0(
 ; OPT-NOT: call
-; OPT: getelementptr
-; OPT-NEXT: load i8
-; OPT: getelementptr
-; OPT-NEXT: store i8
-
-; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
-; WOPT-NOT: call
-; WOPT: br label %load-store-loop
-; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
-; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
-; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
-; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
-; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
-; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
-; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
+; OPT: br label %load-store-loop
+; OPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; OPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; OPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; OPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; OPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; OPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; OPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
 define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
   ret void
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
index c11ced00be9..1da1af65947 100644
--- a/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,6 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
-; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
 ; llvm.mem* intrinsics get lowered to loops.
@@ -18,13 +17,22 @@ entry:
   ret i8* %dst
 
 ; IR-LABEL: @memcpy_caller
-; IR: [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n
-; IR: br i1 [[CMPREG]], label %split, label %loadstoreloop
-; IR: loadstoreloop:
-; IR: [[LOADPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64
-; IR-NEXT: [[VAL:%[0-9]+]] = load i8, i8* [[LOADPTR]]
-; IR-NEXT: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
-; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]
+; IR: entry:
+; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
+; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR: loop-memcpy-expansion:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR: store i8 [[Load]], i8* [[DstGep]]
+; IR: [[IndexInc]] = add i64 %loop-index, 1
+; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
+; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR-LABEL: post-loop-memcpy-expansion:
+; IR: ret i8* %dst
 
 ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
 ; PTX: LBB[[LABEL:[_0-9]+]]:
@@ -34,23 +42,6 @@ entry:
 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX: @%p[[PRED]] bra LBB[[LABEL]]
 
-; WIR-LABEL: @memcpy_caller
-; WIR: entry:
-; WIR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR: loop-memcpy-expansion:
-; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
-; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR: store i8 [[Load]], i8* [[DstGep]]
-; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR-LABEL: post-loop-memcpy-expansion:
-; WIR: ret i8* %dst
 }
 
 define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -59,8 +50,23 @@ entry:
   ret i8* %dst
 
 ; IR-LABEL: @memcpy_volatile_caller
-; IR: load volatile
-; IR: store volatile
+; IR: entry:
+; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
+; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR: loop-memcpy-expansion:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
+; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR: store volatile i8 [[Load]], i8* [[DstGep]]
+; IR: [[IndexInc]] = add i64 %loop-index, 1
+; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
+; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR-LABEL: post-loop-memcpy-expansion:
+; IR: ret i8* %dst
+
 
 ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
 ; PTX: LBB[[LABEL:[_0-9]+]]:
@@ -69,24 +75,6 @@ entry:
 ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX: @%p[[PRED]] bra LBB[[LABEL]]
-
-; WIR-LABEL: @memcpy_volatile_caller
-; WIR: entry:
-; WIR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR: loop-memcpy-expansion:
-; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
-; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
-; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR-LABEL: post-loop-memcpy-expansion:
-; WIR: ret i8* %dst
 }
 
 define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -102,12 +90,6 @@ entry:
 ; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
 ; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
 ; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
-
-; WIR-LABEL: @memcpy_casting_caller
-; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
-; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
-; WIR: getelementptr inbounds i8, i8* [[SRCCAST]]
-; WIR: getelementptr inbounds i8, i8* [[DSTCAST]]
 }
 
 define i8* @memcpy_known_size(i8* %dst, i8* %src) {
@@ -116,18 +98,18 @@ entry:
   ret i8* %dst
 
 ; Check that calls with compile-time constant size are handled correctly
-; WIR-LABEL: @memcpy_known_size
-; WIR: entry:
-; WIR: br label %load-store-loop
-; WIR: load-store-loop:
-; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
-; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
-; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR: store i8 [[Load]], i8* [[DstGep]]
-; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
-; WIR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
+; IR-LABEL: @memcpy_known_size
+; IR: entry:
+; IR: br label %load-store-loop
+; IR: load-store-loop:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR: store i8 [[Load]], i8* [[DstGep]]
+; IR: [[IndexInc]] = add i64 %loop-index, 1
+; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
+; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
 }
 
 define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
-- 
2.11.0
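
For readers coming to this patch cold: the "target supplied operand types" are obtained through the TargetTransformInfo hooks that appear in the diff context above (getMemcpyLoopLoweringType and getMemcpyLoopResidualLoweringType), which the createMemCpyLoopKnownSize/createMemCpyLoopUnknownSize lowerings take a TTI reference for. The sketch below is illustrative only and is not part of the patch: "MyTTIImpl" and its i32-when-aligned policy are invented for the example, while the two method shapes follow the hook declarations shown in the TargetTransformInfo.h hunk as of this revision.

  // Illustrative sketch (not from the patch): a hypothetical target-side
  // implementation that supplies wider operand types to the memcpy loop
  // lowering, mirroring the 2017-era TargetTransformInfo hook shapes.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/Value.h"

  namespace {
  struct MyTTIImpl {
    // Main-loop copy type: use i32 chunks when both pointers are at least
    // 4-byte aligned, otherwise keep the old byte-wide behaviour.
    llvm::Type *getMemcpyLoopLoweringType(llvm::LLVMContext &Context,
                                          llvm::Value * /*Length*/,
                                          unsigned SrcAlign,
                                          unsigned DestAlign) const {
      if (SrcAlign >= 4 && DestAlign >= 4)
        return llvm::Type::getInt32Ty(Context);
      return llvm::Type::getInt8Ty(Context);
    }

    // Residual types: cover whatever bytes the main loop leaves over, one i8
    // operand per remaining byte.
    void getMemcpyLoopResidualLoweringType(
        llvm::SmallVectorImpl<llvm::Type *> &OpsOut,
        llvm::LLVMContext &Context, unsigned RemainingBytes,
        unsigned /*SrcAlign*/, unsigned /*DestAlign*/) const {
      for (unsigned I = 0; I != RemainingBytes; ++I)
        OpsOut.push_back(llvm::Type::getInt8Ty(Context));
    }
  };
  } // namespace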