From 14bde67afca5a2790e75304dae16111d922c8b83 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 20 Mar 2018 19:35:09 +0000 Subject: [PATCH] [Hexagon] Add a few more lit tests, NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328023 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll | 24 +++ test/CodeGen/Hexagon/jump-prob.ll | 164 ++++++++++++++++++ test/CodeGen/Hexagon/late_instr.ll | 231 ++++++++++++++++++++++++++ test/CodeGen/Hexagon/mlong-calls.ll | 41 +++++ test/CodeGen/Hexagon/simplify64bitops_7223.ll | 61 +++++++ test/CodeGen/Hexagon/swp-carried-1.ll | 62 +++++++ test/CodeGen/Hexagon/swp-change-deps.ll | 61 +++++++ test/CodeGen/Hexagon/swp-epilog-numphis.ll | 82 +++++++++ test/CodeGen/Hexagon/swp-epilog-phi9.ll | 55 ++++++ test/CodeGen/Hexagon/swp-phi-ref.ll | 45 +++++ test/CodeGen/Hexagon/swp-phi-start.ll | 44 +++++ test/CodeGen/Hexagon/swp-rename.ll | 30 ++++ test/CodeGen/Hexagon/swp-xxh2.ll | 57 +++++++ test/CodeGen/Hexagon/vect-downscale.ll | 177 ++++++++++++++++++++ 14 files changed, 1134 insertions(+) create mode 100644 test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll create mode 100644 test/CodeGen/Hexagon/jump-prob.ll create mode 100644 test/CodeGen/Hexagon/late_instr.ll create mode 100644 test/CodeGen/Hexagon/mlong-calls.ll create mode 100644 test/CodeGen/Hexagon/simplify64bitops_7223.ll create mode 100644 test/CodeGen/Hexagon/swp-carried-1.ll create mode 100644 test/CodeGen/Hexagon/swp-change-deps.ll create mode 100644 test/CodeGen/Hexagon/swp-epilog-numphis.ll create mode 100644 test/CodeGen/Hexagon/swp-epilog-phi9.ll create mode 100644 test/CodeGen/Hexagon/swp-phi-ref.ll create mode 100644 test/CodeGen/Hexagon/swp-phi-start.ll create mode 100644 test/CodeGen/Hexagon/swp-rename.ll create mode 100644 test/CodeGen/Hexagon/swp-xxh2.ll create mode 100644 test/CodeGen/Hexagon/vect-downscale.ll diff --git a/test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll b/test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll new file mode 100644 index 00000000000..d79cbd413d9 --- /dev/null +++ b/test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=hexagon -O3 < %s | FileCheck %s +; CHECK: if (!p{{[0-3]}}.new) jumpr:nt r31 +; CHECK-NOT: .falign + +@g0 = common global i8 0, align 1 +@g1 = common global i32 0, align 4 + +define i32 @f0(i32* nocapture %a0) { +b0: + %v0 = load i8, i8* @g0, align 1 + %v1 = icmp eq i8 %v0, 65 + br i1 %v1, label %b1, label %b2 + +b1: ; preds = %b0 + %v2 = load i32, i32* %a0, align 4 + %v3 = add nsw i32 %v2, 9 + %v4 = load i32, i32* @g1, align 4 + %v5 = sub i32 %v3, %v4 + store i32 %v5, i32* %a0, align 4 + br label %b2 + +b2: ; preds = %b1, %b0 + ret i32 undef +} diff --git a/test/CodeGen/Hexagon/jump-prob.ll b/test/CodeGen/Hexagon/jump-prob.ll new file mode 100644 index 00000000000..a5f420df0df --- /dev/null +++ b/test/CodeGen/Hexagon/jump-prob.ll @@ -0,0 +1,164 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK: { +; CHECK: jump .LBB0_ +; CHECK: r{{[0-9]+}} = +; CHECK: memw +; CHECK: } + +target triple = "hexagon-unknown--elf" + +%s.0 = type { i8, i8, i8, [6 x i32] } +%s.1 = type { %s.2 } +%s.2 = type { i32, i8* } +%s.3 = type <{ i8*, i8*, i16, i8, i8, i8 }> + +@g0 = internal global [2 x %s.0] [%s.0 { i8 0, i8 6, i8 7, [6 x i32] zeroinitializer }, %s.0 { i8 0, i8 6, i8 7, [6 x i32] zeroinitializer }], align 8 +@g1 = internal constant [60 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g2 = internal constant %s.1 { %s.2 { 
i32 24, i8* getelementptr inbounds ([60 x i8], [60 x i8]* @g1, i32 0, i32 0) } }, section ".rodata.xxxxxxxxxx.", align 4 +@g3 = internal constant [115 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g4 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @g6, i32 0, i32 0), i16 215, i8 4, i8 0, i8 1 }>, align 1 +@g5 = private unnamed_addr constant [120 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g6 = private unnamed_addr constant [31 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g7 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([91 x i8], [91 x i8]* @g8, i32 0, i32 0), i16 225, i8 2, i8 2, i8 2 }>, align 1 +@g8 = private unnamed_addr constant [91 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g9 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([109 x i8], [109 x i8]* @g10, i32 0, i32 0), i16 233, i8 2, i8 2, i8 4 }>, align 1 +@g10 = private unnamed_addr constant [109 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g11 = internal constant [116 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g12 = internal constant [134 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g13 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @g6, i32 0, i32 0), i16 264, i8 4, i8 0, i8 1 }>, align 1 +@g14 = internal constant [116 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g15 = internal constant [134 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 + +; Function Attrs: nounwind +define zeroext i8 @f0(i8 zeroext %a0, i8 zeroext %a1, i8* nocapture %a2) #0 { +b0: + store i8 -1, i8* %a2, align 1, !tbaa !0 + %v0 = zext i8 %a0 to i32 + %v1 = icmp ugt i8 %a0, 7 + %v2 = zext i8 %a1 to i32 + %v3 = icmp ugt i8 %a1, 5 + %v4 = or i1 %v1, %v3 + br i1 %v4, label %b1, label %b2 + +b1: ; preds = %b0 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b2: ; preds = %b0 + %v5 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 2), align 2, !tbaa !0 + %v6 = icmp eq i8 %v5, %a0 + %v7 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 2), align 2, !tbaa !0 + %v8 = icmp eq i8 %v7, %a0 + %v9 = and i1 %v6, %v8 + br i1 %v9, label %b3, label %b4 + +b3: ; preds = %b2 + %v10 = getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 3, i32 %v2 + %v11 = load i32, i32* %v10, align 4, !tbaa !3 + %v12 = 
getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 3, i32 %v2 + %v13 = load i32, i32* %v12, align 4, !tbaa !3 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b4: ; preds = %b2 + %v14 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 0), align 8, !tbaa !0 + %v15 = icmp eq i8 %v14, 1 + %v16 = and i1 %v15, %v6 + br i1 %v16, label %b5, label %b8 + +b5: ; preds = %b4 + store i8 0, i8* %a2, align 1, !tbaa !0 + %v17 = getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 3, i32 %v2 + %v18 = tail call i32 asm sideeffect "1: $0 = memw_locked($2)\0A $0 = add($0, $3)\0A memw_locked($2, p0) = $0\0A if !p0 jump 1b\0A", "=&r,=*m,r,r,*m,~{p0}"(i32* %v17, i32* %v17, i32 1, i32* %v17) #0, !srcloc !5 + %v19 = load i32, i32* %v17, align 4, !tbaa !3 + %v20 = icmp eq i32 %v19, 255 + br i1 %v20, label %b6, label %b7 + +b6: ; preds = %b5 + tail call void @f2(%s.3* @g4, i32 %v2) #2 + unreachable + +b7: ; preds = %b5 + store i8 %a1, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 1), align 1, !tbaa !0 + %v21 = load i8, i8* %a2, align 1, !tbaa !0 + %v22 = zext i8 %v21 to i32 + tail call void @f3(%s.3* @g7, i32 %v2, i32 %v22) #0 + %v23 = load i32, i32* bitcast ([2 x %s.0]* @g0 to i32*), align 8 + %v24 = and i32 %v23, 255 + %v25 = lshr i32 %v23, 8 + %v26 = and i32 %v25, 255 + %v27 = lshr i32 %v23, 16 + %v28 = and i32 %v27, 255 + %v29 = load i32, i32* %v17, align 4, !tbaa !3 + tail call void @f4(%s.3* @g9, i32 %v24, i32 %v26, i32 %v28, i32 %v29) #0 + %v30 = load i8, i8* %a2, align 1, !tbaa !0 + %v31 = zext i8 %v30 to i32 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + %v32 = load i32, i32* bitcast ([2 x %s.0]* @g0 to i32*), align 8 + %v33 = and i32 %v32, 255 + %v34 = lshr i32 %v32, 8 + %v35 = and i32 %v34, 255 + %v36 = lshr i32 %v32, 16 + %v37 = and i32 %v36, 255 + %v38 = load i32, i32* %v17, align 4, !tbaa !3 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b8: ; preds = %b4 + %v39 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 0), align 4, !tbaa !0 + %v40 = icmp eq i8 %v39, 1 + %v41 = and i1 %v40, %v8 + br i1 %v41, label %b9, label %b12 + +b9: ; preds = %b8 + store i8 1, i8* %a2, align 1, !tbaa !0 + %v42 = getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 3, i32 %v2 + %v43 = tail call i32 asm sideeffect "1: $0 = memw_locked($2)\0A $0 = add($0, $3)\0A memw_locked($2, p0) = $0\0A if !p0 jump 1b\0A", "=&r,=*m,r,r,*m,~{p0}"(i32* %v42, i32* %v42, i32 1, i32* %v42) #0, !srcloc !5 + %v44 = load i32, i32* %v42, align 4, !tbaa !3 + %v45 = icmp eq i32 %v44, 255 + br i1 %v45, label %b10, label %b11 + +b10: ; preds = %b9 + tail call void @f2(%s.3* @g13, i32 %v2) #2 + unreachable + +b11: ; preds = %b9 + store i8 %a1, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 1), align 1, !tbaa !0 + %v46 = load i8, i8* %a2, align 1, !tbaa !0 + %v47 = zext i8 %v46 to i32 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + %v48 = load i32, i32* bitcast (i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 0) to i32*), align 4 + %v49 = and i32 %v48, 255 + %v50 = lshr i32 %v48, 8 + %v51 = and i32 %v50, 255 + %v52 = lshr i32 %v48, 16 + %v53 = and i32 %v52, 255 + %v54 = load i32, i32* %v42, align 4, !tbaa !3 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b12: ; preds = %b11, %b8, %b7, %b3, %b1 + %v55 = phi i8 [ 0, %b1 ], [ 0, %b3 ], [ 1, %b7 ], 
[ 1, %b11 ], [ 0, %b8 ] + ret i8 %v55 +} + +declare void @f1(%s.1*, i32, i32, i32) + +; Function Attrs: noreturn +declare void @f2(%s.3*, i32) #1 + +declare void @f3(%s.3*, i32, i32) + +declare void @f4(%s.3*, i32, i32, i32, i32) + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } +attributes #1 = { noreturn } +attributes #2 = { noreturn nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"omnipotent char", !2} +!2 = !{!"Simple C/C++ TBAA"} +!3 = !{!4, !4, i64 0} +!4 = !{!"long", !1} +!5 = !{i32 86170, i32 86211, i32 86247, i32 86291} diff --git a/test/CodeGen/Hexagon/late_instr.ll b/test/CodeGen/Hexagon/late_instr.ll new file mode 100644 index 00000000000..c21e0140ca0 --- /dev/null +++ b/test/CodeGen/Hexagon/late_instr.ll @@ -0,0 +1,231 @@ +; RUN: llc -march=hexagon -disable-hsdr < %s | FileCheck %s + +; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets. +; CHECK: or(q{{[0-3]+}},q{{[0-3]+}}) +; CHECK: } +; CHECK-NOT: } +; CHECK: |= vand(q{{[0-3]+}},r{{[0-9]+}}) +; CHECK: endloop0 + +target triple = "hexagon-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @f0(i8* noalias nocapture readonly %a0, i32 %a1, i32 %a2, i32 %a3, i32* noalias nocapture %a4, i32 %a5) #0 { +b0: + %v0 = mul i32 %a2, 3 + %v1 = bitcast i32* %a4 to <16 x i32>* + %v2 = mul i32 %a5, -2 + %v3 = add i32 %v2, %a1 + %v4 = and i32 %a5, 63 + %v5 = add i32 %v3, %v4 + %v6 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 -1) + %v7 = lshr i32 %v5, 6 + %v8 = and i32 %v7, 7 + %v9 = and i32 %v5, 511 + %v10 = icmp eq i32 %v9, 0 + %v11 = shl i32 -1, %v8 + %v12 = select i1 %v10, i32 0, i32 %v11 + %v13 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %v12) + %v14 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 %v13) + %v15 = tail call <16 x i32> @llvm.hexagon.V6.vnot(<16 x i32> %v14) + %v16 = tail call <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32 %v5) + %v17 = shl i32 1, %v8 + %v18 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %v17) + %v19 = tail call <16 x i32> @llvm.hexagon.V6.vandqrt.acc(<16 x i32> %v15, <512 x i1> %v16, i32 %v18) + %v20 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %a3) + %v21 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 %v20) + %v22 = icmp sgt i32 %v5, 0 + br i1 %v22, label %b1, label %b8 + +b1: ; preds = %b0 + %v23 = getelementptr inbounds i8, i8* %a0, i32 %a5 + %v24 = bitcast i8* %v23 to <16 x i32>* + %v25 = load <16 x i32>, <16 x i32>* %v24, align 64, !tbaa !0 + %v26 = add i32 %a5, 64 + %v27 = getelementptr inbounds i8, i8* %a0, i32 %v26 + %v28 = bitcast i8* %v27 to <16 x i32>* + %v29 = add i32 %a5, -64 + %v30 = getelementptr inbounds i8, i8* %a0, i32 %v29 + %v31 = bitcast i8* %v30 to <16 x i32>* + %v32 = load <16 x i32>, <16 x i32>* %v31, align 64, !tbaa !0 + %v33 = tail call <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32 %a5) + %v34 = tail call <16 x i32> @llvm.hexagon.V6.vandqrt(<512 x i1> %v33, i32 16843009) + %v35 = tail call <16 x i32> @llvm.hexagon.V6.vnot(<16 x i32> %v34) + %v36 = add i32 %v0, %a5 + %v37 = getelementptr inbounds i8, i8* %a0, i32 %v36 + %v38 = bitcast i8* %v37 to <16 x i32>* + %v39 = sub i32 %a5, %v0 + %v40 = getelementptr inbounds i8, i8* %a0, i32 %v39 + %v41 = bitcast i8* %v40 to <16 x i32>* + %v42 = tail call <16 x i32> @llvm.hexagon.V6.vd0() + %v43 = add i32 %v4, %a1 + %v44 = mul i32 %a5, 2 + %v45 = sub i32 %v43, %v44 + %v46 = xor i32 %v45, -1 + %v47 = icmp sgt i32 %v46, -513 + %v48 = select i1 %v47, i32 %v46, i32 -513 + %v49 = add i32 %v48, %a1 + %v50 = add i32 %v49, %v4 + %v51 = add i32 %v50, 512 + %v52 = sub i32 %v51, 
%v44 + %v53 = lshr i32 %v52, 9 + %v54 = mul nuw nsw i32 %v53, 16 + %v55 = add nuw nsw i32 %v54, 16 + %v56 = getelementptr i32, i32* %a4, i32 %v55 + br label %b2 + +b2: ; preds = %b6, %b1 + %v57 = phi i32 [ %v46, %b1 ], [ %v125, %b6 ] + %v58 = phi i32 [ %v5, %b1 ], [ %v123, %b6 ] + %v59 = phi <16 x i32>* [ %v1, %b1 ], [ %v122, %b6 ] + %v60 = phi <16 x i32>* [ %v38, %b1 ], [ %v114, %b6 ] + %v61 = phi <16 x i32>* [ %v41, %b1 ], [ %v115, %b6 ] + %v62 = phi <16 x i32>* [ %v28, %b1 ], [ %v116, %b6 ] + %v63 = phi i32 [ 512, %b1 ], [ %v69, %b6 ] + %v64 = phi i32 [ -2139062144, %b1 ], [ %v117, %b6 ] + %v65 = phi <16 x i32> [ %v32, %b1 ], [ %v118, %b6 ] + %v66 = phi <16 x i32> [ %v25, %b1 ], [ %v119, %b6 ] + %v67 = phi <16 x i32> [ %v35, %b1 ], [ %v6, %b6 ] + %v68 = icmp slt i32 %v58, %v63 + %v69 = select i1 %v68, i32 %v58, i32 %v63 + %v70 = icmp sgt i32 %v69, 0 + br i1 %v70, label %b3, label %b6 + +b3: ; preds = %b2 + %v71 = xor i32 %v63, -1 + %v72 = icmp sgt i32 %v57, %v71 + %v73 = select i1 %v72, i32 %v57, i32 %v71 + %v74 = icmp sgt i32 %v73, -65 + %v75 = add i32 %v73, 63 + %v76 = select i1 %v74, i32 %v75, i32 -2 + %v77 = sub i32 %v76, %v73 + %v78 = lshr i32 %v77, 6 + br label %b4 + +b4: ; preds = %b4, %b3 + %v79 = phi i32 [ %v69, %b3 ], [ %v108, %b4 ] + %v80 = phi <16 x i32>* [ %v60, %b3 ], [ %v89, %b4 ] + %v81 = phi <16 x i32>* [ %v61, %b3 ], [ %v87, %b4 ] + %v82 = phi <16 x i32>* [ %v62, %b3 ], [ %v92, %b4 ] + %v83 = phi i32 [ %v64, %b3 ], [ %v106, %b4 ] + %v84 = phi <16 x i32> [ %v65, %b3 ], [ %v85, %b4 ] + %v85 = phi <16 x i32> [ %v66, %b3 ], [ %v93, %b4 ] + %v86 = phi <16 x i32> [ %v42, %b3 ], [ %v107, %b4 ] + %v87 = getelementptr inbounds <16 x i32>, <16 x i32>* %v81, i32 1 + %v88 = load <16 x i32>, <16 x i32>* %v81, align 64, !tbaa !0 + %v89 = getelementptr inbounds <16 x i32>, <16 x i32>* %v80, i32 1 + %v90 = load <16 x i32>, <16 x i32>* %v80, align 64, !tbaa !0 + %v91 = tail call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %v85, <16 x i32> %v84, i32 3) + %v92 = getelementptr inbounds <16 x i32>, <16 x i32>* %v82, i32 1 + %v93 = load <16 x i32>, <16 x i32>* %v82, align 64, !tbaa !0 + %v94 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %v93, <16 x i32> %v85, i32 3) + %v95 = tail call <16 x i32> @llvm.hexagon.V6.vsububsat(<16 x i32> %v85, <16 x i32> %v21) + %v96 = tail call <16 x i32> @llvm.hexagon.V6.vaddubsat(<16 x i32> %v85, <16 x i32> %v21) + %v97 = tail call <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32> %v88, <16 x i32> %v90) + %v98 = tail call <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32> %v88, <16 x i32> %v90) + %v99 = tail call <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32> %v94, <16 x i32> %v91) + %v100 = tail call <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32> %v94, <16 x i32> %v91) + %v101 = tail call <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32> %v97, <16 x i32> %v99) + %v102 = tail call <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32> %v98, <16 x i32> %v100) + %v103 = tail call <512 x i1> @llvm.hexagon.V6.vgtub(<16 x i32> %v101, <16 x i32> %v96) + %v104 = tail call <512 x i1> @llvm.hexagon.V6.vgtub(<16 x i32> %v95, <16 x i32> %v102) + %v105 = tail call <512 x i1> @llvm.hexagon.V6.pred.or(<512 x i1> %v103, <512 x i1> %v104) + %v106 = tail call i32 @llvm.hexagon.S6.rol.i.r(i32 %v83, i32 1) + %v107 = tail call <16 x i32> @llvm.hexagon.V6.vandqrt.acc(<16 x i32> %v86, <512 x i1> %v105, i32 %v106) + %v108 = add nsw i32 %v79, -64 + %v109 = icmp sgt i32 %v79, 64 + br i1 %v109, label %b4, label %b5 + +b5: ; preds = %b4 + %v110 = add nuw nsw i32 %v78, 1 + %v111 = 
getelementptr <16 x i32>, <16 x i32>* %v62, i32 %v110 + %v112 = getelementptr <16 x i32>, <16 x i32>* %v60, i32 %v110 + %v113 = getelementptr <16 x i32>, <16 x i32>* %v61, i32 %v110 + br label %b6 + +b6: ; preds = %b5, %b2 + %v114 = phi <16 x i32>* [ %v112, %b5 ], [ %v60, %b2 ] + %v115 = phi <16 x i32>* [ %v113, %b5 ], [ %v61, %b2 ] + %v116 = phi <16 x i32>* [ %v111, %b5 ], [ %v62, %b2 ] + %v117 = phi i32 [ %v106, %b5 ], [ %v64, %b2 ] + %v118 = phi <16 x i32> [ %v85, %b5 ], [ %v65, %b2 ] + %v119 = phi <16 x i32> [ %v93, %b5 ], [ %v66, %b2 ] + %v120 = phi <16 x i32> [ %v107, %b5 ], [ %v42, %b2 ] + %v121 = tail call <16 x i32> @llvm.hexagon.V6.vand(<16 x i32> %v120, <16 x i32> %v67) + %v122 = getelementptr inbounds <16 x i32>, <16 x i32>* %v59, i32 1 + store <16 x i32> %v121, <16 x i32>* %v59, align 64, !tbaa !0 + %v123 = add nsw i32 %v58, -512 + %v124 = icmp sgt i32 %v58, 512 + %v125 = add i32 %v57, 512 + br i1 %v124, label %b2, label %b7 + +b7: ; preds = %b6 + %v126 = bitcast i32* %v56 to <16 x i32>* + br label %b8 + +b8: ; preds = %b7, %b0 + %v127 = phi <16 x i32>* [ %v126, %b7 ], [ %v1, %b0 ] + %v128 = getelementptr inbounds <16 x i32>, <16 x i32>* %v127, i32 -1 + %v129 = load <16 x i32>, <16 x i32>* %v128, align 64, !tbaa !0 + %v130 = tail call <16 x i32> @llvm.hexagon.V6.vand(<16 x i32> %v129, <16 x i32> %v19) + store <16 x i32> %v130, <16 x i32>* %v128, align 64, !tbaa !0 + ret void +} + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vnot(<16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vandqrt(<512 x i1>, i32) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.vsplatrb(i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vandqrt.acc(<16 x i32>, <512 x i1>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vd0() #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vsububsat(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vaddubsat(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.vgtub(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.pred.or(<512 x i1>, <512 x i1>) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S6.rol.i.r(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vand(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" } +attributes #1 = { nounwind readnone } + +!0 = !{!1, !1, i64 0} +!1 = !{!"omnipotent char", !2, i64 0} +!2 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/mlong-calls.ll b/test/CodeGen/Hexagon/mlong-calls.ll new file mode 100644 index 
00000000000..d76b87f987f --- /dev/null +++ b/test/CodeGen/Hexagon/mlong-calls.ll @@ -0,0 +1,41 @@ +; RUN: llc -hexagon-long-calls -march=hexagon -enable-save-restore-long=true < %s | FileCheck %s + +; CHECK: call ##f1 +; CHECK: jump ##__restore + +; Function Attrs: minsize nounwind +define i64 @f0(i32 %a0, i32 %a1) #0 { +b0: + %v0 = add nsw i32 %a0, 5 + %v1 = tail call i64 @f1(i32 %v0) #1 + %v2 = sext i32 %a1 to i64 + %v3 = add nsw i64 %v1, %v2 + ret i64 %v3 +} + +; Function Attrs: minsize nounwind +declare i64 @f1(i32) #0 + +; Function Attrs: nounwind +define i64 @f2(i32 %a0, i32 %a1) #1 { +b0: + %v0 = add nsw i32 %a0, 5 + %v1 = tail call i64 @f1(i32 %v0) #1 + ret i64 %v1 +} + +; Function Attrs: noreturn nounwind +define i64 @f3(i32 %a0, i32 %a1) #2 { +b0: + %v0 = add nsw i32 %a0, 5 + %v1 = tail call i64 @f4(i32 %v0) #2 + unreachable +} + +; Function Attrs: noreturn +declare i64 @f4(i32) #3 + +attributes #0 = { minsize nounwind } +attributes #1 = { nounwind } +attributes #2 = { noreturn nounwind } +attributes #3 = { noreturn } diff --git a/test/CodeGen/Hexagon/simplify64bitops_7223.ll b/test/CodeGen/Hexagon/simplify64bitops_7223.ll new file mode 100644 index 00000000000..56093c1d00c --- /dev/null +++ b/test/CodeGen/Hexagon/simplify64bitops_7223.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner < %s +; REQUIRES: asserts +; CHECK-NOT: and( +; CHECK-NOT: or( +; CHECK-NOT: combine(0 +; CHECK: add +; CHECK: add( +; CHECK-NEXT: memuh( +; CHECK-NEXT: endloop + +%s.22 = type { i64 } + +@g0 = common global i32 0, align 4 + +; Function Attrs: nounwind +define i64 @f0(%s.22* nocapture %a0, i32 %a1) #0 { +b0: + %v0 = bitcast %s.22* %a0 to i16* + %v1 = load i16, i16* %v0, align 2, !tbaa !0 + %v2 = zext i16 %v1 to i64 + %v3 = icmp sgt i32 %a1, 0 + br i1 %v3, label %b1, label %b4 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v4 = phi i16* [ %v8, %b2 ], [ %v0, %b1 ] + %v5 = phi i32 [ %v10, %b2 ], [ undef, %b1 ] + %v6 = phi i32 [ %v15, %b2 ], [ 0, %b1 ] + %v7 = phi i64 [ %v14, %b2 ], [ %v2, %b1 ] + %v8 = getelementptr inbounds i16, i16* %v4, i32 1 + %v9 = trunc i64 %v7 to i32 + %v10 = add i32 %v5, %v9 + %v11 = load i16, i16* %v8, align 2, !tbaa !0 + %v12 = zext i16 %v11 to i64 + %v13 = and i64 %v7, -4294967296 + %v14 = or i64 %v12, %v13 + %v15 = add nsw i32 %v6, 1 + %v16 = icmp eq i32 %v15, %a1 + br i1 %v16, label %b3, label %b2 + +b3: ; preds = %b2 + br label %b4 + +b4: ; preds = %b3, %b0 + %v17 = phi i32 [ undef, %b0 ], [ %v10, %b3 ] + %v18 = phi i64 [ %v2, %b0 ], [ %v14, %b3 ] + store volatile i32 %v17, i32* @g0, align 4, !tbaa !4 + ret i64 %v18 +} + +attributes #0 = { nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"short", !2} +!2 = !{!"omnipotent char", !3} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"long", !2} diff --git a/test/CodeGen/Hexagon/swp-carried-1.ll b/test/CodeGen/Hexagon/swp-carried-1.ll new file mode 100644 index 00000000000..e0aff5cb28c --- /dev/null +++ b/test/CodeGen/Hexagon/swp-carried-1.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched < %s | FileCheck %s + +; Test that we generate the correct code when a loop carried value +; is scheduled one stage earlier than it's use. The code in +; isLoopCarried was returning false in this case, and the generated +; code was missing an copy. 
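+
+; Concretely, in the IR below %v6 is a phi that takes the previous
+; iteration's value of %v5 (itself a phi fed by a load), and %v6 is
+; consumed, via %v12, by the multiply and add that feed the accumulator.
+; Because that use lands a stage later than the load, the kernel must
+; carry the value in a register copy rather than re-load it, which is
+; what the CHECK lines verify: [[REG0]] is refreshed by a register copy
+; and never by a memw load.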
+ +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: += mpy([[REG0:(r[0-9]+)]],r{{[0-9]+}}) +; CHECK: [[REG0]] = r{{[0-9]+}} +; CHECK-NOT: [[REG0]] = memw +; CHECK: endloop0 + +@g0 = external global [256 x i32], align 8 + +define void @f0() #0 { +b0: + br label %b1 + +b1: ; preds = %b1, %b0 + br i1 undef, label %b2, label %b1 + +b2: ; preds = %b1 + br label %b3 + +b3: ; preds = %b3, %b2 + %v0 = phi i32* [ getelementptr inbounds ([256 x i32], [256 x i32]* @g0, i32 0, i32 0), %b2 ], [ %v1, %b3 ] + %v1 = getelementptr i32, i32* %v0, i32 6 + br i1 undef, label %b4, label %b3 + +b4: ; preds = %b3 + br i1 undef, label %b6, label %b5 + +b5: ; preds = %b5, %b4 + %v2 = phi i64 [ %v19, %b5 ], [ undef, %b4 ] + %v3 = phi i32* [ %v8, %b5 ], [ %v1, %b4 ] + %v4 = phi i32 [ %v9, %b5 ], [ undef, %b4 ] + %v5 = phi i32 [ %v11, %b5 ], [ undef, %b4 ] + %v6 = phi i32 [ %v5, %b5 ], [ undef, %b4 ] + %v7 = phi i32 [ %v10, %b5 ], [ 0, %b4 ] + %v8 = getelementptr i32, i32* %v3, i32 1 + %v9 = add nsw i32 %v4, 1 + %v10 = load i32, i32* %v8, align 4 + %v11 = load i32, i32* null, align 4 + %v12 = sext i32 %v6 to i64 + %v13 = sext i32 %v10 to i64 + %v14 = sext i32 %v7 to i64 + %v15 = mul nsw i64 %v14, %v12 + %v16 = add i64 %v12, %v2 + %v17 = add i64 %v16, %v13 + %v18 = add i64 %v17, 0 + %v19 = add i64 %v18, %v15 + %v20 = icmp eq i32 %v9, 128 + br i1 %v20, label %b6, label %b5 + +b6: ; preds = %b5, %b4 + %v21 = phi i64 [ undef, %b4 ], [ %v19, %b5 ] + unreachable +} + +attributes #0 = { nounwind "target-cpu"="hexagonv62" } diff --git a/test/CodeGen/Hexagon/swp-change-deps.ll b/test/CodeGen/Hexagon/swp-change-deps.ll new file mode 100644 index 00000000000..cf9dc79ad69 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-change-deps.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Test that we generate the correct offsets for loads in the prolog +; after removing dependences on a post-increment instructions of the +; base register. + +; CHECK: memh([[REG0:(r[0-9]+)]]+#0) +; CHECK: memh([[REG0]]+#2) +; CHECK: loop0 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.A2.sath(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.A2.asrh(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.A2.addsat(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32, i32) #1 + +define void @f0() #0 align 2 { +b0: + br label %b1 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v0 = phi i16* [ undef, %b1 ], [ %v14, %b2 ] + %v1 = phi i32 [ 0, %b1 ], [ %v12, %b2 ] + %v2 = load i16, i16* %v0, align 2 + %v3 = sext i16 %v2 to i32 + %v4 = call i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32 undef, i32 %v3) + %v5 = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v4, i32 undef) + %v6 = call i32 @llvm.hexagon.A2.addsat(i32 %v5, i32 32768) + %v7 = call i32 @llvm.hexagon.A2.asrh(i32 %v6) + %v8 = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v7, i32 undef) + %v9 = call i32 @llvm.hexagon.A2.sath(i32 %v8) + %v10 = trunc i32 %v9 to i16 + store i16 %v10, i16* null, align 2 + %v11 = trunc i32 %v7 to i16 + store i16 %v11, i16* %v0, align 2 + %v12 = add nsw i32 %v1, 1 + %v13 = icmp slt i32 %v12, undef + %v14 = getelementptr i16, i16* %v0, i32 1 + br i1 %v13, label %b2, label %b3 + +b3: ; preds = %b2 + unreachable + +b4: ; No predecessors! 
+ unreachable +} + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/swp-epilog-numphis.ll b/test/CodeGen/Hexagon/swp-epilog-numphis.ll new file mode 100644 index 00000000000..0af6c5a327d --- /dev/null +++ b/test/CodeGen/Hexagon/swp-epilog-numphis.ll @@ -0,0 +1,82 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK: endloop0 +; CHECK: vmem +; CHECK: vmem([[REG:r([0-9]+)]]+#1) = +; CHECK: vmem([[REG]]+#0) = + +define void @f0(i32 %a0) local_unnamed_addr #0 { +b0: + br label %b1 + +b1: ; preds = %b1, %b0 + %v0 = phi i32 [ %v33, %b1 ], [ %a0, %b0 ] + %v1 = phi <16 x i32>* [ %v32, %b1 ], [ undef, %b0 ] + %v2 = phi <16 x i32>* [ %v23, %b1 ], [ undef, %b0 ] + %v3 = phi <16 x i32>* [ %v10, %b1 ], [ undef, %b0 ] + %v4 = phi <16 x i32>* [ %v8, %b1 ], [ null, %b0 ] + %v5 = phi <32 x i32> [ %v12, %b1 ], [ undef, %b0 ] + %v6 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v5) + %v7 = tail call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %v6, <16 x i32> undef, i32 6) + %v8 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 1 + %v9 = load <16 x i32>, <16 x i32>* %v4, align 64 + %v10 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 1 + %v11 = load <16 x i32>, <16 x i32>* %v3, align 64 + %v12 = tail call <32 x i32> @llvm.hexagon.V6.vsububh(<16 x i32> %v11, <16 x i32> %v9) + %v13 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v12) + %v14 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v13, <16 x i32> undef) + %v15 = tail call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %v14, <16 x i32> undef, i32 4) + %v16 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v14, <16 x i32> %v15) + %v17 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %v14, <16 x i32> undef, i32 4) + %v18 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %v16, <16 x i32> undef, i32 2) + %v19 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> undef, <16 x i32> %v17) + %v20 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v18, <16 x i32> %v19) + %v21 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 1 + %v22 = load <16 x i32>, <16 x i32>* %v2, align 64 + %v23 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 2 + %v24 = load <16 x i32>, <16 x i32>* %v21, align 64 + %v25 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v22, <16 x i32> %v7) + %v26 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v24, <16 x i32> undef) + %v27 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v25, <16 x i32> %v20) + %v28 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v26, <16 x i32> %v20) + store <16 x i32> %v27, <16 x i32>* %v2, align 64 + store <16 x i32> %v28, <16 x i32>* %v21, align 64 + %v29 = tail call <16 x i32> @llvm.hexagon.V6.vmpyhsrs(<16 x i32> %v27, i32 17760527) + %v30 = tail call <16 x i32> @llvm.hexagon.V6.vmpyhsrs(<16 x i32> %v28, i32 17760527) + %v31 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v30, <16 x i32> %v29) + %v32 = getelementptr inbounds <16 x i32>, <16 x i32>* %v1, i32 1 + store <16 x i32> %v31, <16 x i32>* %v1, align 64 + %v33 = add nsw i32 %v0, -64 + %v34 = icmp sgt i32 %v0, 192 + br i1 %v34, label %b1, label %b2 + +b2: ; preds = %b1 + unreachable +} + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vsububh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: 
nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vmpyhsrs(<16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv65" "target-features"="+hvxv65,+hvx-length64b" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/swp-epilog-phi9.ll b/test/CodeGen/Hexagon/swp-epilog-phi9.ll new file mode 100644 index 00000000000..db92a33b559 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-epilog-phi9.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Test that we generate the correct Phi name in the last couple of epilog +; blocks, when there are 3 epilog blocks. The Phi was scheduled in stage +; 2, so the computation for the number of Phis needs to be adjusted when +; the incoming prolog block is from prolog 0 or prolog 1. +; Note: the pipeliner no longer generates a 3 stage pipeline for this test. + +; CHECK: loop0 +; CHECK: [[REG0:r([0-9]+)]] = add(r{{[0-8]+}},#8) +; CHECK: endloop0 +; CHECK: [[REG0]] = add(r{{[0-9]+}},#8) + +; Function Attrs: nounwind +define void @f0(i16* nocapture readonly %a0) #0 { +b0: + %v0 = alloca [129 x i32], align 8 + br i1 undef, label %b1, label %b3 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ] + %v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ] + %v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ] + %v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ] + %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ] + %v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ] + %v7 = load i16, i16* %v2, align 2 + %v8 = sext i16 %v7 to i32 + %v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2 + %v10 = load i16, i16* %v6, align 2 + %v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2 + store i32 %v11, i32* %v4, align 4 + %v12 = getelementptr inbounds i16, i16* %v6, i32 -1 + %v13 = add i32 %v5, 1 + %v14 = getelementptr inbounds i32, i32* %v3, i32 2 + %v15 = getelementptr inbounds i16, i16* %v1, i32 2 + %v16 = icmp slt i32 %v13, undef + br i1 %v16, label %b2, label %b3 + +b3: ; preds = %b2, %b0 + unreachable +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } diff --git a/test/CodeGen/Hexagon/swp-phi-ref.ll b/test/CodeGen/Hexagon/swp-phi-ref.ll new file mode 100644 index 00000000000..1b6def17bd9 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-phi-ref.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s | FileCheck %s + +; Test that we generate the correct Phi values when there is a Phi that +; references another Phi. We need to examine the other Phi to get the +; correct value. We need to do this even if we haven't generated the +; kernel code for the other Phi yet. 
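+
+; In the IR below, %v2 takes the previous iteration's value of %v1, and
+; %v1 is itself a phi (fed by %v4 inside the loop).  Resolving %v2 means
+; looking through %v1, and the expected code keeps the older value alive
+; with the vector copies matched below: one before loop0 and one inside
+; the kernel.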
+ +; CHECK: [[REG0:(v[0-9]+)]] = [[REG1:(v[0-9]+)]] +; CHECK: loop0 +; CHECK: [[REG0]] = [[REG1]] +; CHECK: endloop0 + +; Function Attrs: nounwind +define void @f0() #0 { +b0: + br i1 undef, label %b1, label %b2 + +b1: ; preds = %b1, %b0 + %v0 = phi i32 [ %v7, %b1 ], [ 0, %b0 ] + %v1 = phi <16 x i32> [ %v4, %b1 ], [ undef, %b0 ] + %v2 = phi <16 x i32> [ %v1, %b1 ], [ undef, %b0 ] + %v3 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v1, <16 x i32> %v2, i32 62) + %v4 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> undef, <16 x i32> undef) + %v5 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v4, <16 x i32> %v1, i32 2) + %v6 = tail call <16 x i32> @llvm.hexagon.V6.vabsdiffh(<16 x i32> %v3, <16 x i32> %v5) + store <16 x i32> %v6, <16 x i32>* null, align 64 + %v7 = add nsw i32 %v0, 1 + %v8 = icmp slt i32 %v7, undef + br i1 %v8, label %b1, label %b2 + +b2: ; preds = %b1, %b0 + ret void +} + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vabsdiffh(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/swp-phi-start.ll b/test/CodeGen/Hexagon/swp-phi-start.ll new file mode 100644 index 00000000000..0e451f924a9 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-phi-start.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 -disable-packetizer < %s | FileCheck %s + +; Test that the early start and late start values are computed correctly +; when a Phi depends on another Phi. In this case, they should occur in +; the same stage. + +; CHECK-DAG: [[REG3:(r[0-9]+)]] = add([[REG1:(r[0-9]+)]],#-1) +; CHECK-DAG: [[REG2:(r[0-9]+)]] = add([[REG1]],#-1) +; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],[[REG3]]) +; CHECK-NOT: = [[REG2]] +; CHECK: .LBB0_[[LOOP]]: +; CHECK: }{{[ \t]*}}:endloop + +; Function Attrs: nounwind +define void @f0(i32 %a0, i16* nocapture %a1) #0 { +b0: + br i1 undef, label %b1, label %b2 + +b1: ; preds = %b0 + %v0 = add nsw i32 undef, -8 + br i1 undef, label %b3, label %b2 + +b2: ; preds = %b2, %b1, %b0 + %v1 = phi i32 [ %v7, %b2 ], [ undef, %b0 ], [ %v0, %b1 ] + %v2 = phi i32 [ %v1, %b2 ], [ %a0, %b0 ], [ undef, %b1 ] + %v3 = add nsw i32 %v2, -2 + %v4 = getelementptr inbounds i16, i16* %a1, i32 %v3 + %v5 = load i16, i16* %v4, align 2, !tbaa !0 + %v6 = getelementptr inbounds i16, i16* %a1, i32 %v1 + store i16 %v5, i16* %v6, align 2, !tbaa !0 + %v7 = add nsw i32 %v1, -1 + %v8 = icmp sgt i32 %v7, 0 + br i1 %v8, label %b2, label %b3 + +b3: ; preds = %b2, %b1 + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } + +!0 = !{!1, !1, i64 0} +!1 = !{!"short", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/swp-rename.ll b/test/CodeGen/Hexagon/swp-rename.ll new file mode 100644 index 00000000000..7f5060134f4 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-rename.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s + +; A test that the Phi rewrite logic is correct. 
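+
+; Both stores in the loop go through the pointer phi %v1, which starts at
+; null and advances by two i16 elements per iteration.  After pipelining,
+; the rewritten phi uses must still be based on that pointer, hence the
+; checks below expect a base register initialized to #0 and a halfword
+; store through it inside the loop.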
+ +; CHECK: [[REG0:(r[0-9]+)]] = #0 +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: memh([[REG0]]+#0) = #0 + +define void @f0() #0 { +b0: + %v0 = add i32 undef, -4 + br label %b1 + +b1: ; preds = %b1, %b0 + %v1 = phi i16* [ %v4, %b1 ], [ null, %b0 ] + %v2 = phi i32 [ %v5, %b1 ], [ 0, %b0 ] + %v3 = getelementptr inbounds i16, i16* %v1, i32 1 + store i16 0, i16* %v1, align 2 + %v4 = getelementptr inbounds i16, i16* %v1, i32 2 + store i16 0, i16* %v3, align 2 + %v5 = add nsw i32 %v2, 8 + %v6 = icmp slt i32 %v5, %v0 + br i1 %v6, label %b1, label %b2 + +b2: ; preds = %b1 + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } diff --git a/test/CodeGen/Hexagon/swp-xxh2.ll b/test/CodeGen/Hexagon/swp-xxh2.ll new file mode 100644 index 00000000000..55f39e263d5 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-xxh2.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s +; REQUIRES: asserts + +; Fix bug when pipelining xxh benchmark at O3, mv55, and with vectorization. +; The problem is choosing the correct name for the Phis in the epilog. + +; CHECK: New block +; CHECK: %{{.*}}, %[[REG:([0-9]+)]]{{.*}} = L2_loadri_pi +; CHECK: epilog: +; CHECK: = PHI +; CHECK-NOT: = PHI %{{[0-9]+}}, {{.*}}, %[[REG]] +; CHECK: = PHI + +; Function Attrs: nounwind +define void @f0(i32 %a0, i32* %a1) #0 { +b0: + %v0 = ashr i32 %a0, 1 + br label %b1 + +b1: ; preds = %b1, %b0 + %v1 = phi i64 [ %v8, %b1 ], [ undef, %b0 ] + %v2 = phi i32 [ %v9, %b1 ], [ 0, %b0 ] + %v3 = phi i32 [ %v7, %b1 ], [ undef, %b0 ] + %v4 = inttoptr i32 %v3 to i32* + %v5 = load i32, i32* %v4, align 4, !tbaa !0 + %v6 = tail call i64 @llvm.hexagon.S2.packhl(i32 %v5, i32 undef) + %v7 = add nsw i32 %v3, -16 + %v8 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %v1, i64 undef, i64 %v6) + %v9 = add nsw i32 %v2, 1 + %v10 = icmp eq i32 %v9, %v0 + br i1 %v10, label %b2, label %b1 + +b2: ; preds = %b1 + %v11 = trunc i64 %v8 to i32 + %v12 = getelementptr inbounds i32, i32* %a1, i32 8 + store i32 %v11, i32* %v12, align 4, !tbaa !0 + call void @llvm.trap() + unreachable +} + +; Function Attrs: nounwind readnone +declare i64 @llvm.hexagon.M2.vdmacs.s0(i64, i64, i64) #1 + +; Function Attrs: nounwind readnone +declare i64 @llvm.hexagon.S2.packhl(i32, i32) #1 + +; Function Attrs: noreturn nounwind +declare void @llvm.trap() #2 + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } +attributes #1 = { nounwind readnone } +attributes #2 = { noreturn nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/vect-downscale.ll b/test/CodeGen/Hexagon/vect-downscale.ll new file mode 100644 index 00000000000..514581789e1 --- /dev/null +++ b/test/CodeGen/Hexagon/vect-downscale.ll @@ -0,0 +1,177 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Make sure we generate a hardware loop and pipeline the inner loop using +; 4 packets, which is equivalent to the hand-coded version. 
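+
+; The loop that matters is the inner one (b5 in the IR below): four vector
+; loads, four vdmpybus.acc multiply-accumulates, a vpackob pack, and a
+; vector store.  The CHECK lines count the packets between the loop label
+; and :endloop0 to confirm the whole kernel is scheduled in four packets.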
+ +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: { +; CHECK: } +; CHECK: { +; CHECK: } +; CHECK: { +; CHECK: } +; CHECK: { +; CHECK-NOT: } +; CHECK: }{{[ \t]*}}:endloop0 + +define void @f0(i8* noalias %a0, i32 %a1, i32 %a2, i32 %a3, i8* noalias nocapture %a4, i32 %a5, i32 %a6) #0 { +b0: + %v0 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 8388736) + %v1 = zext i32 %a3 to i64 + %v2 = shl nuw i64 %v1, 32 + %v3 = zext i32 %a1 to i64 + %v4 = shl nuw nsw i64 %v3, 16 + %v5 = or i64 %v4, %v2 + %v6 = or i64 %v5, 281474976710658 + tail call void asm sideeffect " l2fetch($0, $1)\0A", "r,r"(i8* %a0, i64 %v6) #2, !srcloc !0 + %v7 = tail call i32 @llvm.hexagon.S2.ct0(i32 %a6) + %v8 = add i32 %v7, 1 + %v9 = lshr i32 %a1, %v8 + %v10 = mul i32 %a6, 2 + %v11 = mul i32 %v10, %v9 + %v12 = sub i32 %a1, %v11 + %v13 = lshr i32 %v12, 1 + %v14 = tail call <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32 %v13) + %v15 = icmp eq i32 %a2, 0 + br i1 %v15, label %b11, label %b1 + +b1: ; preds = %b0 + %v16 = mul i32 %a3, 2 + %v17 = icmp eq i32 %v9, 0 + %v18 = icmp eq i32 %v11, %a1 + %v19 = icmp ugt i32 %v12, %a6 + %v20 = mul i32 %v9, 64 + %v21 = getelementptr i8, i8* %a4, i32 %v20 + %v22 = mul i32 %v9, 128 + %v23 = add i32 %v22, %a3 + %v24 = getelementptr i8, i8* %a0, i32 %v23 + %v25 = getelementptr i8, i8* %a0, i32 %v22 + br label %b2 + +b2: ; preds = %b10, %b1 + %v26 = phi i8* [ %v25, %b1 ], [ %v90, %b10 ] + %v27 = phi i8* [ %v24, %b1 ], [ %v89, %b10 ] + %v28 = phi i8* [ %v21, %b1 ], [ %v88, %b10 ] + %v29 = phi <16 x i32> [ undef, %b1 ], [ %v85, %b10 ] + %v30 = phi <16 x i32> [ undef, %b1 ], [ %v84, %b10 ] + %v31 = phi i8* [ %a0, %b1 ], [ %v86, %b10 ] + %v32 = phi i8* [ %a4, %b1 ], [ %v87, %b10 ] + %v33 = phi i32 [ 0, %b1 ], [ %v37, %b10 ] + %v34 = bitcast i8* %v26 to <16 x i32>* + %v35 = bitcast i8* %v27 to <16 x i32>* + %v36 = bitcast i8* %v28 to <16 x i32>* + %v37 = add nsw i32 %v33, 2 + %v38 = icmp ult i32 %v37, %a2 + br i1 %v38, label %b3, label %b4 + +b3: ; preds = %b2 + %v39 = getelementptr inbounds i8, i8* %v31, i32 %v16 + tail call void asm sideeffect " l2fetch($0, $1)\0A", "r,r"(i8* %v39, i64 %v6) #2, !srcloc !1 + br label %b4 + +b4: ; preds = %b3, %b2 + %v40 = bitcast i8* %v32 to <16 x i32>* + %v41 = bitcast i8* %v31 to <16 x i32>* + %v42 = getelementptr inbounds i8, i8* %v31, i32 %a3 + %v43 = bitcast i8* %v42 to <16 x i32>* + br i1 %v17, label %b6, label %b5 + +b5: ; preds = %b5, %b4 + %v44 = phi <16 x i32>* [ %v54, %b5 ], [ %v43, %b4 ] + %v45 = phi <16 x i32>* [ %v52, %b5 ], [ %v41, %b4 ] + %v46 = phi <16 x i32>* [ %v61, %b5 ], [ %v40, %b4 ] + %v47 = phi i32 [ %v62, %b5 ], [ 0, %b4 ] + %v48 = getelementptr inbounds <16 x i32>, <16 x i32>* %v45, i32 1 + %v49 = load <16 x i32>, <16 x i32>* %v45, align 64, !tbaa !2 + %v50 = getelementptr inbounds <16 x i32>, <16 x i32>* %v44, i32 1 + %v51 = load <16 x i32>, <16 x i32>* %v44, align 64, !tbaa !2 + %v52 = getelementptr inbounds <16 x i32>, <16 x i32>* %v45, i32 2 + %v53 = load <16 x i32>, <16 x i32>* %v48, align 64, !tbaa !2 + %v54 = getelementptr inbounds <16 x i32>, <16 x i32>* %v44, i32 2 + %v55 = load <16 x i32>, <16 x i32>* %v50, align 64, !tbaa !2 + %v56 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v49, i32 1077952576) + %v57 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v53, i32 1077952576) + %v58 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v56, <16 x i32> %v51, i32 1077952576) + %v59 = tail call <16 x i32> 
@llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v57, <16 x i32> %v55, i32 1077952576) + %v60 = tail call <16 x i32> @llvm.hexagon.V6.vpackob(<16 x i32> %v59, <16 x i32> %v58) + %v61 = getelementptr inbounds <16 x i32>, <16 x i32>* %v46, i32 1 + store <16 x i32> %v60, <16 x i32>* %v46, align 64, !tbaa !2 + %v62 = add nsw i32 %v47, 1 + %v63 = icmp eq i32 %v62, %v9 + br i1 %v63, label %b6, label %b5 + +b6: ; preds = %b5, %b4 + %v64 = phi <16 x i32> [ %v29, %b4 ], [ %v55, %b5 ] + %v65 = phi <16 x i32> [ %v30, %b4 ], [ %v53, %b5 ] + %v66 = phi <16 x i32>* [ %v43, %b4 ], [ %v35, %b5 ] + %v67 = phi <16 x i32>* [ %v41, %b4 ], [ %v34, %b5 ] + %v68 = phi <16 x i32>* [ %v40, %b4 ], [ %v36, %b5 ] + br i1 %v18, label %b10, label %b7 + +b7: ; preds = %b6 + %v69 = load <16 x i32>, <16 x i32>* %v67, align 64, !tbaa !2 + %v70 = load <16 x i32>, <16 x i32>* %v66, align 64, !tbaa !2 + br i1 %v19, label %b8, label %b9 + +b8: ; preds = %b7 + %v71 = getelementptr inbounds <16 x i32>, <16 x i32>* %v66, i32 1 + %v72 = getelementptr inbounds <16 x i32>, <16 x i32>* %v67, i32 1 + %v73 = load <16 x i32>, <16 x i32>* %v72, align 64, !tbaa !2 + %v74 = load <16 x i32>, <16 x i32>* %v71, align 64, !tbaa !2 + br label %b9 + +b9: ; preds = %b8, %b7 + %v75 = phi <16 x i32> [ %v73, %b8 ], [ %v65, %b7 ] + %v76 = phi <16 x i32> [ %v74, %b8 ], [ %v64, %b7 ] + %v77 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v69, i32 1077952576) + %v78 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v75, i32 1077952576) + %v79 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v77, <16 x i32> %v70, i32 1077952576) + %v80 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v78, <16 x i32> %v76, i32 1077952576) + %v81 = tail call <16 x i32> @llvm.hexagon.V6.vpackob(<16 x i32> %v80, <16 x i32> %v79) + %v82 = load <16 x i32>, <16 x i32>* %v68, align 64, !tbaa !2 + %v83 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<512 x i1> %v14, <16 x i32> %v81, <16 x i32> %v82) + store <16 x i32> %v83, <16 x i32>* %v68, align 64, !tbaa !2 + br label %b10 + +b10: ; preds = %b9, %b6 + %v84 = phi <16 x i32> [ %v75, %b9 ], [ %v65, %b6 ] + %v85 = phi <16 x i32> [ %v76, %b9 ], [ %v64, %b6 ] + %v86 = getelementptr inbounds i8, i8* %v31, i32 %v16 + %v87 = getelementptr inbounds i8, i8* %v32, i32 %a5 + %v88 = getelementptr i8, i8* %v28, i32 %a5 + %v89 = getelementptr i8, i8* %v27, i32 %v16 + %v90 = getelementptr i8, i8* %v26, i32 %v16 + br i1 %v38, label %b2, label %b11 + +b11: ; preds = %b10, %b0 + ret void +} + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.ct0(i32) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vpackob(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vmux(<512 x i1>, <16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } + +!0 = !{i32 -2146401371} +!1 = !{i32 -2146401153} +!2 = !{!3, !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} -- 2.11.0