From 14bde67afca5a2790e75304dae16111d922c8b83 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 20 Mar 2018 19:35:09 +0000 Subject: [PATCH] [Hexagon] Add a few more lit tests, NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328023 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll | 24 +++ test/CodeGen/Hexagon/jump-prob.ll | 164 ++++++++++++++++++ test/CodeGen/Hexagon/late_instr.ll | 231 ++++++++++++++++++++++++++ test/CodeGen/Hexagon/mlong-calls.ll | 41 +++++ test/CodeGen/Hexagon/simplify64bitops_7223.ll | 61 +++++++ test/CodeGen/Hexagon/swp-carried-1.ll | 62 +++++++ test/CodeGen/Hexagon/swp-change-deps.ll | 61 +++++++ test/CodeGen/Hexagon/swp-epilog-numphis.ll | 82 +++++++++ test/CodeGen/Hexagon/swp-epilog-phi9.ll | 55 ++++++ test/CodeGen/Hexagon/swp-phi-ref.ll | 45 +++++ test/CodeGen/Hexagon/swp-phi-start.ll | 44 +++++ test/CodeGen/Hexagon/swp-rename.ll | 30 ++++ test/CodeGen/Hexagon/swp-xxh2.ll | 57 +++++++ test/CodeGen/Hexagon/vect-downscale.ll | 177 ++++++++++++++++++++ 14 files changed, 1134 insertions(+) create mode 100644 test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll create mode 100644 test/CodeGen/Hexagon/jump-prob.ll create mode 100644 test/CodeGen/Hexagon/late_instr.ll create mode 100644 test/CodeGen/Hexagon/mlong-calls.ll create mode 100644 test/CodeGen/Hexagon/simplify64bitops_7223.ll create mode 100644 test/CodeGen/Hexagon/swp-carried-1.ll create mode 100644 test/CodeGen/Hexagon/swp-change-deps.ll create mode 100644 test/CodeGen/Hexagon/swp-epilog-numphis.ll create mode 100644 test/CodeGen/Hexagon/swp-epilog-phi9.ll create mode 100644 test/CodeGen/Hexagon/swp-phi-ref.ll create mode 100644 test/CodeGen/Hexagon/swp-phi-start.ll create mode 100644 test/CodeGen/Hexagon/swp-rename.ll create mode 100644 test/CodeGen/Hexagon/swp-xxh2.ll create mode 100644 test/CodeGen/Hexagon/vect-downscale.ll diff --git a/test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll b/test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll new file mode 100644 index 00000000000..d79cbd413d9 --- /dev/null +++ b/test/CodeGen/Hexagon/hexagon-cond-jumpr31.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=hexagon -O3 < %s | FileCheck %s +; CHECK: if (!p{{[0-3]}}.new) jumpr:nt r31 +; CHECK-NOT: .falign + +@g0 = common global i8 0, align 1 +@g1 = common global i32 0, align 4 + +define i32 @f0(i32* nocapture %a0) { +b0: + %v0 = load i8, i8* @g0, align 1 + %v1 = icmp eq i8 %v0, 65 + br i1 %v1, label %b1, label %b2 + +b1: ; preds = %b0 + %v2 = load i32, i32* %a0, align 4 + %v3 = add nsw i32 %v2, 9 + %v4 = load i32, i32* @g1, align 4 + %v5 = sub i32 %v3, %v4 + store i32 %v5, i32* %a0, align 4 + br label %b2 + +b2: ; preds = %b1, %b0 + ret i32 undef +} diff --git a/test/CodeGen/Hexagon/jump-prob.ll b/test/CodeGen/Hexagon/jump-prob.ll new file mode 100644 index 00000000000..a5f420df0df --- /dev/null +++ b/test/CodeGen/Hexagon/jump-prob.ll @@ -0,0 +1,164 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK: { +; CHECK: jump .LBB0_ +; CHECK: r{{[0-9]+}} = +; CHECK: memw +; CHECK: } + +target triple = "hexagon-unknown--elf" + +%s.0 = type { i8, i8, i8, [6 x i32] } +%s.1 = type { %s.2 } +%s.2 = type { i32, i8* } +%s.3 = type <{ i8*, i8*, i16, i8, i8, i8 }> + +@g0 = internal global [2 x %s.0] [%s.0 { i8 0, i8 6, i8 7, [6 x i32] zeroinitializer }, %s.0 { i8 0, i8 6, i8 7, [6 x i32] zeroinitializer }], align 8 +@g1 = internal constant [60 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g2 = internal constant %s.1 { %s.2 { 
i32 24, i8* getelementptr inbounds ([60 x i8], [60 x i8]* @g1, i32 0, i32 0) } }, section ".rodata.xxxxxxxxxx.", align 4 +@g3 = internal constant [115 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g4 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @g6, i32 0, i32 0), i16 215, i8 4, i8 0, i8 1 }>, align 1 +@g5 = private unnamed_addr constant [120 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g6 = private unnamed_addr constant [31 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g7 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([91 x i8], [91 x i8]* @g8, i32 0, i32 0), i16 225, i8 2, i8 2, i8 2 }>, align 1 +@g8 = private unnamed_addr constant [91 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g9 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([109 x i8], [109 x i8]* @g10, i32 0, i32 0), i16 233, i8 2, i8 2, i8 4 }>, align 1 +@g10 = private unnamed_addr constant [109 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 +@g11 = internal constant [116 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g12 = internal constant [134 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g13 = internal constant %s.3 <{ i8* getelementptr inbounds ([120 x i8], [120 x i8]* @g5, i32 0, i32 0), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @g6, i32 0, i32 0), i16 264, i8 4, i8 0, i8 1 }>, align 1 +@g14 = internal constant [116 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 +@g15 = internal constant [134 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", section "xxxxxxxxxxx.rodata.", align 4 + +; Function Attrs: nounwind +define zeroext i8 @f0(i8 zeroext %a0, i8 zeroext %a1, i8* nocapture %a2) #0 { +b0: + store i8 -1, i8* %a2, align 1, !tbaa !0 + %v0 = zext i8 %a0 to i32 + %v1 = icmp ugt i8 %a0, 7 + %v2 = zext i8 %a1 to i32 + %v3 = icmp ugt i8 %a1, 5 + %v4 = or i1 %v1, %v3 + br i1 %v4, label %b1, label %b2 + +b1: ; preds = %b0 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b2: ; preds = %b0 + %v5 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 2), align 2, !tbaa !0 + %v6 = icmp eq i8 %v5, %a0 + %v7 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 2), align 2, !tbaa !0 + %v8 = icmp eq i8 %v7, %a0 + %v9 = and i1 %v6, %v8 + br i1 %v9, label %b3, label %b4 + +b3: ; preds = %b2 + %v10 = getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 3, i32 %v2 + %v11 = load i32, i32* %v10, align 4, !tbaa !3 + %v12 = 
getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 3, i32 %v2 + %v13 = load i32, i32* %v12, align 4, !tbaa !3 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b4: ; preds = %b2 + %v14 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 0), align 8, !tbaa !0 + %v15 = icmp eq i8 %v14, 1 + %v16 = and i1 %v15, %v6 + br i1 %v16, label %b5, label %b8 + +b5: ; preds = %b4 + store i8 0, i8* %a2, align 1, !tbaa !0 + %v17 = getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 3, i32 %v2 + %v18 = tail call i32 asm sideeffect "1: $0 = memw_locked($2)\0A $0 = add($0, $3)\0A memw_locked($2, p0) = $0\0A if !p0 jump 1b\0A", "=&r,=*m,r,r,*m,~{p0}"(i32* %v17, i32* %v17, i32 1, i32* %v17) #0, !srcloc !5 + %v19 = load i32, i32* %v17, align 4, !tbaa !3 + %v20 = icmp eq i32 %v19, 255 + br i1 %v20, label %b6, label %b7 + +b6: ; preds = %b5 + tail call void @f2(%s.3* @g4, i32 %v2) #2 + unreachable + +b7: ; preds = %b5 + store i8 %a1, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 0, i32 1), align 1, !tbaa !0 + %v21 = load i8, i8* %a2, align 1, !tbaa !0 + %v22 = zext i8 %v21 to i32 + tail call void @f3(%s.3* @g7, i32 %v2, i32 %v22) #0 + %v23 = load i32, i32* bitcast ([2 x %s.0]* @g0 to i32*), align 8 + %v24 = and i32 %v23, 255 + %v25 = lshr i32 %v23, 8 + %v26 = and i32 %v25, 255 + %v27 = lshr i32 %v23, 16 + %v28 = and i32 %v27, 255 + %v29 = load i32, i32* %v17, align 4, !tbaa !3 + tail call void @f4(%s.3* @g9, i32 %v24, i32 %v26, i32 %v28, i32 %v29) #0 + %v30 = load i8, i8* %a2, align 1, !tbaa !0 + %v31 = zext i8 %v30 to i32 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + %v32 = load i32, i32* bitcast ([2 x %s.0]* @g0 to i32*), align 8 + %v33 = and i32 %v32, 255 + %v34 = lshr i32 %v32, 8 + %v35 = and i32 %v34, 255 + %v36 = lshr i32 %v32, 16 + %v37 = and i32 %v36, 255 + %v38 = load i32, i32* %v17, align 4, !tbaa !3 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b8: ; preds = %b4 + %v39 = load i8, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 0), align 4, !tbaa !0 + %v40 = icmp eq i8 %v39, 1 + %v41 = and i1 %v40, %v8 + br i1 %v41, label %b9, label %b12 + +b9: ; preds = %b8 + store i8 1, i8* %a2, align 1, !tbaa !0 + %v42 = getelementptr inbounds [2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 3, i32 %v2 + %v43 = tail call i32 asm sideeffect "1: $0 = memw_locked($2)\0A $0 = add($0, $3)\0A memw_locked($2, p0) = $0\0A if !p0 jump 1b\0A", "=&r,=*m,r,r,*m,~{p0}"(i32* %v42, i32* %v42, i32 1, i32* %v42) #0, !srcloc !5 + %v44 = load i32, i32* %v42, align 4, !tbaa !3 + %v45 = icmp eq i32 %v44, 255 + br i1 %v45, label %b10, label %b11 + +b10: ; preds = %b9 + tail call void @f2(%s.3* @g13, i32 %v2) #2 + unreachable + +b11: ; preds = %b9 + store i8 %a1, i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 1), align 1, !tbaa !0 + %v46 = load i8, i8* %a2, align 1, !tbaa !0 + %v47 = zext i8 %v46 to i32 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + %v48 = load i32, i32* bitcast (i8* getelementptr inbounds ([2 x %s.0], [2 x %s.0]* @g0, i32 0, i32 1, i32 0) to i32*), align 4 + %v49 = and i32 %v48, 255 + %v50 = lshr i32 %v48, 8 + %v51 = and i32 %v50, 255 + %v52 = lshr i32 %v48, 16 + %v53 = and i32 %v52, 255 + %v54 = load i32, i32* %v42, align 4, !tbaa !3 + tail call void @f1(%s.1* @g2, i32 2, i32 %v0, i32 %v2) + br label %b12 + +b12: ; preds = %b11, %b8, %b7, %b3, %b1 + %v55 = phi i8 [ 0, %b1 ], [ 0, %b3 ], [ 1, %b7 ], 
[ 1, %b11 ], [ 0, %b8 ] + ret i8 %v55 +} + +declare void @f1(%s.1*, i32, i32, i32) + +; Function Attrs: noreturn +declare void @f2(%s.3*, i32) #1 + +declare void @f3(%s.3*, i32, i32) + +declare void @f4(%s.3*, i32, i32, i32, i32) + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } +attributes #1 = { noreturn } +attributes #2 = { noreturn nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"omnipotent char", !2} +!2 = !{!"Simple C/C++ TBAA"} +!3 = !{!4, !4, i64 0} +!4 = !{!"long", !1} +!5 = !{i32 86170, i32 86211, i32 86247, i32 86291} diff --git a/test/CodeGen/Hexagon/late_instr.ll b/test/CodeGen/Hexagon/late_instr.ll new file mode 100644 index 00000000000..c21e0140ca0 --- /dev/null +++ b/test/CodeGen/Hexagon/late_instr.ll @@ -0,0 +1,231 @@ +; RUN: llc -march=hexagon -disable-hsdr < %s | FileCheck %s + +; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets. +; CHECK: or(q{{[0-3]+}},q{{[0-3]+}}) +; CHECK: } +; CHECK-NOT: } +; CHECK: |= vand(q{{[0-3]+}},r{{[0-9]+}}) +; CHECK: endloop0 + +target triple = "hexagon-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @f0(i8* noalias nocapture readonly %a0, i32 %a1, i32 %a2, i32 %a3, i32* noalias nocapture %a4, i32 %a5) #0 { +b0: + %v0 = mul i32 %a2, 3 + %v1 = bitcast i32* %a4 to <16 x i32>* + %v2 = mul i32 %a5, -2 + %v3 = add i32 %v2, %a1 + %v4 = and i32 %a5, 63 + %v5 = add i32 %v3, %v4 + %v6 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 -1) + %v7 = lshr i32 %v5, 6 + %v8 = and i32 %v7, 7 + %v9 = and i32 %v5, 511 + %v10 = icmp eq i32 %v9, 0 + %v11 = shl i32 -1, %v8 + %v12 = select i1 %v10, i32 0, i32 %v11 + %v13 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %v12) + %v14 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 %v13) + %v15 = tail call <16 x i32> @llvm.hexagon.V6.vnot(<16 x i32> %v14) + %v16 = tail call <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32 %v5) + %v17 = shl i32 1, %v8 + %v18 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %v17) + %v19 = tail call <16 x i32> @llvm.hexagon.V6.vandqrt.acc(<16 x i32> %v15, <512 x i1> %v16, i32 %v18) + %v20 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %a3) + %v21 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 %v20) + %v22 = icmp sgt i32 %v5, 0 + br i1 %v22, label %b1, label %b8 + +b1: ; preds = %b0 + %v23 = getelementptr inbounds i8, i8* %a0, i32 %a5 + %v24 = bitcast i8* %v23 to <16 x i32>* + %v25 = load <16 x i32>, <16 x i32>* %v24, align 64, !tbaa !0 + %v26 = add i32 %a5, 64 + %v27 = getelementptr inbounds i8, i8* %a0, i32 %v26 + %v28 = bitcast i8* %v27 to <16 x i32>* + %v29 = add i32 %a5, -64 + %v30 = getelementptr inbounds i8, i8* %a0, i32 %v29 + %v31 = bitcast i8* %v30 to <16 x i32>* + %v32 = load <16 x i32>, <16 x i32>* %v31, align 64, !tbaa !0 + %v33 = tail call <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32 %a5) + %v34 = tail call <16 x i32> @llvm.hexagon.V6.vandqrt(<512 x i1> %v33, i32 16843009) + %v35 = tail call <16 x i32> @llvm.hexagon.V6.vnot(<16 x i32> %v34) + %v36 = add i32 %v0, %a5 + %v37 = getelementptr inbounds i8, i8* %a0, i32 %v36 + %v38 = bitcast i8* %v37 to <16 x i32>* + %v39 = sub i32 %a5, %v0 + %v40 = getelementptr inbounds i8, i8* %a0, i32 %v39 + %v41 = bitcast i8* %v40 to <16 x i32>* + %v42 = tail call <16 x i32> @llvm.hexagon.V6.vd0() + %v43 = add i32 %v4, %a1 + %v44 = mul i32 %a5, 2 + %v45 = sub i32 %v43, %v44 + %v46 = xor i32 %v45, -1 + %v47 = icmp sgt i32 %v46, -513 + %v48 = select i1 %v47, i32 %v46, i32 -513 + %v49 = add i32 %v48, %a1 + %v50 = add i32 %v49, %v4 + %v51 = add i32 %v50, 512 + %v52 = sub i32 %v51, 
%v44 + %v53 = lshr i32 %v52, 9 + %v54 = mul nuw nsw i32 %v53, 16 + %v55 = add nuw nsw i32 %v54, 16 + %v56 = getelementptr i32, i32* %a4, i32 %v55 + br label %b2 + +b2: ; preds = %b6, %b1 + %v57 = phi i32 [ %v46, %b1 ], [ %v125, %b6 ] + %v58 = phi i32 [ %v5, %b1 ], [ %v123, %b6 ] + %v59 = phi <16 x i32>* [ %v1, %b1 ], [ %v122, %b6 ] + %v60 = phi <16 x i32>* [ %v38, %b1 ], [ %v114, %b6 ] + %v61 = phi <16 x i32>* [ %v41, %b1 ], [ %v115, %b6 ] + %v62 = phi <16 x i32>* [ %v28, %b1 ], [ %v116, %b6 ] + %v63 = phi i32 [ 512, %b1 ], [ %v69, %b6 ] + %v64 = phi i32 [ -2139062144, %b1 ], [ %v117, %b6 ] + %v65 = phi <16 x i32> [ %v32, %b1 ], [ %v118, %b6 ] + %v66 = phi <16 x i32> [ %v25, %b1 ], [ %v119, %b6 ] + %v67 = phi <16 x i32> [ %v35, %b1 ], [ %v6, %b6 ] + %v68 = icmp slt i32 %v58, %v63 + %v69 = select i1 %v68, i32 %v58, i32 %v63 + %v70 = icmp sgt i32 %v69, 0 + br i1 %v70, label %b3, label %b6 + +b3: ; preds = %b2 + %v71 = xor i32 %v63, -1 + %v72 = icmp sgt i32 %v57, %v71 + %v73 = select i1 %v72, i32 %v57, i32 %v71 + %v74 = icmp sgt i32 %v73, -65 + %v75 = add i32 %v73, 63 + %v76 = select i1 %v74, i32 %v75, i32 -2 + %v77 = sub i32 %v76, %v73 + %v78 = lshr i32 %v77, 6 + br label %b4 + +b4: ; preds = %b4, %b3 + %v79 = phi i32 [ %v69, %b3 ], [ %v108, %b4 ] + %v80 = phi <16 x i32>* [ %v60, %b3 ], [ %v89, %b4 ] + %v81 = phi <16 x i32>* [ %v61, %b3 ], [ %v87, %b4 ] + %v82 = phi <16 x i32>* [ %v62, %b3 ], [ %v92, %b4 ] + %v83 = phi i32 [ %v64, %b3 ], [ %v106, %b4 ] + %v84 = phi <16 x i32> [ %v65, %b3 ], [ %v85, %b4 ] + %v85 = phi <16 x i32> [ %v66, %b3 ], [ %v93, %b4 ] + %v86 = phi <16 x i32> [ %v42, %b3 ], [ %v107, %b4 ] + %v87 = getelementptr inbounds <16 x i32>, <16 x i32>* %v81, i32 1 + %v88 = load <16 x i32>, <16 x i32>* %v81, align 64, !tbaa !0 + %v89 = getelementptr inbounds <16 x i32>, <16 x i32>* %v80, i32 1 + %v90 = load <16 x i32>, <16 x i32>* %v80, align 64, !tbaa !0 + %v91 = tail call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %v85, <16 x i32> %v84, i32 3) + %v92 = getelementptr inbounds <16 x i32>, <16 x i32>* %v82, i32 1 + %v93 = load <16 x i32>, <16 x i32>* %v82, align 64, !tbaa !0 + %v94 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %v93, <16 x i32> %v85, i32 3) + %v95 = tail call <16 x i32> @llvm.hexagon.V6.vsububsat(<16 x i32> %v85, <16 x i32> %v21) + %v96 = tail call <16 x i32> @llvm.hexagon.V6.vaddubsat(<16 x i32> %v85, <16 x i32> %v21) + %v97 = tail call <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32> %v88, <16 x i32> %v90) + %v98 = tail call <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32> %v88, <16 x i32> %v90) + %v99 = tail call <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32> %v94, <16 x i32> %v91) + %v100 = tail call <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32> %v94, <16 x i32> %v91) + %v101 = tail call <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32> %v97, <16 x i32> %v99) + %v102 = tail call <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32> %v98, <16 x i32> %v100) + %v103 = tail call <512 x i1> @llvm.hexagon.V6.vgtub(<16 x i32> %v101, <16 x i32> %v96) + %v104 = tail call <512 x i1> @llvm.hexagon.V6.vgtub(<16 x i32> %v95, <16 x i32> %v102) + %v105 = tail call <512 x i1> @llvm.hexagon.V6.pred.or(<512 x i1> %v103, <512 x i1> %v104) + %v106 = tail call i32 @llvm.hexagon.S6.rol.i.r(i32 %v83, i32 1) + %v107 = tail call <16 x i32> @llvm.hexagon.V6.vandqrt.acc(<16 x i32> %v86, <512 x i1> %v105, i32 %v106) + %v108 = add nsw i32 %v79, -64 + %v109 = icmp sgt i32 %v79, 64 + br i1 %v109, label %b4, label %b5 + +b5: ; preds = %b4 + %v110 = add nuw nsw i32 %v78, 1 + %v111 = 
getelementptr <16 x i32>, <16 x i32>* %v62, i32 %v110 + %v112 = getelementptr <16 x i32>, <16 x i32>* %v60, i32 %v110 + %v113 = getelementptr <16 x i32>, <16 x i32>* %v61, i32 %v110 + br label %b6 + +b6: ; preds = %b5, %b2 + %v114 = phi <16 x i32>* [ %v112, %b5 ], [ %v60, %b2 ] + %v115 = phi <16 x i32>* [ %v113, %b5 ], [ %v61, %b2 ] + %v116 = phi <16 x i32>* [ %v111, %b5 ], [ %v62, %b2 ] + %v117 = phi i32 [ %v106, %b5 ], [ %v64, %b2 ] + %v118 = phi <16 x i32> [ %v85, %b5 ], [ %v65, %b2 ] + %v119 = phi <16 x i32> [ %v93, %b5 ], [ %v66, %b2 ] + %v120 = phi <16 x i32> [ %v107, %b5 ], [ %v42, %b2 ] + %v121 = tail call <16 x i32> @llvm.hexagon.V6.vand(<16 x i32> %v120, <16 x i32> %v67) + %v122 = getelementptr inbounds <16 x i32>, <16 x i32>* %v59, i32 1 + store <16 x i32> %v121, <16 x i32>* %v59, align 64, !tbaa !0 + %v123 = add nsw i32 %v58, -512 + %v124 = icmp sgt i32 %v58, 512 + %v125 = add i32 %v57, 512 + br i1 %v124, label %b2, label %b7 + +b7: ; preds = %b6 + %v126 = bitcast i32* %v56 to <16 x i32>* + br label %b8 + +b8: ; preds = %b7, %b0 + %v127 = phi <16 x i32>* [ %v126, %b7 ], [ %v1, %b0 ] + %v128 = getelementptr inbounds <16 x i32>, <16 x i32>* %v127, i32 -1 + %v129 = load <16 x i32>, <16 x i32>* %v128, align 64, !tbaa !0 + %v130 = tail call <16 x i32> @llvm.hexagon.V6.vand(<16 x i32> %v129, <16 x i32> %v19) + store <16 x i32> %v130, <16 x i32>* %v128, align 64, !tbaa !0 + ret void +} + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vnot(<16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vandqrt(<512 x i1>, i32) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.vsplatrb(i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vandqrt.acc(<16 x i32>, <512 x i1>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vd0() #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vsububsat(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vaddubsat(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vmaxub(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vminub(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.vgtub(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.pred.or(<512 x i1>, <512 x i1>) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S6.rol.i.r(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vand(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" } +attributes #1 = { nounwind readnone } + +!0 = !{!1, !1, i64 0} +!1 = !{!"omnipotent char", !2, i64 0} +!2 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/mlong-calls.ll b/test/CodeGen/Hexagon/mlong-calls.ll new file mode 100644 index 
00000000000..d76b87f987f --- /dev/null +++ b/test/CodeGen/Hexagon/mlong-calls.ll @@ -0,0 +1,41 @@ +; RUN: llc -hexagon-long-calls -march=hexagon -enable-save-restore-long=true < %s | FileCheck %s + +; CHECK: call ##f1 +; CHECK: jump ##__restore + +; Function Attrs: minsize nounwind +define i64 @f0(i32 %a0, i32 %a1) #0 { +b0: + %v0 = add nsw i32 %a0, 5 + %v1 = tail call i64 @f1(i32 %v0) #1 + %v2 = sext i32 %a1 to i64 + %v3 = add nsw i64 %v1, %v2 + ret i64 %v3 +} + +; Function Attrs: minsize nounwind +declare i64 @f1(i32) #0 + +; Function Attrs: nounwind +define i64 @f2(i32 %a0, i32 %a1) #1 { +b0: + %v0 = add nsw i32 %a0, 5 + %v1 = tail call i64 @f1(i32 %v0) #1 + ret i64 %v1 +} + +; Function Attrs: noreturn nounwind +define i64 @f3(i32 %a0, i32 %a1) #2 { +b0: + %v0 = add nsw i32 %a0, 5 + %v1 = tail call i64 @f4(i32 %v0) #2 + unreachable +} + +; Function Attrs: noreturn +declare i64 @f4(i32) #3 + +attributes #0 = { minsize nounwind } +attributes #1 = { nounwind } +attributes #2 = { noreturn nounwind } +attributes #3 = { noreturn } diff --git a/test/CodeGen/Hexagon/simplify64bitops_7223.ll b/test/CodeGen/Hexagon/simplify64bitops_7223.ll new file mode 100644 index 00000000000..56093c1d00c --- /dev/null +++ b/test/CodeGen/Hexagon/simplify64bitops_7223.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner < %s +; REQUIRES: asserts +; CHECK-NOT: and( +; CHECK-NOT: or( +; CHECK-NOT: combine(0 +; CHECK: add +; CHECK: add( +; CHECK-NEXT: memuh( +; CHECK-NEXT: endloop + +%s.22 = type { i64 } + +@g0 = common global i32 0, align 4 + +; Function Attrs: nounwind +define i64 @f0(%s.22* nocapture %a0, i32 %a1) #0 { +b0: + %v0 = bitcast %s.22* %a0 to i16* + %v1 = load i16, i16* %v0, align 2, !tbaa !0 + %v2 = zext i16 %v1 to i64 + %v3 = icmp sgt i32 %a1, 0 + br i1 %v3, label %b1, label %b4 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v4 = phi i16* [ %v8, %b2 ], [ %v0, %b1 ] + %v5 = phi i32 [ %v10, %b2 ], [ undef, %b1 ] + %v6 = phi i32 [ %v15, %b2 ], [ 0, %b1 ] + %v7 = phi i64 [ %v14, %b2 ], [ %v2, %b1 ] + %v8 = getelementptr inbounds i16, i16* %v4, i32 1 + %v9 = trunc i64 %v7 to i32 + %v10 = add i32 %v5, %v9 + %v11 = load i16, i16* %v8, align 2, !tbaa !0 + %v12 = zext i16 %v11 to i64 + %v13 = and i64 %v7, -4294967296 + %v14 = or i64 %v12, %v13 + %v15 = add nsw i32 %v6, 1 + %v16 = icmp eq i32 %v15, %a1 + br i1 %v16, label %b3, label %b2 + +b3: ; preds = %b2 + br label %b4 + +b4: ; preds = %b3, %b0 + %v17 = phi i32 [ undef, %b0 ], [ %v10, %b3 ] + %v18 = phi i64 [ %v2, %b0 ], [ %v14, %b3 ] + store volatile i32 %v17, i32* @g0, align 4, !tbaa !4 + ret i64 %v18 +} + +attributes #0 = { nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"short", !2} +!2 = !{!"omnipotent char", !3} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"long", !2} diff --git a/test/CodeGen/Hexagon/swp-carried-1.ll b/test/CodeGen/Hexagon/swp-carried-1.ll new file mode 100644 index 00000000000..e0aff5cb28c --- /dev/null +++ b/test/CodeGen/Hexagon/swp-carried-1.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched < %s | FileCheck %s + +; Test that we generate the correct code when a loop carried value +; is scheduled one stage earlier than it's use. The code in +; isLoopCarried was returning false in this case, and the generated +; code was missing an copy. 
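+
+; Concretely, in the IR below %v6 is a phi that takes the previous
+; iteration's value of %v5 (itself a phi fed by a load), and %v6 is
+; consumed, via %v12, by the multiply and add that feed the accumulator.
+; Because that use lands a stage later than the load, the kernel must
+; carry the value in a register copy rather than re-load it, which is
+; what the CHECK lines verify: [[REG0]] is refreshed by a register copy
+; and never by a memw load.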
+ +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: += mpy([[REG0:(r[0-9]+)]],r{{[0-9]+}}) +; CHECK: [[REG0]] = r{{[0-9]+}} +; CHECK-NOT: [[REG0]] = memw +; CHECK: endloop0 + +@g0 = external global [256 x i32], align 8 + +define void @f0() #0 { +b0: + br label %b1 + +b1: ; preds = %b1, %b0 + br i1 undef, label %b2, label %b1 + +b2: ; preds = %b1 + br label %b3 + +b3: ; preds = %b3, %b2 + %v0 = phi i32* [ getelementptr inbounds ([256 x i32], [256 x i32]* @g0, i32 0, i32 0), %b2 ], [ %v1, %b3 ] + %v1 = getelementptr i32, i32* %v0, i32 6 + br i1 undef, label %b4, label %b3 + +b4: ; preds = %b3 + br i1 undef, label %b6, label %b5 + +b5: ; preds = %b5, %b4 + %v2 = phi i64 [ %v19, %b5 ], [ undef, %b4 ] + %v3 = phi i32* [ %v8, %b5 ], [ %v1, %b4 ] + %v4 = phi i32 [ %v9, %b5 ], [ undef, %b4 ] + %v5 = phi i32 [ %v11, %b5 ], [ undef, %b4 ] + %v6 = phi i32 [ %v5, %b5 ], [ undef, %b4 ] + %v7 = phi i32 [ %v10, %b5 ], [ 0, %b4 ] + %v8 = getelementptr i32, i32* %v3, i32 1 + %v9 = add nsw i32 %v4, 1 + %v10 = load i32, i32* %v8, align 4 + %v11 = load i32, i32* null, align 4 + %v12 = sext i32 %v6 to i64 + %v13 = sext i32 %v10 to i64 + %v14 = sext i32 %v7 to i64 + %v15 = mul nsw i64 %v14, %v12 + %v16 = add i64 %v12, %v2 + %v17 = add i64 %v16, %v13 + %v18 = add i64 %v17, 0 + %v19 = add i64 %v18, %v15 + %v20 = icmp eq i32 %v9, 128 + br i1 %v20, label %b6, label %b5 + +b6: ; preds = %b5, %b4 + %v21 = phi i64 [ undef, %b4 ], [ %v19, %b5 ] + unreachable +} + +attributes #0 = { nounwind "target-cpu"="hexagonv62" } diff --git a/test/CodeGen/Hexagon/swp-change-deps.ll b/test/CodeGen/Hexagon/swp-change-deps.ll new file mode 100644 index 00000000000..cf9dc79ad69 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-change-deps.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Test that we generate the correct offsets for loads in the prolog +; after removing dependences on a post-increment instructions of the +; base register. + +; CHECK: memh([[REG0:(r[0-9]+)]]+#0) +; CHECK: memh([[REG0]]+#2) +; CHECK: loop0 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.A2.sath(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.A2.asrh(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.A2.addsat(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32, i32) #1 + +define void @f0() #0 align 2 { +b0: + br label %b1 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v0 = phi i16* [ undef, %b1 ], [ %v14, %b2 ] + %v1 = phi i32 [ 0, %b1 ], [ %v12, %b2 ] + %v2 = load i16, i16* %v0, align 2 + %v3 = sext i16 %v2 to i32 + %v4 = call i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32 undef, i32 %v3) + %v5 = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v4, i32 undef) + %v6 = call i32 @llvm.hexagon.A2.addsat(i32 %v5, i32 32768) + %v7 = call i32 @llvm.hexagon.A2.asrh(i32 %v6) + %v8 = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v7, i32 undef) + %v9 = call i32 @llvm.hexagon.A2.sath(i32 %v8) + %v10 = trunc i32 %v9 to i16 + store i16 %v10, i16* null, align 2 + %v11 = trunc i32 %v7 to i16 + store i16 %v11, i16* %v0, align 2 + %v12 = add nsw i32 %v1, 1 + %v13 = icmp slt i32 %v12, undef + %v14 = getelementptr i16, i16* %v0, i32 1 + br i1 %v13, label %b2, label %b3 + +b3: ; preds = %b2 + unreachable + +b4: ; No predecessors! 
+ unreachable +} + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/swp-epilog-numphis.ll b/test/CodeGen/Hexagon/swp-epilog-numphis.ll new file mode 100644 index 00000000000..0af6c5a327d --- /dev/null +++ b/test/CodeGen/Hexagon/swp-epilog-numphis.ll @@ -0,0 +1,82 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK: endloop0 +; CHECK: vmem +; CHECK: vmem([[REG:r([0-9]+)]]+#1) = +; CHECK: vmem([[REG]]+#0) = + +define void @f0(i32 %a0) local_unnamed_addr #0 { +b0: + br label %b1 + +b1: ; preds = %b1, %b0 + %v0 = phi i32 [ %v33, %b1 ], [ %a0, %b0 ] + %v1 = phi <16 x i32>* [ %v32, %b1 ], [ undef, %b0 ] + %v2 = phi <16 x i32>* [ %v23, %b1 ], [ undef, %b0 ] + %v3 = phi <16 x i32>* [ %v10, %b1 ], [ undef, %b0 ] + %v4 = phi <16 x i32>* [ %v8, %b1 ], [ null, %b0 ] + %v5 = phi <32 x i32> [ %v12, %b1 ], [ undef, %b0 ] + %v6 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v5) + %v7 = tail call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %v6, <16 x i32> undef, i32 6) + %v8 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 1 + %v9 = load <16 x i32>, <16 x i32>* %v4, align 64 + %v10 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 1 + %v11 = load <16 x i32>, <16 x i32>* %v3, align 64 + %v12 = tail call <32 x i32> @llvm.hexagon.V6.vsububh(<16 x i32> %v11, <16 x i32> %v9) + %v13 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v12) + %v14 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v13, <16 x i32> undef) + %v15 = tail call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %v14, <16 x i32> undef, i32 4) + %v16 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v14, <16 x i32> %v15) + %v17 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %v14, <16 x i32> undef, i32 4) + %v18 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %v16, <16 x i32> undef, i32 2) + %v19 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> undef, <16 x i32> %v17) + %v20 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v18, <16 x i32> %v19) + %v21 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 1 + %v22 = load <16 x i32>, <16 x i32>* %v2, align 64 + %v23 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 2 + %v24 = load <16 x i32>, <16 x i32>* %v21, align 64 + %v25 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v22, <16 x i32> %v7) + %v26 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v24, <16 x i32> undef) + %v27 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v25, <16 x i32> %v20) + %v28 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v26, <16 x i32> %v20) + store <16 x i32> %v27, <16 x i32>* %v2, align 64 + store <16 x i32> %v28, <16 x i32>* %v21, align 64 + %v29 = tail call <16 x i32> @llvm.hexagon.V6.vmpyhsrs(<16 x i32> %v27, i32 17760527) + %v30 = tail call <16 x i32> @llvm.hexagon.V6.vmpyhsrs(<16 x i32> %v28, i32 17760527) + %v31 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v30, <16 x i32> %v29) + %v32 = getelementptr inbounds <16 x i32>, <16 x i32>* %v1, i32 1 + store <16 x i32> %v31, <16 x i32>* %v1, align 64 + %v33 = add nsw i32 %v0, -64 + %v34 = icmp sgt i32 %v0, 192 + br i1 %v34, label %b1, label %b2 + +b2: ; preds = %b1 + unreachable +} + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vsububh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: 
nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vmpyhsrs(<16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv65" "target-features"="+hvxv65,+hvx-length64b" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/swp-epilog-phi9.ll b/test/CodeGen/Hexagon/swp-epilog-phi9.ll new file mode 100644 index 00000000000..db92a33b559 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-epilog-phi9.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Test that we generate the correct Phi name in the last couple of epilog +; blocks, when there are 3 epilog blocks. The Phi was scheduled in stage +; 2, so the computation for the number of Phis needs to be adjusted when +; the incoming prolog block is from prolog 0 or prolog 1. +; Note: the pipeliner no longer generates a 3 stage pipeline for this test. + +; CHECK: loop0 +; CHECK: [[REG0:r([0-9]+)]] = add(r{{[0-8]+}},#8) +; CHECK: endloop0 +; CHECK: [[REG0]] = add(r{{[0-9]+}},#8) + +; Function Attrs: nounwind +define void @f0(i16* nocapture readonly %a0) #0 { +b0: + %v0 = alloca [129 x i32], align 8 + br i1 undef, label %b1, label %b3 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ] + %v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ] + %v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ] + %v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ] + %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ] + %v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ] + %v7 = load i16, i16* %v2, align 2 + %v8 = sext i16 %v7 to i32 + %v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2 + %v10 = load i16, i16* %v6, align 2 + %v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2 + store i32 %v11, i32* %v4, align 4 + %v12 = getelementptr inbounds i16, i16* %v6, i32 -1 + %v13 = add i32 %v5, 1 + %v14 = getelementptr inbounds i32, i32* %v3, i32 2 + %v15 = getelementptr inbounds i16, i16* %v1, i32 2 + %v16 = icmp slt i32 %v13, undef + br i1 %v16, label %b2, label %b3 + +b3: ; preds = %b2, %b0 + unreachable +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } diff --git a/test/CodeGen/Hexagon/swp-phi-ref.ll b/test/CodeGen/Hexagon/swp-phi-ref.ll new file mode 100644 index 00000000000..1b6def17bd9 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-phi-ref.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s | FileCheck %s + +; Test that we generate the correct Phi values when there is a Phi that +; references another Phi. We need to examine the other Phi to get the +; correct value. We need to do this even if we haven't generated the +; kernel code for the other Phi yet. 
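+
+; In the IR below, %v2 takes the previous iteration's value of %v1, and
+; %v1 is itself a phi (fed by %v4 inside the loop).  Resolving %v2 means
+; looking through %v1, and the expected code keeps the older value alive
+; with the vector copies matched below: one before loop0 and one inside
+; the kernel.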
+ +; CHECK: [[REG0:(v[0-9]+)]] = [[REG1:(v[0-9]+)]] +; CHECK: loop0 +; CHECK: [[REG0]] = [[REG1]] +; CHECK: endloop0 + +; Function Attrs: nounwind +define void @f0() #0 { +b0: + br i1 undef, label %b1, label %b2 + +b1: ; preds = %b1, %b0 + %v0 = phi i32 [ %v7, %b1 ], [ 0, %b0 ] + %v1 = phi <16 x i32> [ %v4, %b1 ], [ undef, %b0 ] + %v2 = phi <16 x i32> [ %v1, %b1 ], [ undef, %b0 ] + %v3 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v1, <16 x i32> %v2, i32 62) + %v4 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> undef, <16 x i32> undef) + %v5 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v4, <16 x i32> %v1, i32 2) + %v6 = tail call <16 x i32> @llvm.hexagon.V6.vabsdiffh(<16 x i32> %v3, <16 x i32> %v5) + store <16 x i32> %v6, <16 x i32>* null, align 64 + %v7 = add nsw i32 %v0, 1 + %v8 = icmp slt i32 %v7, undef + br i1 %v8, label %b1, label %b2 + +b2: ; preds = %b1, %b0 + ret void +} + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vabsdiffh(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/swp-phi-start.ll b/test/CodeGen/Hexagon/swp-phi-start.ll new file mode 100644 index 00000000000..0e451f924a9 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-phi-start.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 -disable-packetizer < %s | FileCheck %s + +; Test that the early start and late start values are computed correctly +; when a Phi depends on another Phi. In this case, they should occur in +; the same stage. + +; CHECK-DAG: [[REG3:(r[0-9]+)]] = add([[REG1:(r[0-9]+)]],#-1) +; CHECK-DAG: [[REG2:(r[0-9]+)]] = add([[REG1]],#-1) +; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],[[REG3]]) +; CHECK-NOT: = [[REG2]] +; CHECK: .LBB0_[[LOOP]]: +; CHECK: }{{[ \t]*}}:endloop + +; Function Attrs: nounwind +define void @f0(i32 %a0, i16* nocapture %a1) #0 { +b0: + br i1 undef, label %b1, label %b2 + +b1: ; preds = %b0 + %v0 = add nsw i32 undef, -8 + br i1 undef, label %b3, label %b2 + +b2: ; preds = %b2, %b1, %b0 + %v1 = phi i32 [ %v7, %b2 ], [ undef, %b0 ], [ %v0, %b1 ] + %v2 = phi i32 [ %v1, %b2 ], [ %a0, %b0 ], [ undef, %b1 ] + %v3 = add nsw i32 %v2, -2 + %v4 = getelementptr inbounds i16, i16* %a1, i32 %v3 + %v5 = load i16, i16* %v4, align 2, !tbaa !0 + %v6 = getelementptr inbounds i16, i16* %a1, i32 %v1 + store i16 %v5, i16* %v6, align 2, !tbaa !0 + %v7 = add nsw i32 %v1, -1 + %v8 = icmp sgt i32 %v7, 0 + br i1 %v8, label %b2, label %b3 + +b3: ; preds = %b2, %b1 + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } + +!0 = !{!1, !1, i64 0} +!1 = !{!"short", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/swp-rename.ll b/test/CodeGen/Hexagon/swp-rename.ll new file mode 100644 index 00000000000..7f5060134f4 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-rename.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s + +; A test that the Phi rewrite logic is correct. 
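+
+; Both stores in the loop go through the pointer phi %v1, which starts at
+; null and advances by two i16 elements per iteration.  After pipelining,
+; the rewritten phi uses must still be based on that pointer, hence the
+; checks below expect a base register initialized to #0 and a halfword
+; store through it inside the loop.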
+ +; CHECK: [[REG0:(r[0-9]+)]] = #0 +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: memh([[REG0]]+#0) = #0 + +define void @f0() #0 { +b0: + %v0 = add i32 undef, -4 + br label %b1 + +b1: ; preds = %b1, %b0 + %v1 = phi i16* [ %v4, %b1 ], [ null, %b0 ] + %v2 = phi i32 [ %v5, %b1 ], [ 0, %b0 ] + %v3 = getelementptr inbounds i16, i16* %v1, i32 1 + store i16 0, i16* %v1, align 2 + %v4 = getelementptr inbounds i16, i16* %v1, i32 2 + store i16 0, i16* %v3, align 2 + %v5 = add nsw i32 %v2, 8 + %v6 = icmp slt i32 %v5, %v0 + br i1 %v6, label %b1, label %b2 + +b2: ; preds = %b1 + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } diff --git a/test/CodeGen/Hexagon/swp-xxh2.ll b/test/CodeGen/Hexagon/swp-xxh2.ll new file mode 100644 index 00000000000..55f39e263d5 --- /dev/null +++ b/test/CodeGen/Hexagon/swp-xxh2.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s +; REQUIRES: asserts + +; Fix bug when pipelining xxh benchmark at O3, mv55, and with vectorization. +; The problem is choosing the correct name for the Phis in the epilog. + +; CHECK: New block +; CHECK: %{{.*}}, %[[REG:([0-9]+)]]{{.*}} = L2_loadri_pi +; CHECK: epilog: +; CHECK: = PHI +; CHECK-NOT: = PHI %{{[0-9]+}}, {{.*}}, %[[REG]] +; CHECK: = PHI + +; Function Attrs: nounwind +define void @f0(i32 %a0, i32* %a1) #0 { +b0: + %v0 = ashr i32 %a0, 1 + br label %b1 + +b1: ; preds = %b1, %b0 + %v1 = phi i64 [ %v8, %b1 ], [ undef, %b0 ] + %v2 = phi i32 [ %v9, %b1 ], [ 0, %b0 ] + %v3 = phi i32 [ %v7, %b1 ], [ undef, %b0 ] + %v4 = inttoptr i32 %v3 to i32* + %v5 = load i32, i32* %v4, align 4, !tbaa !0 + %v6 = tail call i64 @llvm.hexagon.S2.packhl(i32 %v5, i32 undef) + %v7 = add nsw i32 %v3, -16 + %v8 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %v1, i64 undef, i64 %v6) + %v9 = add nsw i32 %v2, 1 + %v10 = icmp eq i32 %v9, %v0 + br i1 %v10, label %b2, label %b1 + +b2: ; preds = %b1 + %v11 = trunc i64 %v8 to i32 + %v12 = getelementptr inbounds i32, i32* %a1, i32 8 + store i32 %v11, i32* %v12, align 4, !tbaa !0 + call void @llvm.trap() + unreachable +} + +; Function Attrs: nounwind readnone +declare i64 @llvm.hexagon.M2.vdmacs.s0(i64, i64, i64) #1 + +; Function Attrs: nounwind readnone +declare i64 @llvm.hexagon.S2.packhl(i32, i32) #1 + +; Function Attrs: noreturn nounwind +declare void @llvm.trap() #2 + +attributes #0 = { nounwind "target-cpu"="hexagonv55" } +attributes #1 = { nounwind readnone } +attributes #2 = { noreturn nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/vect-downscale.ll b/test/CodeGen/Hexagon/vect-downscale.ll new file mode 100644 index 00000000000..514581789e1 --- /dev/null +++ b/test/CodeGen/Hexagon/vect-downscale.ll @@ -0,0 +1,177 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Make sure we generate a hardware loop and pipeline the inner loop using +; 4 packets, which is equivalent to the hand-coded version. 
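+
+; The loop that matters is the inner one (b5 in the IR below): four vector
+; loads, four vdmpybus.acc multiply-accumulates, a vpackob pack, and a
+; vector store.  The CHECK lines count the packets between the loop label
+; and :endloop0 to confirm the whole kernel is scheduled in four packets.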
+ +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: { +; CHECK: } +; CHECK: { +; CHECK: } +; CHECK: { +; CHECK: } +; CHECK: { +; CHECK-NOT: } +; CHECK: }{{[ \t]*}}:endloop0 + +define void @f0(i8* noalias %a0, i32 %a1, i32 %a2, i32 %a3, i8* noalias nocapture %a4, i32 %a5, i32 %a6) #0 { +b0: + %v0 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 8388736) + %v1 = zext i32 %a3 to i64 + %v2 = shl nuw i64 %v1, 32 + %v3 = zext i32 %a1 to i64 + %v4 = shl nuw nsw i64 %v3, 16 + %v5 = or i64 %v4, %v2 + %v6 = or i64 %v5, 281474976710658 + tail call void asm sideeffect " l2fetch($0, $1)\0A", "r,r"(i8* %a0, i64 %v6) #2, !srcloc !0 + %v7 = tail call i32 @llvm.hexagon.S2.ct0(i32 %a6) + %v8 = add i32 %v7, 1 + %v9 = lshr i32 %a1, %v8 + %v10 = mul i32 %a6, 2 + %v11 = mul i32 %v10, %v9 + %v12 = sub i32 %a1, %v11 + %v13 = lshr i32 %v12, 1 + %v14 = tail call <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32 %v13) + %v15 = icmp eq i32 %a2, 0 + br i1 %v15, label %b11, label %b1 + +b1: ; preds = %b0 + %v16 = mul i32 %a3, 2 + %v17 = icmp eq i32 %v9, 0 + %v18 = icmp eq i32 %v11, %a1 + %v19 = icmp ugt i32 %v12, %a6 + %v20 = mul i32 %v9, 64 + %v21 = getelementptr i8, i8* %a4, i32 %v20 + %v22 = mul i32 %v9, 128 + %v23 = add i32 %v22, %a3 + %v24 = getelementptr i8, i8* %a0, i32 %v23 + %v25 = getelementptr i8, i8* %a0, i32 %v22 + br label %b2 + +b2: ; preds = %b10, %b1 + %v26 = phi i8* [ %v25, %b1 ], [ %v90, %b10 ] + %v27 = phi i8* [ %v24, %b1 ], [ %v89, %b10 ] + %v28 = phi i8* [ %v21, %b1 ], [ %v88, %b10 ] + %v29 = phi <16 x i32> [ undef, %b1 ], [ %v85, %b10 ] + %v30 = phi <16 x i32> [ undef, %b1 ], [ %v84, %b10 ] + %v31 = phi i8* [ %a0, %b1 ], [ %v86, %b10 ] + %v32 = phi i8* [ %a4, %b1 ], [ %v87, %b10 ] + %v33 = phi i32 [ 0, %b1 ], [ %v37, %b10 ] + %v34 = bitcast i8* %v26 to <16 x i32>* + %v35 = bitcast i8* %v27 to <16 x i32>* + %v36 = bitcast i8* %v28 to <16 x i32>* + %v37 = add nsw i32 %v33, 2 + %v38 = icmp ult i32 %v37, %a2 + br i1 %v38, label %b3, label %b4 + +b3: ; preds = %b2 + %v39 = getelementptr inbounds i8, i8* %v31, i32 %v16 + tail call void asm sideeffect " l2fetch($0, $1)\0A", "r,r"(i8* %v39, i64 %v6) #2, !srcloc !1 + br label %b4 + +b4: ; preds = %b3, %b2 + %v40 = bitcast i8* %v32 to <16 x i32>* + %v41 = bitcast i8* %v31 to <16 x i32>* + %v42 = getelementptr inbounds i8, i8* %v31, i32 %a3 + %v43 = bitcast i8* %v42 to <16 x i32>* + br i1 %v17, label %b6, label %b5 + +b5: ; preds = %b5, %b4 + %v44 = phi <16 x i32>* [ %v54, %b5 ], [ %v43, %b4 ] + %v45 = phi <16 x i32>* [ %v52, %b5 ], [ %v41, %b4 ] + %v46 = phi <16 x i32>* [ %v61, %b5 ], [ %v40, %b4 ] + %v47 = phi i32 [ %v62, %b5 ], [ 0, %b4 ] + %v48 = getelementptr inbounds <16 x i32>, <16 x i32>* %v45, i32 1 + %v49 = load <16 x i32>, <16 x i32>* %v45, align 64, !tbaa !2 + %v50 = getelementptr inbounds <16 x i32>, <16 x i32>* %v44, i32 1 + %v51 = load <16 x i32>, <16 x i32>* %v44, align 64, !tbaa !2 + %v52 = getelementptr inbounds <16 x i32>, <16 x i32>* %v45, i32 2 + %v53 = load <16 x i32>, <16 x i32>* %v48, align 64, !tbaa !2 + %v54 = getelementptr inbounds <16 x i32>, <16 x i32>* %v44, i32 2 + %v55 = load <16 x i32>, <16 x i32>* %v50, align 64, !tbaa !2 + %v56 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v49, i32 1077952576) + %v57 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v53, i32 1077952576) + %v58 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v56, <16 x i32> %v51, i32 1077952576) + %v59 = tail call <16 x i32> 
@llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v57, <16 x i32> %v55, i32 1077952576) + %v60 = tail call <16 x i32> @llvm.hexagon.V6.vpackob(<16 x i32> %v59, <16 x i32> %v58) + %v61 = getelementptr inbounds <16 x i32>, <16 x i32>* %v46, i32 1 + store <16 x i32> %v60, <16 x i32>* %v46, align 64, !tbaa !2 + %v62 = add nsw i32 %v47, 1 + %v63 = icmp eq i32 %v62, %v9 + br i1 %v63, label %b6, label %b5 + +b6: ; preds = %b5, %b4 + %v64 = phi <16 x i32> [ %v29, %b4 ], [ %v55, %b5 ] + %v65 = phi <16 x i32> [ %v30, %b4 ], [ %v53, %b5 ] + %v66 = phi <16 x i32>* [ %v43, %b4 ], [ %v35, %b5 ] + %v67 = phi <16 x i32>* [ %v41, %b4 ], [ %v34, %b5 ] + %v68 = phi <16 x i32>* [ %v40, %b4 ], [ %v36, %b5 ] + br i1 %v18, label %b10, label %b7 + +b7: ; preds = %b6 + %v69 = load <16 x i32>, <16 x i32>* %v67, align 64, !tbaa !2 + %v70 = load <16 x i32>, <16 x i32>* %v66, align 64, !tbaa !2 + br i1 %v19, label %b8, label %b9 + +b8: ; preds = %b7 + %v71 = getelementptr inbounds <16 x i32>, <16 x i32>* %v66, i32 1 + %v72 = getelementptr inbounds <16 x i32>, <16 x i32>* %v67, i32 1 + %v73 = load <16 x i32>, <16 x i32>* %v72, align 64, !tbaa !2 + %v74 = load <16 x i32>, <16 x i32>* %v71, align 64, !tbaa !2 + br label %b9 + +b9: ; preds = %b8, %b7 + %v75 = phi <16 x i32> [ %v73, %b8 ], [ %v65, %b7 ] + %v76 = phi <16 x i32> [ %v74, %b8 ], [ %v64, %b7 ] + %v77 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v69, i32 1077952576) + %v78 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v0, <16 x i32> %v75, i32 1077952576) + %v79 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v77, <16 x i32> %v70, i32 1077952576) + %v80 = tail call <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32> %v78, <16 x i32> %v76, i32 1077952576) + %v81 = tail call <16 x i32> @llvm.hexagon.V6.vpackob(<16 x i32> %v80, <16 x i32> %v79) + %v82 = load <16 x i32>, <16 x i32>* %v68, align 64, !tbaa !2 + %v83 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<512 x i1> %v14, <16 x i32> %v81, <16 x i32> %v82) + store <16 x i32> %v83, <16 x i32>* %v68, align 64, !tbaa !2 + br label %b10 + +b10: ; preds = %b9, %b6 + %v84 = phi <16 x i32> [ %v75, %b9 ], [ %v65, %b6 ] + %v85 = phi <16 x i32> [ %v76, %b9 ], [ %v64, %b6 ] + %v86 = getelementptr inbounds i8, i8* %v31, i32 %v16 + %v87 = getelementptr inbounds i8, i8* %v32, i32 %a5 + %v88 = getelementptr i8, i8* %v28, i32 %a5 + %v89 = getelementptr i8, i8* %v27, i32 %v16 + %v90 = getelementptr i8, i8* %v26, i32 %v16 + br i1 %v38, label %b2, label %b11 + +b11: ; preds = %b10, %b0 + ret void +} + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.ct0(i32) #1 + +; Function Attrs: nounwind readnone +declare <512 x i1> @llvm.hexagon.V6.pred.scalar2(i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vdmpybus.acc(<16 x i32>, <16 x i32>, i32) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vpackob(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vmux(<512 x i1>, <16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } + +!0 = !{i32 -2146401371} +!1 = !{i32 -2146401153} +!2 = !{!3, !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} -- 2.11.0