From 1febf59617ebd02386183e8ef2c36018f95dd703 Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Tue, 30 Jan 2018 15:40:27 +0000 Subject: [PATCH] [AArch64] Update test cases for Exynos M3 Update any test case relevant for Exynos M3. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323775 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AArch64/arm64-ldp-cluster.ll | 69 ++++++++++++++------------- test/CodeGen/AArch64/arm64-neon-2velem.ll | 59 +++++++++++++++++++++++ test/CodeGen/AArch64/machine-combiner-madd.ll | 1 + test/CodeGen/AArch64/no-quad-ldp-stp.ll | 19 ++++---- 4 files changed, 106 insertions(+), 42 deletions(-) diff --git a/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/test/CodeGen/AArch64/arm64-ldp-cluster.ll index 75b02b9d913..80b67770665 100644 --- a/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ b/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -1,6 +1,7 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOSM1 %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; Test ldr clustering. ; CHECK: ********** MI Scheduling ********** @@ -8,11 +9,11 @@ ; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDRWui ; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldr_int:%bb.0 -; EXYNOS: Cluster ld/st SU(1) - SU(2) -; EXYNOS: SU(1): %{{[0-9]+}}:gpr32 = LDRWui -; EXYNOS: SU(2): %{{[0-9]+}}:gpr32 = LDRWui +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldr_int:%bb.0 +; EXYNOSM1: Cluster ld/st SU(1) - SU(2) +; EXYNOSM1: SU(1): %{{[0-9]+}}:gpr32 = LDRWui +; EXYNOSM1: SU(2): %{{[0-9]+}}:gpr32 = LDRWui define i32 @ldr_int(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 1 %tmp1 = load i32, i32* %p1, align 2 @@ -28,11 +29,11 @@ define i32 @ldr_int(i32* %a) nounwind { ; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %{{[0-9]+}}:gpr64 = LDRSWui ; CHECK: SU(2): %{{[0-9]+}}:gpr64 = LDRSWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_sext_int:%bb.0 -; EXYNOS: Cluster ld/st SU(1) - SU(2) -; EXYNOS: SU(1): %{{[0-9]+}}:gpr64 = LDRSWui -; EXYNOS: SU(2): %{{[0-9]+}}:gpr64 = LDRSWui +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldp_sext_int:%bb.0 +; EXYNOSM1: Cluster ld/st SU(1) - SU(2) +; EXYNOSM1: SU(1): %{{[0-9]+}}:gpr64 = LDRSWui +; EXYNOSM1: SU(2): %{{[0-9]+}}:gpr64 = LDRSWui define i64 @ldp_sext_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 @@ -49,11 +50,11 @@ define i64 @ldp_sext_int(i32* %p) nounwind { ; CHECK: Cluster ld/st SU(2) - SU(1) ; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDURWi ; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDURWi -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldur_int:%bb.0 -; EXYNOS: Cluster ld/st SU(2) - SU(1) -; EXYNOS: SU(1): %{{[0-9]+}}:gpr32 = LDURWi -; EXYNOS: SU(2): %{{[0-9]+}}:gpr32 = LDURWi +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldur_int:%bb.0 +; EXYNOSM1: Cluster ld/st SU(2) - SU(1) +; EXYNOSM1: SU(1): %{{[0-9]+}}:gpr32 = LDURWi +; EXYNOSM1: SU(2): %{{[0-9]+}}:gpr32 = LDURWi define i32 @ldur_int(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 @@ -69,11 +70,11 @@ define i32 @ldur_int(i32* %a) nounwind { ; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDRSWui ; CHECK: SU(4): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_half_sext_zext_int:%bb.0 -; EXYNOS: Cluster ld/st SU(3) - SU(4) -; EXYNOS: SU(3): %{{[0-9]+}}:gpr64 = LDRSWui -; EXYNOS: SU(4): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldp_half_sext_zext_int:%bb.0 +; EXYNOSM1: Cluster ld/st SU(3) - SU(4) +; EXYNOSM1: SU(3): %{{[0-9]+}}:gpr64 = LDRSWui +; EXYNOSM1: SU(4): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind { %tmp0 = load i64, i64* %q, align 4 %tmp = load i32, i32* %p, align 4 @@ -92,11 +93,11 @@ define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind { ; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui ; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRSWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_half_zext_sext_int:%bb.0 -; EXYNOS: Cluster ld/st SU(3) - SU(4) -; EXYNOS: SU(3): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui -; EXYNOS: SU(4): %{{[0-9]+}}:gpr64 = LDRSWui +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldp_half_zext_sext_int:%bb.0 +; EXYNOSM1: Cluster ld/st SU(3) - SU(4) +; EXYNOSM1: SU(3): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui +; EXYNOSM1: SU(4): %{{[0-9]+}}:gpr64 = LDRSWui define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind { %tmp0 = load i64, i64* %q, align 4 %tmp = load i32, i32* %p, align 4 @@ -115,11 +116,11 @@ define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind { ; CHECK-NOT: Cluster ld/st ; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDRWui ; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldr_int_volatile:%bb.0 -; EXYNOS-NOT: Cluster ld/st -; EXYNOS: SU(1): %{{[0-9]+}}:gpr32 = LDRWui -; EXYNOS: SU(2): %{{[0-9]+}}:gpr32 = LDRWui +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldr_int_volatile:%bb.0 +; EXYNOSM1-NOT: Cluster ld/st +; EXYNOSM1: SU(1): %{{[0-9]+}}:gpr32 = LDRWui +; EXYNOSM1: SU(2): %{{[0-9]+}}:gpr32 = LDRWui define i32 @ldr_int_volatile(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 1 %tmp1 = load volatile i32, i32* %p1, align 2 @@ -135,9 +136,9 @@ define i32 @ldr_int_volatile(i32* %a) nounwind { ; CHECK: Cluster ld/st SU(1) - SU(3) ; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui ; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldq_cluster:%bb.0 -; EXYNOS-NOT: Cluster ld/st +; EXYNOSM1: ********** MI Scheduling ********** +; EXYNOSM1-LABEL: ldq_cluster:%bb.0 +; EXYNOSM1-NOT: Cluster ld/st define <2 x i64> @ldq_cluster(i64* %p) { %a1 = bitcast i64* %p to <2 x i64>* %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll index b3a2bcd5d66..c2412371d14 100644 --- a/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,GENERIC ; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check. ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1 +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m3 | FileCheck %s --check-prefixes=CHECK,EXYNOSM3 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) @@ -353,6 +354,7 @@ define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x floa ; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -366,6 +368,7 @@ define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x flo ; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -379,6 +382,7 @@ define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x flo ; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -390,6 +394,7 @@ define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x fl ; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -401,6 +406,7 @@ define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x floa ; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> @@ -413,6 +419,7 @@ define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x flo ; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> @@ -425,6 +432,7 @@ define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x flo ; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> @@ -437,6 +445,7 @@ define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x fl ; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> @@ -449,6 +458,7 @@ define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x ; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -462,6 +472,7 @@ define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x ; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -473,6 +484,7 @@ define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x ; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %sub = fsub <1 x double> , %v %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer @@ -485,6 +497,7 @@ define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x ; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> @@ -1311,6 +1324,7 @@ define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1334,6 +1348,7 @@ define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1345,6 +1360,7 @@ define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { ; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -1356,6 +1372,7 @@ define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1379,6 +1396,7 @@ define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1390,6 +1408,7 @@ define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %mul = fmul <2 x double> %shuffle, %a @@ -1401,6 +1420,7 @@ define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] ; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1412,6 +1432,7 @@ define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1423,6 +1444,7 @@ define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { ; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1434,6 +1456,7 @@ define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] ; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1445,6 +1468,7 @@ define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1456,6 +1480,7 @@ define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] ; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1771,6 +1796,7 @@ define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x fl ; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -1782,6 +1808,7 @@ define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x f ; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -1793,6 +1820,7 @@ define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x f ; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -1804,6 +1832,7 @@ define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x ; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -1815,6 +1844,7 @@ define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x fl ; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer @@ -1827,6 +1857,7 @@ define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x f ; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer @@ -1839,6 +1870,7 @@ define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x f ; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer @@ -1851,6 +1883,7 @@ define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x ; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer @@ -1863,6 +1896,7 @@ define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 ; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -1874,6 +1908,7 @@ define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 ; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer @@ -2606,6 +2641,7 @@ define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2617,6 +2653,7 @@ define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2628,6 +2665,7 @@ define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2651,6 +2689,7 @@ define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2662,6 +2701,7 @@ define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -2673,6 +2713,7 @@ define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2684,6 +2725,7 @@ define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -2695,6 +2737,7 @@ define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { ; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -2706,6 +2749,7 @@ define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] ; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s +; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2717,6 +2761,7 @@ define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] ; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -2728,6 +2773,7 @@ define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] ; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d +; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -2741,6 +2787,8 @@ define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, ; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] ; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s ; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] entry: %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -2758,6 +2806,8 @@ define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> ; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s ; EXYNOSM1: dup [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1] ; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s +; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] entry: %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -2785,3 +2835,12 @@ entry: %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ret <2 x float> %0 } + +define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m3(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m3" { +; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3: +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} diff --git a/test/CodeGen/AArch64/machine-combiner-madd.ll b/test/CodeGen/AArch64/machine-combiner-madd.ll index 5ace6e63136..eeeafbbfa47 100644 --- a/test/CodeGen/AArch64/machine-combiner-madd.ll +++ b/test/CodeGen/AArch64/machine-combiner-madd.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s diff --git a/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/test/CodeGen/AArch64/no-quad-ldp-stp.ll index fb030d29136..32f57cd21e8 100644 --- a/test/CodeGen/AArch64/no-quad-ldp-stp.ll +++ b/test/CodeGen/AArch64/no-quad-ldp-stp.ll @@ -1,10 +1,12 @@ -; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,SLOW +; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,SLOW +; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m3 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,FAST ; CHECK-LABEL: test_nopair_st -; CHECK: str -; CHECK: stur -; CHECK-NOT: stp +; SLOW: str +; SLOW: stur +; SLOW-NOT: stp +; FAST: stp define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) { %tmp1 = bitcast double* %ptr to <2 x double>* store <2 x double> %v2, <2 x double>* %tmp1, align 16 @@ -15,9 +17,10 @@ define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) { } ; CHECK-LABEL: test_nopair_ld -; CHECK: ldr -; CHECK: ldr -; CHECK-NOT: ldp +; SLOW: ldr +; SLOW: ldr +; SLOW-NOT: ldp +; FAST: ldp define <2 x i64> @test_nopair_ld(i64* %p) { %a1 = bitcast i64* %p to <2 x i64>* %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 -- 2.11.0