From d20a3d35e1875d7a4928184117e6a875c35f3f63 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 29 May 2020 12:53:30 +0100 Subject: [PATCH] [DAGComb] Do not turn insert_elt into shuffle for single elt vectors. Currently combineInsertEltToShuffle turns insert_vector_elt into a vector_shuffle, even if the inserted element is a vector with a single element. In this case, it should be unlikely that the additional shuffle would be more efficient than an insert_vector_elt. Additionally, this fixes an infinite cycle in DAGCombine, where combineInsertEltToShuffle turns an insert_vector_elt into a shuffle, which gets turned back into an insert_vector_elt/extract_vector_elt by a custom AArch64 lowering (in visitVECTOR_SHUFFLE). Such insert_vector_elt and extract_vector_elt combinations can be lowered efficiently using mov on AArch64. There are 2 test changes in arm64-neon-copy.ll: we now use one or two mov instructions instead of a single zip1. The reason that we need a second mov in ins1f2 is that we have to move the result to the result register; I think this is not really related to the DAGCombine fold. But in any case, on most uarchs, mov should be cheaper than zip1. 
On a Cortex-A75, for example, zip1 is twice as expensive as mov (https://developer.arm.com/docs/101398/latest/arm-cortex-a75-software-optimization-guide-v20) Reviewers: spatel, efriedma, dmgreen, RKSimon Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D80710 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +++ llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 5 ++-- .../CodeGen/AArch64/vector-insert-shuffle-cycle.ll | 35 ++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 92161512728..0176ae3a0ab 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17137,6 +17137,10 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { EVT SubVecVT = SubVec.getValueType(); EVT VT = DestVec.getValueType(); unsigned NumSrcElts = SubVecVT.getVectorNumElements(); + // If the source only has a single vector element, the cost of a shuffle to + // add it to a vector is likely to exceed the cost of an insert_vector_elt. 
+ if (NumSrcElts == 1) + return SDValue(); unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits(); unsigned NumMaskVals = ExtendRatio * NumSrcElts; diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 7820734e366..05a273f5f2d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -200,7 +200,8 @@ define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { ; CHECK-LABEL: ins1f2: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %tmp3 = extractelement <1 x double> %tmp1, i32 0 %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 @@ -211,7 +212,7 @@ define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1) ; CHECK-LABEL: ins1f2_args_flipped: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %tmp3 = extractelement <1 x double> %tmp1, i32 0 %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 diff --git a/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll new file mode 100644 index 00000000000..57e7ef1a0e7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - | FileCheck %s + +target triple = "arm64-apple-ios13.4.0" + +; Make sure we do not get stuck in a cycle in DAGCombiner. 
+ +define void @test(i1 %c, <1 x double>* %ptr) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: tbz w0, #0, LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: LBB0_2: ; %bb2 +; CHECK-NEXT: ldr q1, [x8] +; CHECK-NEXT: mov.d v1[0], v0[0] +; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: ret +entry: + br i1 %c, label %bb1, label %bb2 + +bb1: + %lv1 = load <1 x double>, <1 x double>* %ptr, align 16 + br label %bb2 + +bb2: + %p = phi <1 x double> [ %lv1, %bb1 ], [ zeroinitializer, %entry ] + %vecext19 = extractelement <1 x double> %p, i32 0 + %arrayidx21 = getelementptr inbounds [4 x <4 x double>], [4 x <4 x double>]* undef, i64 0, i64 3 + %lv2 = load <4 x double>, <4 x double>* %arrayidx21, align 16 + %vecins22 = insertelement <4 x double> %lv2, double %vecext19, i32 2 + store <4 x double> %vecins22, <4 x double>* %arrayidx21, align 16 + ret void +} -- 2.11.0