From 51e92d172ed0fdd01fb2c5cd6bb2837097a4a858 Mon Sep 17 00:00:00 2001
From: Geoff Berry
Date: Mon, 26 Sep 2016 15:34:47 +0000
Subject: [PATCH] [AArch64] Improve add/sub/cmp isel of uxtw forms.

Don't match the UXTW extended reg forms of ADD/ADDS/SUB/SUBS if the
32-bit to 64-bit zero-extend can be done for free by taking advantage
of the 32-bit defining instruction zeroing the upper 32-bits of the X
register destination. This enables better instruction selection in a
few cases, such as:

  sub x0, xzr, x8
  instead of:
  mov x8, xzr
  sub x0, x8, w9, uxtw

  madd x0, x1, x1, x8
  instead of:
  mul x9, x1, x1
  add x0, x9, w8, uxtw

  cmp x2, x8
  instead of:
  sub x8, x2, w8, uxtw
  cmp x8, #0

  add x0, x8, x1, lsl #3
  instead of:
  lsl x9, x1, #3
  add x0, x9, w8, uxtw

Reviewers: t.p.northover, jmolloy

Subscribers: mcrosier, aemerson, llvm-commits, rengolin

Differential Revision: https://reviews.llvm.org/D24747

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@282413 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp |   5 ++
 lib/Target/AArch64/AArch64ISelLowering.h   |  15 ++++
 lib/Target/AArch64/AArch64InstrInfo.td     |   9 +--
 test/CodeGen/AArch64/addsub_ext.ll         | 109 +++++++++++++++++++++++++++--
 4 files changed, 124 insertions(+), 14 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6b7d517b5d0..6fa0782222d 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -586,6 +586,11 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
       return false;
 
     Reg = N.getOperand(0);
+
+    // Don't match if free 32-bit -> 64-bit zext can be used instead.
+    if (Ext == AArch64_AM::UXTW &&
+        Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+      return false;
   }
 
   // AArch64 mandates that the RHS of the operation must use the smallest
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 86f1d972c9d..882ed19480b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,6 +215,21 @@ enum NodeType : unsigned {
 
 } // end namespace AArch64ISD
 
+namespace {
+
+// Any instruction that defines a 32-bit result zeros out the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+static inline bool isDef32(const SDNode &N) {
+  unsigned Opc = N.getOpcode();
+  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
+         Opc != ISD::CopyFromReg;
+}
+
+} // end anonymous namespace
+
 class AArch64Subtarget;
 class AArch64TargetMachine;
 
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index e58ad278215..dad097e07ac 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5272,15 +5272,8 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0
 //----------------------------------------------------------------------------
 // FIXME: Like for X86, these should go in their own separate .td file.
 
-// Any instruction that defines a 32-bit result leaves the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
 def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return N->getOpcode() != ISD::TRUNCATE &&
-         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
-         N->getOpcode() != ISD::CopyFromReg;
+  return isDef32(*N);
 }]>;
 
 // In the case of a 32-bit def that is known to implicitly zero-extend,
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index cfe0dfc956a..df1b9fe7855 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -274,19 +274,20 @@ define void @sub_i16rhs() minsize {
 ; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for
 ; example), but the remaining instructions are probably not idiomatic
 ; in the face of "add/sub (shifted register)" so I don't intend to.
-define void @addsub_i32rhs() minsize {
+define void @addsub_i32rhs(i32 %in32) minsize {
 ; CHECK-LABEL: addsub_i32rhs:
   %val32_tmp = load i32, i32* @var32
   %lhs64 = load i64, i64* @var64
 
   %val32 = add i32 %val32_tmp, 123
-  %rhs64_zext = zext i32 %val32 to i64
+  %rhs64_zext = zext i32 %in32 to i64
   %res64_zext = add i64 %lhs64, %rhs64_zext
   store volatile i64 %res64_zext, i64* @var64
 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
 
-  %rhs64_zext_shift = shl i64 %rhs64_zext, 2
+  %rhs64_zext2 = zext i32 %val32 to i64
+  %rhs64_zext_shift = shl i64 %rhs64_zext2, 2
   %res64_zext_shift = add i64 %lhs64, %rhs64_zext_shift
   store volatile i64 %res64_zext_shift, i64* @var64
 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2
 
@@ -304,19 +305,20 @@ define void @addsub_i32rhs() minsize {
   ret void
 }
 
-define void @sub_i32rhs() minsize {
+define void @sub_i32rhs(i32 %in32) minsize {
 ; CHECK-LABEL: sub_i32rhs:
   %val32_tmp = load i32, i32* @var32
   %lhs64 = load i64, i64* @var64
 
   %val32 = add i32 %val32_tmp, 123
-  %rhs64_zext = zext i32 %val32 to i64
+  %rhs64_zext = zext i32 %in32 to i64
   %res64_zext = sub i64 %lhs64, %rhs64_zext
   store volatile i64 %res64_zext, i64* @var64
 ; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
 
-  %rhs64_zext_shift = shl i64 %rhs64_zext, 2
+  %rhs64_zext2 = zext i32 %val32 to i64
+  %rhs64_zext_shift = shl i64 %rhs64_zext2, 2
   %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift
   store volatile i64 %res64_zext_shift, i64* @var64
 ; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2
 
@@ -333,3 +335,98 @@
 
   ret void
 }
+
+; Check that implicit zext from w reg write is used instead of uxtw form of add.
+define i64 @add_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: add_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: add x0, x1, x[[TMP]]
+  %ret = add i64 %y, %ext
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of sub and that mov WZR is folded to form a neg instruction.
+define i64 @sub_fold_uxtw_xzr(i32 %x) {
+; CHECK-LABEL: sub_fold_uxtw_xzr:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: neg x0, x[[TMP]]
+  %ret = sub i64 0, %ext
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw form of subs/cmp.
+define i1 @cmp_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: cmp_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: cmp x1, x[[TMP]]
+; CHECK-NEXT: cset
+  %ret = icmp eq i64 %y, %ext
+  ret i1 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of add, leading to madd selection.
+define i64 @madd_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: madd_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: madd x0, x1, x1, x[[TMP]]
+  %mul = mul i64 %y, %y
+  %ret = add i64 %mul, %ext
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of sub, leading to sub/cmp folding.
+; Check that implicit zext from w reg write is used instead of uxtw form of subs/cmp.
+define i1 @cmp_sub_fold_uxtw(i32 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: cmp_sub_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: cmp x[[TMP2:[0-9]+]], x[[TMP]]
+; CHECK-NEXT: cset
+  %sub = sub i64 %z, %ext
+  %ret = icmp eq i64 %sub, 0
+  ret i1 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of add and add of -1 gets selected as sub.
+define i64 @add_imm_fold_uxtw(i32 %x) {
+; CHECK-LABEL: add_imm_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: sub x0, x[[TMP]], #1
+  %ret = add i64 %ext, -1
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of add and add lsl form gets selected.
+define i64 @add_lsl_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: add_lsl_fold_uxtw:
+entry:
+; CHECK: orr w[[TMP:[0-9]+]], w0, #0x3
+  %m = or i32 %x, 3
+  %ext = zext i32 %m to i64
+  %shift = shl i64 %y, 3
+; CHECK-NEXT: add x0, x[[TMP]], x1, lsl #3
+  %ret = add i64 %ext, %shift
+  ret i64 %ret
+}
-- 
2.11.0
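A C-level illustration of the madd case from the commit message (my own sketch, not part of the patch or its tests; the function name madd_example is invented, and the expected selection mirrors the madd_fold_uxtw test above):

  /* The 32-bit AND defines a W register and so implicitly zeroes the upper
     32 bits of the corresponding X register.  With this change the zero-extend
     costs nothing, so the add of the multiply should select
     "madd x0, x1, x1, xN" instead of mul followed by "add x0, x9, w8, uxtw". */
  unsigned long long madd_example(unsigned int w, unsigned long long x) {
    unsigned long long zext = (unsigned long long)(w & 3u); /* and wN, w0, #0x3 */
    return x * x + zext;                                    /* madd x0, x1, x1, xN */
  }

Compiled for AArch64 at -O2 with this patch applied, the uxtw extended-register form should no longer be chosen here, matching the CHECK lines of madd_fold_uxtw.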