From 7923d71b4a7a88f97c8a3efe1eb1473a4b2f5bf3 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 1 Dec 2020 09:04:36 +0000 Subject: [PATCH] [ARM] PREDICATE_CAST demanded bits The PREDICATE_CAST node is used to model moves between MVE predicate registers and GPRs, and eventually becomes a VMSR p0, rn. When moving to a predicate, only the bottom 16 bits of the source register are demanded. This adds a simple fold for that, allowing it to potentially remove instructions like uxth. Differential Revision: https://reviews.llvm.org/D92213 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 7 +++++++ llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll | 4 +--- llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll | 7 +------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index c94b9e64632..0426a560805 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -13844,6 +13844,13 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); } + // Only the bottom 16 bits of the source register are used. 
+ if (Op.getValueType() == MVT::i32) { + APInt DemandedMask = APInt::getLowBitsSet(32, 16); + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) + return SDValue(N, 0); + } return SDValue(); } diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll index fff9ad87102..c7e553fa351 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -139,10 +139,9 @@ define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) { ; CHECK-LE-NEXT: mov r4, sp ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 -; CHECK-LE-NEXT: uxth r0, r0 ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: mov sp, r4 ; CHECK-LE-NEXT: pop {r4, r6, r7, pc} @@ -160,7 +159,6 @@ define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) { ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: uxth r0, r0 ; CHECK-BE-NEXT: sub.w r4, r7, #8 ; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: vmsr p0, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll index afad0077bbe..17f57743c30 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll @@ -51,10 +51,8 @@ define arm_aapcs_vfpcc void @const(<8 x i16> %acc0, <8 x i16> %acc1, i32* nocapt ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r6, r7, lr} ; CHECK-NEXT: push {r4, r6, r7, lr} -; CHECK-NEXT: uxth r2, r1 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: mvns r1, r1 -; CHECK-NEXT: vmsr p0, r2 -; CHECK-NEXT: uxth r1, r1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vaddvt.s16 r12, q1 ; CHECK-NEXT: vaddvt.s16 r2, q0 @@ -92,7 +90,6 @@ define arm_aapcs_vfpcc <4 x i32> 
@xorvpnot_i32(<4 x i32> %acc0, i16 signext %p0) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: mvns r0, r0 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr @@ -109,7 +106,6 @@ define arm_aapcs_vfpcc <8 x i16> @xorvpnot_i16(<8 x i16> %acc0, i16 signext %p0) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: mvns r0, r0 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr @@ -126,7 +122,6 @@ define arm_aapcs_vfpcc <16 x i8> @xorvpnot_i8(<16 x i8> %acc0, i16 signext %p0) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: mvns r0, r0 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr -- 2.11.0