From e2d935510ce6e5c7d357d04b34ba7c48925c1a0b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Tue, 27 Jun 2017 18:25:26 +0000
Subject: [PATCH] [AMDGPU] Combine and x, (sext cc from i1) => select cc, x, 0

Also factored out function to check if a boolean is an already
deserialized value which does not require v_cndmask_b32 to be
loaded. Added binary logical operators to its check.

Differential Revision: https://reviews.llvm.org/D34500

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306439 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp         | 30 ++++++++++++++++++++++++++--
 test/CodeGen/AMDGPU/combine-and-sext-bool.ll | 27 +++++++++++++++++++++++++
 test/CodeGen/AMDGPU/combine-cond-add-sub.ll  | 20 +++++++++++++++++++
 3 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/combine-and-sext-bool.ll

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index d0f4e00994d..8ba69c7d988 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4314,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
   return SDValue();
 }
 
+// Returns true if argument is a boolean value which is not serialized into
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
+static bool isBoolSGPR(SDValue V) {
+  if (V.getValueType() != MVT::i1)
+    return false;
+  switch (V.getOpcode()) {
+  default: break;
+  case ISD::SETCC:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case AMDGPUISD::FP_CLASS:
+    return true;
+  }
+  return false;
+}
+
 SDValue SITargetLowering::performAndCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (DCI.isBeforeLegalize())
@@ -4402,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
     }
   }
 
+  if (VT == MVT::i32 &&
+      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
+    // and x, (sext cc from i1) => select cc, x, 0
+    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
+      std::swap(LHS, RHS);
+    if (isBoolSGPR(RHS.getOperand(0)))
+      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
+                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
+  }
+
   return SDValue();
 }
 
@@ -4941,8 +4968,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
   case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND: {
     auto Cond = RHS.getOperand(0);
-    if (Cond.getOpcode() != ISD::SETCC &&
-        Cond.getOpcode() != AMDGPUISD::FP_CLASS)
+    if (!isBoolSGPR(Cond))
       break;
     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
diff --git a/test/CodeGen/AMDGPU/combine-and-sext-bool.ll b/test/CodeGen/AMDGPU/combine-and-sext-bool.ll
new file mode 100644
index 00000000000..cd4ac4d58ad
--- /dev/null
+++ b/test/CodeGen/AMDGPU/combine-and-sext-bool.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}and_i1_sext_bool:
+; GCN: v_cmp_{{gt|le}}_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e{{32|64}} [[VAL:v[0-9]+]], 0, v{{[0-9]+}}, [[CC]]
+; GCN: store_dword {{.*}}[[VAL]]
+; GCN-NOT: v_cndmask_b32_e64 v{{[0-9]+}}, {{0|-1}}, {{0|-1}}
+; GCN-NOT: v_and_b32_e32
+
+define amdgpu_kernel void @and_i1_sext_bool(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %and = and i32 %v, %ext
+  store i32 %and, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 187fb24dfb6..9e47c7d3449 100644
--- a/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -150,6 +150,26 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}add_and:
+; GCN: s_and_b64 [[CC:[^,]+]],
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @add_and(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp1 = icmp ugt i32 %x, %y
+  %cmp2 = icmp ugt i32 %x, 1
+  %cmp = and i1 %cmp1, %cmp2
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
 declare i1 @llvm.amdgcn.class.f32(float, i32) #0
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
-- 
2.11.0