From 6ff1a723f7396bbb93385d08491032323de658f3 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Tue, 23 May 2017 19:54:48 +0000
Subject: [PATCH] [AMDGPU] Combine and (srl) into shl (bfe)

Perform the DAG combine:

  and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb

where nb is the number of trailing zeroes in mask. This replaces two
instructions with two others, but BFE is generally the more expensive
one. The combine is therefore only performed when we are selecting a
byte or word at an aligned boundary, which results in a proper SDWA
operand pattern, and only when SDWA is supported.

TODO: improve the SDWA pass to actually convert this pattern. It is not
done now because the instruction carries an immediate, which would have
to be moved into a VGPR.

Differential Revision: https://reviews.llvm.org/D33455

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303681 91177308-0d34-0410-b5e6-96231b3b80d8
---
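Note (editor commentary, not part of the change): the identity behind the
combine can be checked in isolation. In the implementation the BFE width
operand is countPopulation(mask) and the final shift amount is the number
of trailing zeroes in mask; the constants below come from the 8-bit test
case added by this patch. A minimal C++20 sketch, where bfe_u32 is an
illustrative model of v_bfe_u32 rather than anything from the tree:

// Sketch only: checks and (srl x, c), mask == shl (bfe x, nb + c, w), nb
// for a shifted mask, with w = popcount(mask) and nb = countr_zero(mask).
#include <bit>
#include <cassert>
#include <cstdint>

// Plain C++ model of the v_bfe_u32 bitfield extract (Width < 32 assumed).
static uint32_t bfe_u32(uint32_t X, unsigned Offset, unsigned Width) {
  return (X >> Offset) & ((1u << Width) - 1u);
}

int main() {
  const uint32_t Mask = 0x3FC;                // shifted 8-bit mask (see test)
  const unsigned C = 6;                       // srl amount (see test)
  const unsigned NB = std::countr_zero(Mask); // 2: trailing zeroes in mask
  const unsigned W = std::popcount(Mask);     // 8: extracted field width
  for (uint32_t X : {0u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(((X >> C) & Mask) == (bfe_u32(X, NB + C, W) << NB));
  return 0;
}

The same arithmetic explains the VI checks in the 16-bit test: mask
0x7fff8000 with srl 1 gives a field starting at bit 16, hence
v_bfe_u32 ..., 16, 16.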
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  3 ++-
 lib/Target/AMDGPU/AMDGPUSubtarget.h      |  8 +++----
 lib/Target/AMDGPU/SIISelLowering.cpp     | 40 ++++++++++++++++++++++++++-----
 test/CodeGen/AMDGPU/bfe-combine.ll       | 41 ++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/bfe-combine.ll

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7562d176d..723e8a7b54e 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3478,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                      DL);
     }
 
-    if ((OffsetVal + WidthVal) >= 32) {
+    if ((OffsetVal + WidthVal) >= 32 &&
+        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                          BitsFrom, ShiftVal);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d122af0e27b..24ebde5dc93 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -420,6 +420,10 @@ public:
     return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
 
+  bool hasSDWA() const {
+    return HasSDWA;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   /// of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -674,10 +678,6 @@ public:
     return HasInv2PiInlineImm;
   }
 
-  bool hasSDWA() const {
-    return HasSDWA;
-  }
-
   bool hasDPP() const {
     return HasDPP;
   }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca..8c939f4bf00 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4229,12 +4229,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
   SDValue RHS = N->getOperand(1);
 
-  if (VT == MVT::i64) {
-    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-    if (CRHS) {
-      if (SDValue Split
-          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
-        return Split;
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (VT == MVT::i64 && CRHS) {
+    if (SDValue Split
+        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+      return Split;
+  }
+
+  if (CRHS && VT == MVT::i32) {
+    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+    // nb = number of trailing zeroes in mask
+    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+    uint64_t Mask = CRHS->getZExtValue();
+    unsigned Bits = countPopulation(Mask);
+    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+        unsigned Shift = CShift->getZExtValue();
+        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+        unsigned Offset = NB + Shift;
+        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+          SDLoc SL(N);
+          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                    LHS->getOperand(0),
+                                    DAG.getConstant(Offset, SL, MVT::i32),
+                                    DAG.getConstant(Bits, SL, MVT::i32));
+          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+                                    DAG.getValueType(NarrowVT));
+          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+          return Shl;
+        }
+      }
     }
   }
diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll
new file mode 100644
index 00000000000..791b49f0e14
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+; GCN-LABEL: {{^}}bfe_combine8:
+; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8
+; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]]
+; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}}
+; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]]
+; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %idx = add i32 %x, %id
+  %srl = lshr i32 %idx, 8
+  %and = and i32 %srl, 255
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_combine16:
+; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16
+; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]]
+; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}}
+; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]]
+; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2
+; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %idx = add i32 %x, %id
+  %srl = lshr i32 %idx, 1
+  %and = and i32 %srl, 2147450880
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
-- 
2.11.0
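Appended editor note (commentary, not part of the patch): the guard in
performAndCombine can be mirrored standalone. combineFires below is an
illustrative C++20 stand-in using <bit> in place of LLVM's MathExtras
helpers (countPopulation, isShiftedMask_64); the hasSDWA() and ISD::SRL
opcode checks from the real code are assumed to hold:

// Sketch only: would the combine fire for `and (srl x, Shift), Mask`?
#include <bit>
#include <cstdint>
#include <cstdio>

static bool combineFires(uint64_t Mask, unsigned Shift) {
  const unsigned Bits = std::popcount(Mask);  // countPopulation(Mask)
  if (Bits != 8 && Bits != 16)
    return false;                             // only byte or word fields
  if (Mask & 1)
    return false;                             // mask must be shifted (nb > 0)
  const unsigned NB = std::countr_zero(Mask); // trailing zeroes in mask
  if (((Mask >> NB) & ((Mask >> NB) + 1)) != 0)
    return false;                             // isShiftedMask_64: contiguous
  return ((NB + Shift) & (Bits - 1)) == 0;    // bfe offset byte/word aligned
}

int main() {
  std::printf("%d\n", combineFires(0x3FC, 6));      // 1: the 8-bit test pattern
  std::printf("%d\n", combineFires(0x7FFF8000, 1)); // 1: the 16-bit test pattern
  std::printf("%d\n", combineFires(0xFF0, 1));      // 0: field at bit 5, unaligned
  return 0;
}

Both test patterns pass the predicate; a field that does not start on a
byte boundary (0xFF0 with srl 1 extracts from bit 5) is rejected and
keeps the srl/and form.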