From 6ff1a723f7396bbb93385d08491032323de658f3 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Tue, 23 May 2017 19:54:48 +0000
Subject: [PATCH] [AMDGPU] Combine and (srl) into shl (bfe)

Perform the DAG combine:

  and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb

where nb is the number of trailing zeroes in mask. This replaces two
instructions with two others, but BFE is generally the more expensive
one. The combine is therefore only performed when we are selecting a
byte or word at an aligned boundary, which results in a proper SDWA
operand pattern, and only when SDWA is supported.

TODO: improve the SDWA pass to actually convert this pattern. It is not
done now because the instruction carries an immediate, which would have
to be moved into a VGPR.

Differential Revision: https://reviews.llvm.org/D33455

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303681 91177308-0d34-0410-b5e6-96231b3b80d8
---
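Note (editor commentary, not part of the change): the identity behind the
combine can be checked in isolation. In the implementation the BFE width
operand is countPopulation(mask) and the final shift amount is the number
of trailing zeroes in mask; the constants below come from the 8-bit test
case added by this patch. A minimal C++20 sketch, where bfe_u32 is an
illustrative model of v_bfe_u32 rather than anything from the tree:

// Sketch only: checks and (srl x, c), mask == shl (bfe x, nb + c, w), nb
// for a shifted mask, with w = popcount(mask) and nb = countr_zero(mask).
#include <bit>
#include <cassert>
#include <cstdint>

// Plain C++ model of the v_bfe_u32 bitfield extract (Width < 32 assumed).
static uint32_t bfe_u32(uint32_t X, unsigned Offset, unsigned Width) {
  return (X >> Offset) & ((1u << Width) - 1u);
}

int main() {
  const uint32_t Mask = 0x3FC;                // shifted 8-bit mask (see test)
  const unsigned C = 6;                       // srl amount (see test)
  const unsigned NB = std::countr_zero(Mask); // 2: trailing zeroes in mask
  const unsigned W = std::popcount(Mask);     // 8: extracted field width
  for (uint32_t X : {0u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(((X >> C) & Mask) == (bfe_u32(X, NB + C, W) << NB));
  return 0;
}

The same arithmetic explains the VI checks in the 16-bit test: mask
0x7fff8000 with srl 1 gives a field starting at bit 16, hence
v_bfe_u32 ..., 16, 16.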
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  3 ++-
 lib/Target/AMDGPU/AMDGPUSubtarget.h      |  8 +++----
 lib/Target/AMDGPU/SIISelLowering.cpp     | 40 ++++++++++++++++++++++++++-----
 test/CodeGen/AMDGPU/bfe-combine.ll       | 41 ++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/bfe-combine.ll

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7562d176d..723e8a7b54e 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3478,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                      DL);
     }
 
-    if ((OffsetVal + WidthVal) >= 32) {
+    if ((OffsetVal + WidthVal) >= 32 &&
+        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                          BitsFrom, ShiftVal);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d122af0e27b..24ebde5dc93 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -420,6 +420,10 @@ public:
     return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
 
+  bool hasSDWA() const {
+    return HasSDWA;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   /// of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -674,10 +678,6 @@ public:
     return HasInv2PiInlineImm;
   }
 
-  bool hasSDWA() const {
-    return HasSDWA;
-  }
-
   bool hasDPP() const {
     return HasDPP;
   }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca..8c939f4bf00 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4229,12 +4229,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
   SDValue RHS = N->getOperand(1);
 
-  if (VT == MVT::i64) {
-    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-    if (CRHS) {
-      if (SDValue Split
-          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
-        return Split;
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (VT == MVT::i64 && CRHS) {
+    if (SDValue Split
+        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+      return Split;
+  }
+
+  if (CRHS && VT == MVT::i32) {
+    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+    // nb = number of trailing zeroes in mask
+    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+    uint64_t Mask = CRHS->getZExtValue();
+    unsigned Bits = countPopulation(Mask);
+    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+        unsigned Shift = CShift->getZExtValue();
+        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+        unsigned Offset = NB + Shift;
+        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+          SDLoc SL(N);
+          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                    LHS->getOperand(0),
+                                    DAG.getConstant(Offset, SL, MVT::i32),
+                                    DAG.getConstant(Bits, SL, MVT::i32));
+          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+                                    DAG.getValueType(NarrowVT));
+          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+          return Shl;
+        }
+      }
     }
   }
diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll
new file mode 100644
index 00000000000..791b49f0e14
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+; GCN-LABEL: {{^}}bfe_combine8:
+; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8
+; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]]
+; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}}
+; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]]
+; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %idx = add i32 %x, %id
+  %srl = lshr i32 %idx, 8
+  %and = and i32 %srl, 255
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_combine16:
+; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16
+; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]]
+; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}}
+; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]]
+; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2
+; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %idx = add i32 %x, %id
+  %srl = lshr i32 %idx, 1
+  %and = and i32 %srl, 2147450880
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
-- 
2.11.0
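Appended editor note (commentary, not part of the patch): the guard in
performAndCombine can be mirrored standalone. combineFires below is an
illustrative C++20 stand-in using <bit> in place of LLVM's MathExtras
helpers (countPopulation, isShiftedMask_64); the hasSDWA() and ISD::SRL
opcode checks from the real code are assumed to hold:

// Sketch only: would the combine fire for `and (srl x, Shift), Mask`?
#include <bit>
#include <cstdint>
#include <cstdio>

static bool combineFires(uint64_t Mask, unsigned Shift) {
  const unsigned Bits = std::popcount(Mask);  // countPopulation(Mask)
  if (Bits != 8 && Bits != 16)
    return false;                             // only byte or word fields
  if (Mask & 1)
    return false;                             // mask must be shifted (nb > 0)
  const unsigned NB = std::countr_zero(Mask); // trailing zeroes in mask
  if (((Mask >> NB) & ((Mask >> NB) + 1)) != 0)
    return false;                             // isShiftedMask_64: contiguous
  return ((NB + Shift) & (Bits - 1)) == 0;    // bfe offset byte/word aligned
}

int main() {
  std::printf("%d\n", combineFires(0x3FC, 6));      // 1: the 8-bit test pattern
  std::printf("%d\n", combineFires(0x7FFF8000, 1)); // 1: the 16-bit test pattern
  std::printf("%d\n", combineFires(0xFF0, 1));      // 0: field at bit 5, unaligned
  return 0;
}

Both test patterns pass the predicate; a field that does not start on a
byte boundary (0xFF0 with srl 1 extracts from bit 5) is rejected and
keeps the srl/and form.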