From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 7 Nov 2016 23:04:50 +0000 (+0000)
Subject: [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
X-Git-Tag: android-x86-7.1-r4~24847
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=c312996e7a99eae912f6c1c67b6d1bef5a84c42c;p=android-x86%2Fexternal-llvm.git

[AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies

Codegen prepare sinks comparisons close to a user is we have only one register
for conditions. For AMDGPU we have many SGPRs capable to hold vector conditions.
Changed BE to report we have many condition registers. That way IR LICM pass
would hoist an invariant comparison out of a loop and codegen prepare will not
sink it.

With that done a condition is calculated in one block and used in another.
Current behavior is to store workitem's condition in a VGPR using v_cndmask
and then restore it with yet another v_cmp instruction from that v_cndmask's
result. To mitigate the issue a forward propagation of a v_cmp 64 bit result
to an user is implemented. Additional side effect of this is that we may
consume less VGPRs in a cost of more SGPRs in case if holding of multiple
conditions is needed, and that is a clear win in most cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286171 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d7108416ccc..2e43d427e47 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -440,6 +440,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
+  setHasMultipleConditionRegisters(true);
 
   // SI at least has hardware support for floating point exceptions, but no way
   // of using or handling them is implemented. They are also optional in OpenCL
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 9e62980940b..3b7db8c465f 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -121,11 +121,31 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
           }
         }
 
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-          .addOperand(Dst)
-          .addImm(0)
-          .addImm(-1)
-          .addOperand(Src);
+        // If there are uses which are just a copy back from this new VReg_1
+        // to another SGPR_64 just forward propagate original SGPR_64.
+        SmallVector<MachineInstr *, 4> RegUses;
+        for (auto &Use : MRI.use_instructions(Dst.getReg()))
+          if (Use.isFullCopy())
+            RegUses.push_back(&Use);
+
+        while (!RegUses.empty()) {
+          MachineInstr *Use = RegUses.pop_back_val();
+          if (Use->getOperand(1).getReg() == Dst.getReg()) {
+            unsigned RegCopy = Use->getOperand(0).getReg();
+            if (!TargetRegisterInfo::isVirtualRegister(RegCopy))
+              continue;
+            Use->eraseFromParent();
+            MRI.replaceRegWith(RegCopy, Src.getReg());
+          }
+        }
+
+        if (!MRI.use_empty(Dst.getReg()))
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+            .addOperand(Dst)
+            .addImm(0)
+            .addImm(-1)
+            .addOperand(Src);
+
         MI.eraseFromParent();
       } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                  SrcRC == &AMDGPU::VReg_1RegClass) {
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index c298911504a..6d723013c1b 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -90,7 +90,7 @@ bb3:
 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
 ; GCN: s_load_dword [[CND:s[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
-; GCN-DAG: v_cmp_eq_f32_e64 vcc, [[CND]], 0
+; GCN-DAG: v_cmp_eq_f32_e64 {{vcc|(s\[[0-9]+:[0-9]+\])}}, [[CND]], 0
 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
@@ -492,8 +492,8 @@ ret:
 ; GCN: s_setpc_b64
 
 ; GCN: [[LONG_BR_DEST0]]
-; GCN: s_cmp_eq_u32
-; GCN-NEXT: s_cbranch_scc0
+; GCN: v_cmp_ne_u32_e32
+; GCN-NEXT: s_cbranch_vccz
 ; GCN: s_setpc_b64
 
 ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/host-cond.ll b/test/CodeGen/AMDGPU/host-cond.ll
new file mode 100644
index 00000000000..6831f224c91
--- /dev/null
+++ b/test/CodeGen/AMDGPU/host-cond.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that invariant compare is hoisted out of the loop.
+; At the same time condition shall not be serialized into a VGPR and deserialized later
+; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
+
+; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: BB0_1:
+; CHECK-NOT: v_cmp
+; CHECK_NOT: v_cndmask
+; CHECK: s_and_saveexec_b64 s[{{[[0-9]+:[0-9]+}}], [[COND]]
+; CHECK: BB0_2:
+
+define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp5 = icmp ult i32 %tmp, %arg3
+  br label %bb1
+
+bb1:                                              ; preds = %bb3, %bb
+  %tmp7 = phi i32 [ %arg4, %bb ], [ %tmp16, %bb3 ]
+  %tmp8 = phi float [ 0.000000e+00, %bb ], [ %tmp15, %bb3 ]
+  br i1 %tmp5, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  %tmp10 = zext i32 %tmp7 to i64
+  %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
+  %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+  br label %bb3
+
+bb3:                                             ; preds = %bb2, %bb1
+  %tmp14 = phi float [ %tmp12, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %tmp15 = fadd float %tmp8, %tmp14
+  %tmp16 = add i32 %tmp7, -1
+  %tmp17 = icmp eq i32 %tmp16, 0
+  br i1 %tmp17, label %bb4, label %bb1
+
+bb4:                                             ; preds = %bb3
+  store float %tmp15, float addrspace(1)* %arg, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }