AMDGPU: Fix DAG divergence for implicit function arguments

author Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 13 May 2020 17:24:32 +0000 (13:24 -0400)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 19 May 2020 22:11:34 +0000 (18:11 -0400)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 2eb5884..1b8ca7b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11026,30 +11026,19 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
      case ISD::CopyFromReg:
      {
        const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
-      const MachineFunction * MF = FLI->MF;
-      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-      const MachineRegisterInfo &MRI = MF->getRegInfo();
-      const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+      const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
+      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
        Register Reg = R->getReg();
-      if (Reg.isPhysical())
-        return !TRI.isSGPRReg(MRI, Reg);
-
-      if (MRI.isLiveIn(Reg)) {
-        // workitem.id.x workitem.id.y workitem.id.z
-        // Any VGPR formal argument is also considered divergent
-        if (!TRI.isSGPRReg(MRI, Reg))
-          return true;
-        // Formal arguments of non-entry functions
-        // are conservatively considered divergent
-        else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
-          return true;
-        return false;
-      }
-      const Value *V = FLI->getValueFromVirtualReg(Reg);
-      if (V)
+
+      // FIXME: Why does this need to consider isLiveIn?
+      if (Reg.isPhysical() || MRI.isLiveIn(Reg))
+        return !TRI->isSGPRReg(MRI, Reg);
+
+      if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
          return KDA->isDivergent(V);
+
        assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
-      return !TRI.isSGPRReg(MRI, Reg);
+      return !TRI->isSGPRReg(MRI, Reg);
      }
      break;
      case ISD::LOAD: {
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll

index 46d02d3..764f935 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -40,6 +40,31 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
    ret void
  }
  
+; Test handling inside a non-kernel
+; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
+
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
+; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
+; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
+  store volatile i32 7, i32* %stof
+  ret void
+}
+
  ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
  ; HSA: enable_sgpr_private_segment_buffer = 1
  ; HSA: enable_sgpr_dispatch_ptr = 0
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll

index c1aca8f..e0e7636 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -2,9 +2,7 @@
  ; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
  
  ; GCN-LABEL: {{^}}use_dispatch_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
  define hidden void @use_dispatch_ptr() #1 {
    %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
    %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
@@ -13,9 +11,7 @@ define hidden void @use_dispatch_ptr() #1 {
  }
  
  ; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
  define hidden void @use_queue_ptr() #1 {
    %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
    %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
@@ -34,9 +30,7 @@ define hidden void @use_kernarg_segment_ptr() #1 {
  }
  
  ; GCN-LABEL: {{^}}use_implicitarg_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[8:9]
  define hidden void @use_implicitarg_ptr() #1 {
    %implicit.arg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
    %header_ptr = bitcast i8 addrspace(4)* %implicit.arg.ptr to i32 addrspace(4)*
@@ -198,15 +192,9 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
  
  ; GCN-LABEL: {{^}}use_every_sgpr_input:
  ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
+; GCN: s_load_dword s{{[0-9]+}}, s[8:9]
  ; GCN: ; use s[10:11]
  ; GCN: ; use s12
  ; GCN: ; use s13
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll

index 4991b72..92cf860 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -2,9 +2,7 @@
  ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
  
  ; GCN-LABEL: {{^}}use_dispatch_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
  define hidden void @use_dispatch_ptr() #1 {
    %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
    %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
@@ -23,9 +21,7 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
  }
  
  ; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
  define hidden void @use_queue_ptr() #1 {
    %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
    %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
@@ -44,10 +40,10 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
  }
  
  ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
-; CIVI: flat_load_dword v[[HI:[0-9]+]], v[0:1]
+; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x10
  ; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
  ; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
-; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
  ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
  ; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}
  define hidden void @use_queue_ptr_addrspacecast() #1 {
@@ -401,15 +397,10 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
  
  ; GCN-LABEL: {{^}}use_every_sgpr_input:
  ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
+; GCN: s_load_dword s{{[0-9]+}}, s[8:9]
+
  ; GCN: ; use s[10:11]
  ; GCN: ; use s12
  ; GCN: ; use s13
@@ -551,15 +542,9 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
  ; GCN: s_swappc_b64
  
  ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
-; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
-; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
-; GCN-DAG: v_mov_b32_e32 v[[LO2:[0-9]+]], s[[LO_Y]]
-; GCN-DAG: v_mov_b32_e32 v[[HI2:[0-9]+]], s[[HI_Y]]
-; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO2]]:[[HI2]]{{\]}}
-; GCN-DAG: v_mov_b32_e32 v[[LO3:[0-9]+]], s[[LO_Z]]
-; GCN-DAG: v_mov_b32_e32 v[[HI3:[0-9]+]], s[[HI_Z]]
-; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO3]]:[[HI3]]{{\]}}
+; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_X]]:[[HI_X]]{{\]}}, 0x0
+; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_Y]]:[[HI_Y]]{{\]}}, 0x0
+; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_Z]]:[[HI_Z]]{{\]}}, 0x0
  ; GCN: ; use
  ; GCN: ; use [[SAVE_X]]
  ; GCN: ; use [[SAVE_Y]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll

index 851c3aa..c15963f 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -59,12 +59,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  
  ; GCN-LABEL: {{^}}func_implicitarg_ptr:
  ; GCN: s_waitcnt
-; MESA: v_mov_b32_e32 v0, s4
-; MESA: v_mov_b32_e32 v1, s5
-; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: v_mov_b32_e32 v0, s4
-; HSA: v_mov_b32_e32 v1, s5
-; HSA: flat_load_dword v0, v[0:1]
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @func_implicitarg_ptr() #0 {
@@ -76,12 +71,7 @@ define void @func_implicitarg_ptr() #0 {
  
  ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
  ; GCN: s_waitcnt
-; MESA: v_mov_b32_e32 v0, s4
-; MESA: v_mov_b32_e32 v1, s5
-; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: v_mov_b32_e32 v0, s4
-; HSA: v_mov_b32_e32 v1, s5
-; HSA: flat_load_dword v0, v[0:1]
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @opencl_func_implicitarg_ptr() #0 {
@@ -165,16 +155,10 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 {
  
  ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
  ; GCN: s_waitcnt
-; GCN-DAG: v_mov_b32_e32 v0, s4
-; GCN-DAG: v_mov_b32_e32 v1, s5
  ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
  ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
-
-; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-
-; HSA: flat_load_dword v0, v[0:1]
-
-; GCN: s_waitcnt vmcnt(0)
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; GCN: s_waitcnt lgkmcnt(0)
  define void @func_kernarg_implicitarg_ptr() #0 {
    %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
@@ -187,15 +171,10 @@ define void @func_kernarg_implicitarg_ptr() #0 {
  
  ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
  ; GCN: s_waitcnt
-; GCN-DAG: v_mov_b32_e32 v0, s4
-; GCN-DAG: v_mov_b32_e32 v1, s5
  ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
  ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
-
-; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: flat_load_dword v0, v[0:1]
-
-; GCN: s_waitcnt vmcnt(0)
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; GCN: s_waitcnt lgkmcnt(0)
  define void @opencl_func_kernarg_implicitarg_ptr() #0 {
    %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 13 May 2020 17:24:32 +0000 (13:24 -0400)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 19 May 2020 22:11:34 +0000 (18:11 -0400)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/addrspacecast.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll		patch | blob | history