From: Mark Searles
Date: Thu, 8 Jun 2017 18:21:19 +0000 (+0000)
Subject: [AMDGPU] Force qsads instrs to use different dest register than source registers
X-Git-Tag: android-x86-7.1-r4~15220
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=c5293f3fa4db8d781336ab072f3511bb6350ca69;p=android-x86%2Fexternal-llvm.git

[AMDGPU] Force qsads instrs to use different dest register than source registers

The V_MQSAD_PK_U16_U8, V_QSAD_PK_U16_U8, and V_MQSAD_U32_U8 instructions
take more than one pass in hardware. For these three instructions, the
destination register must therefore be different from all source
registers, so that the first pass does not overwrite sources needed by
the following passes.

Differential Revision: https://reviews.llvm.org/D33783

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304998 91177308-0d34-0410-b5e6-96231b3b80d8
---
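The mechanism, for readers new to it: the "@earlyclobber $vdst" constraint
marks the destination operand as early-clobber, i.e. written before the
source operands have all been read, so the register allocator will never
assign $vdst a register that overlaps any source. A minimal sketch of the
idiom with a made-up opcode (V_EXAMPLE_MULTIPASS is hypothetical; the real
defs are in the hunks below):

    // Hypothetical multi-pass VOP3 instruction; the "let" wrapper is the
    // idiom this patch applies. With the constraint in place, the
    // allocator must pick a $vdst tuple disjoint from every source.
    let Constraints = "@earlyclobber $vdst" in {
    def V_EXAMPLE_MULTIPASS : VOP3Inst <"v_example_multipass",
                                        VOP3_Profile<VOP_I64_I64_I32_I64>>;
    } // End Constraints = "@earlyclobber $vdst"

The updated tests pin the intrinsic's operands to fixed VGPR tuples with
inline asm and then force the result back into those same registers, so
the CHECK lines can require both a destination disjoint from the sources
(e.g. v[0:1] vs. v[4:5]) and the v_mov_b32 copies that the constraint
makes unavoidable.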
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 77fc9551cff..900adb8b536 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -209,7 +209,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+
+let Constraints = "@earlyclobber $vdst" in {
 def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+} // End Constraints = "@earlyclobber $vdst"
 
 def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
   let SchedRW = [WriteDouble];
@@ -232,8 +235,10 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
 
 let SubtargetPredicate = isCIVI in {
 
+let Constraints = "@earlyclobber $vdst" in {
 def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
 def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+} // End Constraints = "@earlyclobber $vdst"
 
 let isCommutable = 1 in {
 def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 3a2b87cd87f..31653f93fc3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -4,19 +4,29 @@
 declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v5, v1
+; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
-  %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
+  %tmp1 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
+  %tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
+  store i64 %tmp2, i64 addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
-; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
+; GCN-DAG: v_mov_b32_e32 v3, v1
+; GCN-DAG: v_mov_b32_e32 v2, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
-  %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
+  %tmp3 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
+  %tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
+  store i64 %tmp4, i64 addrspace(1)* %out, align 4
   ret void
 }
 
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
index a8d03bf6bba..1cd9dfcf9b7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -3,46 +3,57 @@
 declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0
 
-; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> ) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> ) #0
+  %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
 define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %b) #0
+  %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
 define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> ) #0
+  %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
 define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
   %in = load <4 x i32>, <4 x i32> addrspace(1) * %input
-
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %in) #0
+  %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index be71225c5e0..7daca42d133 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -4,18 +4,28 @@
 declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v5, v1
+; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
-  %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
+  %tmp1 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
+  %tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
+  store i64 %tmp2, i64 addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
-; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
+; GCN-DAG: v_mov_b32_e32 v3, v1
+; GCN-DAG: v_mov_b32_e32 v2, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
-  %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
+  %tmp3 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
+  %tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
+  store i64 %tmp4, i64 addrspace(1)* %out, align 4
   ret void
 }