[AMDGPU] Force qsad instructions to use a destination register different from their source registers

author Mark Searles <m.c.searles@gmail.com>

Thu, 8 Jun 2017 18:21:19 +0000 (18:21 +0000)

committer Mark Searles <m.c.searles@gmail.com>

Thu, 8 Jun 2017 18:21:19 +0000 (18:21 +0000)
author Mark Searles <m.c.searles@gmail.com>
Thu, 8 Jun 2017 18:21:19 +0000 (18:21 +0000)
committer Mark Searles <m.c.searles@gmail.com>
Thu, 8 Jun 2017 18:21:19 +0000 (18:21 +0000)
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td

index 77fc955..900adb8 100644 (file)
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -209,7 +209,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
  }
  
  def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+
+let Constraints = "@earlyclobber $vdst" in {
  def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+} // End Constraints = "@earlyclobber $vdst"
  
  def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
    let SchedRW = [WriteDouble];
@@ -232,8 +235,10 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
  
  let SubtargetPredicate = isCIVI in {
  
+let Constraints = "@earlyclobber $vdst" in {
  def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
  def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+} // End Constraints = "@earlyclobber $vdst"
  
  let isCommutable = 1 in {
  def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll

index 3a2b87c..31653f9 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -4,19 +4,29 @@
  declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
  
  ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v5, v1
+; GCN-DAG: v_mov_b32_e32 v4, v0
  define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
-  %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
+  %tmp1 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
+  %tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
+  store i64 %tmp2, i64 addrspace(1)* %out, align 4
    ret void
  }
  
  ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
-; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
+; GCN-DAG: v_mov_b32_e32 v3, v1
+; GCN-DAG: v_mov_b32_e32 v2, v0                
  define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
-  %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
+  %tmp3 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
+  %tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
+  store i64 %tmp4, i64 addrspace(1)* %out, align 4
    ret void
  }
  
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll

index a8d03bf..1cd9dfc 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -3,46 +3,57 @@
  
  declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0
  
-; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3                
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
    ret void
  }
  
  ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3                
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
  define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %b) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
    ret void
  }
  
  ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3                
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
  define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
    ret void
  }
  
  ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3                
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
  define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
    %in = load <4 x i32>, <4 x i32> addrspace(1) * %input
-
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %in) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
    ret void
  }
  
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll

index be71225..7daca42 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -4,18 +4,28 @@
  declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
  
  ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v5, v1
+; GCN-DAG: v_mov_b32_e32 v4, v0
  define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
-  %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
+  %tmp1 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
+  %tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
+  store i64 %tmp2, i64 addrspace(1)* %out, align 4
    ret void
  }
  
  ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
-; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
+; GCN-DAG: v_mov_b32_e32 v3, v1
+; GCN-DAG: v_mov_b32_e32 v2, v0
  define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
-  %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
+  %tmp3 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
+  %tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
+  store i64 %tmp4, i64 addrspace(1)* %out, align 4
    ret void
  }
author	Mark Searles <m.c.searles@gmail.com>
	Thu, 8 Jun 2017 18:21:19 +0000 (18:21 +0000)
committer	Mark Searles <m.c.searles@gmail.com>
	Thu, 8 Jun 2017 18:21:19 +0000 (18:21 +0000)
lib/Target/AMDGPU/VOP3Instructions.td		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll		patch \| blob \| history