From 0537f4f92ef7204376bd65bb7c32657832c5e4aa Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 May 2019 11:27:28 +0000 Subject: [PATCH] [AMDGPU] Regenerate vector sub tests git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@361485 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AMDGPU/sub.v2i16.ll | 656 +++++++++++++++++++++++++++++++-------- 1 file changed, 525 insertions(+), 131 deletions(-) diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index 442193f7229..c7a08a66511 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,16 +1,56 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_sub_v2i16: -; GFX89: {{flat|global}}_load_dword -; GFX89: {{flat|global}}_load_dword - -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +; GFX9-LABEL: v_test_sub_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_u16_e32 v2, v0, v1 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -22,15 +62,46 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ret void } -; GCN-LABEL: {{^}}s_test_sub_v2i16: -; GFX9: s_load_dword [[VAL0:s[0-9]+]] -; GFX9: s_load_dword [[VAL1:s[0-9]+]] -; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]] -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]] - -; VI: s_sub_i32 -; VI: s_sub_i32 define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 { +; GFX9-LABEL: s_test_sub_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: s_test_sub_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s6, s[6:7], 0x0 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: s_sub_i32 s4, s4, s5 +; VI-NEXT: s_sub_i32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s6, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = sub <2 x i16> %a, %b @@ -38,10 +109,16 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ret void } -; GCN-LABEL: {{^}}s_test_sub_self_v2i16: -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]] -; GCN: buffer_store_dword [[ZERO]] define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 { +; GCN-LABEL: s_test_sub_self_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = sub <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -49,33 +126,83 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, < } ; FIXME: VI should not scalarize arg access. -; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg: -; GCN: s_load_dword s -; GCN: s_load_dword s - -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} - -; VI: s_sub_i32 -; VI: s_sub_i32 -; VI: s_lshl_b32 -; VI: s_and_b32 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { +; GFX9-LABEL: s_test_sub_v2i16_kernarg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: s_test_sub_v2i16_kernarg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: s_sub_i32 s1, s1, s3 +; VI-NEXT: s_sub_i32 s0, s2, s0 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}v_test_sub_v2i16_constant: -; GFX89-DAG: {{flat|global}}_load_dword - -; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] - -; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38 -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}} -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_or_b32 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +; GFX9-LABEL: v_test_sub_v2i16_constant: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_constant: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -86,14 +213,46 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou } ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant: -; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}} -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] - -; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +; GFX9-LABEL: v_test_sub_v2i16_neg_constant: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_neg_constant: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x3df +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -103,15 +262,45 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* ret void } -; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1: -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}} - -; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]] -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]] -; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_inline_neg1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u16_e32 v1, 1, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -121,14 +310,44 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* ret void } -; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi: -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}} - -; VI: flat_load_dword [[LOAD:v[0-9]+]] -; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]] -; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]] -; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]] define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -139,17 +358,45 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac } ; The high element gives fp -; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0 -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] - -; VI-NOT: v_subrev_i16 -; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080 -; VI: flat_load_dword -; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NOT: v_subrev_i16 -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s8, 1.0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_inline_fp_split: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -160,22 +407,55 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace( } ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32: -; GFX9: global_load_dword [[A:v[0-9]+]] -; GFX9: global_load_dword [[B:v[0-9]+]] - -; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] -; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] -; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} - -; VI: flat_load_dword v[[A:[0-9]+]] -; VI: flat_load_dword v[[B:[0-9]+]] - -; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]] -; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_u16_e32 v0, v1, v2 +; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -189,21 +469,59 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) } ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64: -; GFX9: global_load_dword [[A:v[0-9]+]] -; GFX9: global_load_dword [[B:v[0-9]+]] - -; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] -; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] -; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] -; GFX9: buffer_store_dwordx4 - -; VI: flat_load_dword [[A:v[0-9]+]] -; VI: flat_load_dword [[B:v[0-9]+]] -; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]] -; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: buffer_store_dwordx4 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v4, v[4:5] +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_u16_e32 v0, v2, v4 +; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -217,22 +535,57 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) } ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32: -; GFX9: global_load_dword [[A:v[0-9]+]] -; GFX9: global_load_dword [[B:v[0-9]+]] - -; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] -; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16 -; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} - -; VI: flat_load_dword -; VI: flat_load_dword -; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI: buffer_store_dwordx2 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v0, v1 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -246,21 +599,62 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) } ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64: -; GCN: {{flat|global}}_load_dword -; GCN: {{flat|global}}_load_dword - -; GFX9: v_pk_sub_i16 -; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} - -; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_sub_u16_e32 - -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 -; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v1, v0, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_u16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v0, v2 +; VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid -- 2.11.0