From: Matt Arsenault Date: Tue, 22 May 2018 20:42:00 +0000 (+0000) Subject: AMDGPU: Fix missing test coverage for some 16-bit and packed ops X-Git-Tag: android-x86-7.1-r4~707 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=46d48331d637a5dee400f63021c8e4d385314d58;p=android-x86%2Fexternal-llvm.git AMDGPU: Fix missing test coverage for some 16-bit and packed ops git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@333024 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll index fb8aeb80e15..e1d33b639f6 100644 --- a/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/test/CodeGen/AMDGPU/fmul.f16.ll @@ -1,5 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}fmul_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -8,7 +9,7 @@ ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] +; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm 
define amdgpu_kernel void @fmul_f16( @@ -28,7 +29,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] +; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_f16_imm_a( @@ -47,7 +48,7 @@ entry: ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] +; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_f16_imm_b( @@ -81,6 +82,8 @@ entry: ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] +; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16( @@ -104,11 +107,18 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + + ; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 +; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] + ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; 
GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_a( @@ -130,11 +140,17 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + ; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 +; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] + ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_b( @@ -146,3 +162,59 @@ entry: store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +; GCN-LABEL: {{^}}fmul_v4f16: +; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} + +; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] +; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}} + +; VI: v_mul_f16_sdwa +; VI: v_mul_f16_e32 +; VI: v_mul_f16_sdwa +; VI: v_mul_f16_e32 +; VI: v_or_b32 +; VI: v_or_b32 +define amdgpu_kernel void @fmul_v4f16( + <4 x half> addrspace(1)* %r, + <4 x half> addrspace(1)* %a, + <4 x half> addrspace(1)* %b) { +entry: + %a.val = load <4 x half>, <4 x half> addrspace(1)* %a + %b.val = load <4 x half>, <4 x half> addrspace(1)* %b + %r.val = fmul <4 x half> %a.val, %b.val + store <4 x half> %r.val, <4 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fmul_v4f16_imm_a: +; 
GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 +; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 + +; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], [[K0]] +; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], [[K1]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}} + +; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 + +; VI-DAG: v_mul_f16_sdwa v[[MUL_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mul_f16_e32 v[[MUL_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] +; VI-DAG: v_add_f16_sdwa v[[MUL_LO_HI:[0-9]+]], v[[A_LO]], v[[A_LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_e32 v[[MUL_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] + +; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MUL_LO_LO]], v[[MUL_LO_HI]] +; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MUL_HI_LO]], v[[MUL_HI_HI]] + +; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}} +define amdgpu_kernel void @fmul_v4f16_imm_a( + <4 x half> addrspace(1)* %r, + <4 x half> addrspace(1)* %b) { +entry: + %b.val = load <4 x half>, <4 x half> addrspace(1)* %b + %r.val = fmul <4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, %b.val + store <4 x half> %r.val, <4 x half> addrspace(1)* %r + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index aa085a71aff..c57b545dc6d 100644 --- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,8 +1,11 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) +declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b) +declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) ; GCN-LABEL: {{^}}maxnum_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -11,7 +14,7 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] +; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16( @@ -31,7 +34,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] +; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16_imm_a( @@ -49,7 +52,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] +; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: 
buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16_imm_b( @@ -84,7 +87,9 @@ entry: ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + +; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -114,8 +119,13 @@ entry: ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI-NOT: and +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 +; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( @@ -143,8 +153,14 @@ entry: ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + + +; SIVI-NOT: and +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 +; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_b( @@ -156,3 +172,68 @@ entry: store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +; FIXME: Scalarize with undef half +; GCN-LABEL: {{^}}maxnum_v3f16: +; GFX9: v_pk_max_f16 +; GFX9: v_pk_max_f16 +define amdgpu_kernel void @maxnum_v3f16( + <3 x half> 
addrspace(1)* %r, + <3 x half> addrspace(1)* %a, + <3 x half> addrspace(1)* %b) { +entry: + %a.val = load <3 x half>, <3 x half> addrspace(1)* %a + %b.val = load <3 x half>, <3 x half> addrspace(1)* %b + %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val) + store <3 x half> %r.val, <3 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}maxnum_v4f16: +; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] +; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}} +define amdgpu_kernel void @maxnum_v4f16( + <4 x half> addrspace(1)* %r, + <4 x half> addrspace(1)* %a, + <4 x half> addrspace(1)* %b) { +entry: + %a.val = load <4 x half>, <4 x half> addrspace(1)* %a + %b.val = load <4 x half>, <4 x half> addrspace(1)* %b + %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val) + store <4 x half> %r.val, <4 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fmax_v4f16_imm_a: +; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 +; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 + +; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]] +; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}} + +; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000 +; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 + +; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] +; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] + +; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]] +; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]] + +; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}} +define amdgpu_kernel void @fmax_v4f16_imm_a( + <4 x half> addrspace(1)* %r, + <4 x half> addrspace(1)* %b) { +entry: + %b.val = load <4 x half>, <4 x half> addrspace(1)* %b + %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val) + store <4 x half> %r.val, <4 x half> addrspace(1)* %r + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 5ab03aafe1f..8c81fdb118b 100644 --- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,8 +1,11 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) +declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b) +declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) ; 
GCN-LABEL: {{^}}minnum_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -11,7 +14,7 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] +; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_f16( @@ -31,7 +34,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] +; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_f16_imm_a( @@ -49,7 +52,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] +; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_f16_imm_b( @@ -85,6 +88,8 @@ entry: ; VI-NOT: and ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16( @@ -115,10 +120,13 @@ entry: ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI-NOT: and +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + +; GFX9: 
s_mov_b32 [[K:s[0-9]+]], 0x44004200 +; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] + ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { @@ -142,9 +150,13 @@ entry: ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 +; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] + ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI-NOT: and +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_b( @@ -156,3 +168,68 @@ entry: store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +; FIXME: Scalarize with undef half +; GCN-LABEL: {{^}}minnum_v3f16: +; GFX9: v_pk_min_f16 +; GFX9: v_pk_min_f16 +define amdgpu_kernel void @minnum_v3f16( + <3 x half> addrspace(1)* %r, + <3 x half> addrspace(1)* %a, + <3 x half> addrspace(1)* %b) { +entry: + %a.val = load <3 x half>, <3 x half> addrspace(1)* %a + %b.val = load <3 x half>, <3 x half> addrspace(1)* %b + %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val) + store <3 x half> %r.val, <3 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}minnum_v4f16: +; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] +; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}} +define amdgpu_kernel void 
@minnum_v4f16( + <4 x half> addrspace(1)* %r, + <4 x half> addrspace(1)* %a, + <4 x half> addrspace(1)* %b) { +entry: + %a.val = load <4 x half>, <4 x half> addrspace(1)* %a + %b.val = load <4 x half>, <4 x half> addrspace(1)* %b + %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val) + store <4 x half> %r.val, <4 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fmin_v4f16_imm_a: +; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 +; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 + +; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]] +; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}} + +; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000 +; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 + +; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] +; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] + +; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]] +; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]] + +; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}} +define amdgpu_kernel void @fmin_v4f16_imm_a( + <4 x half> addrspace(1)* %r, + <4 x half> addrspace(1)* %b) { +entry: + %b.val = load <4 x half>, <4 x half> addrspace(1)* %b + %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val) + store <4 x half> %r.val, <4 x half> addrspace(1)* %r + ret void +} diff --git a/test/CodeGen/AMDGPU/mul.i16.ll b/test/CodeGen/AMDGPU/mul.i16.ll new file mode 100644 index 00000000000..4196c1d8336 --- /dev/null +++ 
b/test/CodeGen/AMDGPU/mul.i16.ll @@ -0,0 +1,103 @@ +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s + +; GCN-LABEL: {{^}}v_mul_i16: +; SI: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]] +; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]] +; SI: v_mul_u32_u24 + +; GFX89: v_mul_lo_u16_e32 v0, v0, v1 +define i16 @v_mul_i16(i16 %a, i16 %b) { + %r.val = mul i16 %a, %b + ret i16 %r.val +} + +; FIXME: Should emit scalar mul or maybe i16 v_mul here +; GCN-LABEL: {{^}}s_mul_i16: +; GCN: v_mul_u32_u24 +define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) { + %r.val = mul i16 %a, %b + store volatile i16 %r.val, i16 addrspace(1)* null + ret void +} + +; FIXME: Should emit u16 mul here. 
Instead it's worse than SI +; GCN-LABEL: {{^}}v_mul_i16_uniform_load: +; SI: v_mul_u32_u24 +; GFX89: v_mul_lo_i32 +define amdgpu_kernel void @v_mul_i16_uniform_load( + i16 addrspace(1)* %r, + i16 addrspace(1)* %a, + i16 addrspace(1)* %b) { +entry: + %a.val = load i16, i16 addrspace(1)* %a + %b.val = load i16, i16 addrspace(1)* %b + %r.val = mul i16 %a.val, %b.val + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}v_mul_v2i16: +; SI: v_mul_lo_i32 +; SI: v_mul_lo_i32 + +; VI: v_mul_lo_u16_sdwa +; VI: v_mul_lo_u16_e32 +; VI: v_or_b32_e32 + + +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) { + %r.val = mul <2 x i16> %a, %b + ret <2 x i16> %r.val +} + +; FIXME: Unpack garbage on gfx9 +; GCN-LABEL: {{^}}v_mul_v3i16: +; SI: v_mul_lo_i32 +; SI: v_mul_lo_i32 +; SI: v_mul_lo_i32 + +; VI: v_mul_lo_u16 +; VI: v_mul_lo_u16 +; VI: v_mul_lo_u16 + +; GFX9: v_and_b32 +; GFX9: v_and_b32 +; GFX9: v_lshl_or_b32 +; GFX9: v_lshl_or_b32 +; GFX9: v_lshl_or_b32 + +; GFX9: v_pk_mul_lo_u16 +; GFX9: v_pk_mul_lo_u16 +; GFX9: s_setpc_b64 +define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) { + %r.val = mul <3 x i16> %a, %b + ret <3 x i16> %r.val +} + +; GCN-LABEL: {{^}}v_mul_v4i16: +; SI: v_mul_lo_i32 +; SI: v_mul_lo_i32 +; SI: v_mul_lo_i32 +; SI: v_mul_lo_i32 + +; VI: v_mul_lo_u16_sdwa +; VI: v_mul_lo_u16_e32 +; VI: v_mul_lo_u16_sdwa +; VI: v_mul_lo_u16_e32 +; VI: v_or_b32_e32 +; VI: v_or_b32_e32 + +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 +define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) { + %r.val = mul <4 x i16> %a, %b + ret <4 x i16> %r.val +} diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll index eb02084d8eb..c6a3cce8672 100644 --- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -158,6 +158,8 @@ 
define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a } ; GCN-LABEL: {{^}}s_min_max_v2i16: +; GFX9: v_pk_max_i16 +; GFX9: v_pk_min_i16 define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 { %cond0 = icmp sgt <2 x i16> %val0, %val1 %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 @@ -169,6 +171,8 @@ define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i } ; GCN-LABEL: {{^}}v_min_max_v2i16: +; GFX9: v_pk_max_i16 +; GFX9: v_pk_min_i16 define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 @@ -182,8 +186,12 @@ define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i ret void } -; GCN-LABEL: {{^}}s_min_max_v4i32: -define amdgpu_kernel void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { +; GCN-LABEL: {{^}}s_min_max_v4i16: +; GFX9: v_pk_max_i16 +; GFX9: v_pk_max_i16 +; GFX9: v_pk_min_i16 +; GFX9: v_pk_min_i16 +define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { %cond0 = icmp sgt <4 x i16> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1 %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0