From 5df1ac7846c8ec4004c91d0835e9883f7a0636a4 Mon Sep 17 00:00:00 2001
From: alex-t
Date: Fri, 31 Jan 2020 20:49:00 +0300
Subject: [PATCH] [AMDGPU] fixed divergence driven shift operations selection

Differential Revision: https://reviews.llvm.org/D73483

Reviewers: rampitec
---
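For reviewers: the point of the change is that a shift is selected to an
SALU opcode only when the DAG proves the operation uniform; divergent
shifts go straight to the VALU forms (the non-REV V_LSHL_B32, V_LSHR_B32,
V_ASHR_I32 and the VOP3 64-bit shifts on GFX6/GFX7, the *REV variants on
later targets). The SOPInstructions.td patterns below express this by
wrapping each shift in UniformBinFrag. As a minimal sketch, the
divergence-gated PatFrags are defined along these lines (paraphrased from
AMDGPUInstructions.td at this revision; see the tree for the
authoritative definitions):

    // Matches a binary op only when the DAG reports the node uniform:
    // every lane computes the same value, so a scalar instruction is safe.
    class UniformBinFrag<SDPatternOperator Op> : PatFrag <
      (ops node:$src0, node:$src1),
      (Op $src0, $src1),
      [{ return !N->isDivergent(); }]>;

    // The dual wrapper used by VALU patterns: matches only divergent nodes.
    class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
      (ops node:$src0, node:$src1),
      (Op $src0, $src1),
      [{ return N->isDivergent(); }]>;

The new shift-select.ll test below exercises both sides of the split:
uniform shifts must select S_LSHL_B32/S_LSHR_B32/S_ASHR_I32 (and their
64-bit forms), divergent shifts the corresponding VALU instructions.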
 llvm/lib/Target/AMDGPU/SOPInstructions.td          |  12 +-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td         |  13 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td         |   4 +-
 .../CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir |  12 +-
 .../CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir |  12 +-
 .../CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir  |  12 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll    |  26 ++--
 llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll             |   4 +-
 llvm/test/CodeGen/AMDGPU/bfe-patterns.ll           |  14 ++-
 llvm/test/CodeGen/AMDGPU/commute-shifts.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/extract-lowbits.ll        |   4 +-
 llvm/test/CodeGen/AMDGPU/inline-asm.ll             |   7 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll             |  14 ++-
 llvm/test/CodeGen/AMDGPU/sext-in-reg.ll            |   2 +-
 llvm/test/CodeGen/AMDGPU/shift-select.ll           | 134 +++++++++++++++
 llvm/test/CodeGen/AMDGPU/shl.ll                    |  14 +--
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll              |  12 +-
 17 files changed, 222 insertions(+), 76 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/shift-select.ll

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e1ce7bf7303..93e6458f445 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -541,22 +541,22 @@ let AddedComplexity = 1 in {
 let Defs = [SCC] in {
 // TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
-  [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
-  [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
-  [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
-  [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
-  [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
-  [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
 >;
 } // End Defs = [SCC]
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index aaadc3dbc72..e0e2b716c5e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -541,14 +541,17 @@ defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmi
 defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
 } // End SubtargetPredicate = isGFX6GFX7
 
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
 let isCommutable = 1 in {
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
 defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
-} // End isCommutable = 1
 } // End SubtargetPredicate = isGFX6GFX7GFX10
+let SubtargetPredicate = isGFX6GFX7 in {
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+} // End SubtargetPredicate = isGFX6GFX7
+} // End isCommutable = 1
+
 class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : GCNPat<
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 0b12a9b5e98..717b5bde361 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -385,10 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
 
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let SubtargetPredicate = isGFX6GFX7 in {
 def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
 def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
 def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
+} // End SubtargetPredicate = isGFX6GFX7
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
 def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
 } // End SubtargetPredicate = isGFX6GFX7GFX10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir
index e3b0d4d1879..eeefefbcf75 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir
@@ -237,8 +237,8 @@ body: |
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
+    ; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s64) = G_ASHR %0, %1
@@ -277,8 +277,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
+    ; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:sgpr(s32) = COPY $sgpr0
     %2:vgpr(s64) = G_ASHR %0, %1
@@ -317,8 +317,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
+    ; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_ASHR %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir
index 1923c824dc1..03e9720a50e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir
@@ -237,8 +237,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
+    ; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s64) = G_LSHR %0, %1
@@ -277,8 +277,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
+    ; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:sgpr(s32) = COPY $sgpr0
     %2:vgpr(s64) = G_LSHR %0, %1
@@ -317,8 +317,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
+    ; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_LSHR %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir
index c60c0e40744..825341ee895 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir
@@ -237,8 +237,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
+    ; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s64) = G_SHL %0, %1
@@ -277,8 +277,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
+    ; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:sgpr(s32) = COPY $sgpr0
     %2:vgpr(s64) = G_SHL %0, %1
@@ -317,8 +317,8 @@ body: |
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
+    ; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_SHL %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 614fd46e700..78590eac828 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -2,9 +2,6 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
-; XFAIL: *
-; FIXME: Merge with DAG test
-
 define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
 ; GFX8-LABEL: dpp_test:
 ; GFX8: ; %bb.0:
@@ -19,6 +16,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 ; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
+;
 ; GFX10-LABEL: dpp_test:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -43,9 +41,10 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
@@ -55,21 +54,20 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT: s_endpgm
+;
 ; GFX10-LABEL: update_dpp64_test:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mul_lo_u32 v2, v0, 0
-; GFX10-NEXT: v_mul_hi_u32 v3, v0, 8
-; GFX10-NEXT: v_mul_lo_u32 v0, v0, 8
-; GFX10-NEXT: v_mul_lo_u32 v1, v1, 8
+; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, s0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
 ; GFX10-NEXT: v_mov_b32_e32 v5, s3
 ; GFX10-NEXT: v_mov_b32_e32 v4, s2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index 5af42210178..5ac3daf6577 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -42,8 +42,8 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
 ; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 4d47bda8edd..69237cfabb8 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -24,8 +24,11 @@ define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
+; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
+
+; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
@@ -97,8 +100,11 @@ define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
+; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
+
+; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 1f0643e856a..22f0750706a 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -17,7 +17,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; SI-NEXT: v_and_b32_e32 v0, 7, v0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; SI-NEXT: v_lshr_b32_e32 v0, v2, v0
 ; SI-NEXT: v_and_b32_e32 v0, 1, v0
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 243ec5346f7..b8c5ae09b77 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -169,8 +169,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_d1_indexzext:
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index cb06eb043e1..adf786c0e92 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=PRE-GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s
 
 ; CHECK-LABEL: {{^}}inline_asm:
 ; CHECK: s_endpgm
@@ -241,7 +241,8 @@ entry:
 ; CHECK: ; def v0
 ; CHECK: v_mov_b32_e32 v1, v0
 ; CHECK: ; def v0
-; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
+; PRE-GFX8: v_lshl_b32_e32 v{{[0-9]+}}, v1, v0
+; GFX8: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
 define amdgpu_kernel void @muliple_def_phys_vgpr() {
 entry:
   %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a50a04a4cd0..844a200de73 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
@@ -123,8 +125,8 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
-; CI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
-; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; CI-NEXT: v_lshr_b32_e32 v2, v2, v3
+; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -490,10 +492,10 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; CI-NEXT: v_and_b32_e32 v4, s8, v4
 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
 ; CI-NEXT: v_and_b32_e32 v5, s8, v5
-; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v3
-; CI-NEXT: v_lshrrev_b32_e32 v5, v9, v7
-; CI-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; CI-NEXT: v_lshrrev_b32_e32 v4, v8, v6
+; CI-NEXT: v_lshr_b32_e32 v3, v3, v5
+; CI-NEXT: v_lshr_b32_e32 v5, v7, v9
+; CI-NEXT: v_lshr_b32_e32 v2, v2, v4
+; CI-NEXT: v_lshr_b32_e32 v4, v6, v8
 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT: v_or_b32_e32 v3, v3, v5
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 40d4aa89763..996d81dfc3e 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -577,7 +577,7 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addr
 ; GCN: {{buffer|flat|global}}_load_ushort [[VAL0:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_ushort [[VAL1:v[0-9]+]]
 
-; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+; SI: v_lshl_b32_e32 [[REG:v[0-9]+]], [[VAL0]], [[VAL1]]
 ; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/shift-select.ll b/llvm/test/CodeGen/AMDGPU/shift-select.ll
new file mode 100644
index 00000000000..d825d148934
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shift-select.ll
@@ -0,0 +1,134 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8-10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8-10 %s
+
+; GCN-LABEL: name: s_shl_i32
+; GCN: S_LSHL_B32
+define amdgpu_kernel void @s_shl_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) {
+  %result = shl i32 %lhs, %rhs
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: v_shl_i32
+; GFX6: V_LSHL_B32_e32
+; GFX8-10: V_LSHLREV_B32_e32
+define amdgpu_kernel void @v_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = shl i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: s_lshr_i32
+; GCN: S_LSHR_B32
+define amdgpu_kernel void @s_lshr_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) {
+  %result = lshr i32 %lhs, %rhs
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: v_lshr_i32
+; GFX6: V_LSHR_B32_e32
+; GFX8-10: V_LSHRREV_B32_e64
+define amdgpu_kernel void @v_lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = lshr i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: s_ashr_i32
+; GCN: S_ASHR_I32
+define amdgpu_kernel void @s_ashr_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) #0 {
+  %result = ashr i32 %lhs, %rhs
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: v_ashr_i32
+; GFX6: V_ASHR_I32_e32
+; GFX8-10: V_ASHRREV_I32_e64
+define amdgpu_kernel void @v_ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = ashr i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: s_shl_i64
+; GCN: S_LSHL_B64
+define amdgpu_kernel void @s_shl_i64(i64 addrspace(1)* %out, i64 %lhs, i64 %rhs) {
+  %result = shl i64 %lhs, %rhs
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: v_shl_i64
+; GFX6: V_LSHL_B64
+; GFX8: V_LSHLREV_B64
+define amdgpu_kernel void @v_shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idx = zext i32 %tid to i64
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %idx
+  %a = load i64, i64 addrspace(1)* %in
+  %b = load i64, i64 addrspace(1)* %b_ptr
+  %result = shl i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: s_lshr_i64
+; GCN: S_LSHR_B64
+define amdgpu_kernel void @s_lshr_i64(i64 addrspace(1)* %out, i64 %lhs, i64 %rhs) {
+  %result = lshr i64 %lhs, %rhs
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: v_lshr_i64
+; GFX6: V_LSHR_B64
+; GFX8: V_LSHRREV_B64
+define amdgpu_kernel void @v_lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idx = zext i32 %tid to i64
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %idx
+  %a = load i64, i64 addrspace(1)* %in
+  %b = load i64, i64 addrspace(1)* %b_ptr
+  %result = lshr i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: s_ashr_i64
+; GCN: S_ASHR_I64
+define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i64 %lhs, i64 %rhs) {
+  %result = ashr i64 %lhs, %rhs
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: v_ashr_i64
+; GFX6: V_ASHR_I64
+; GFX8: V_ASHRREV_I64
+define amdgpu_kernel void @v_ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idx = zext i32 %tid to i64
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %idx
+  %a = load i64, i64 addrspace(1)* %in
+  %b = load i64, i64 addrspace(1)* %b_ptr
+  %result = ashr i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index ccf3f39a661..898314797b7 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -301,7 +301,7 @@ define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 a
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, v0, v2
+; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
 ; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
 ; GCN-NEXT: s_endpgm
 ;
@@ -425,8 +425,8 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v1, v3, v2
+; GCN-NEXT: v_lshl_b32_e32 v0, v1, v0
+; GCN-NEXT: v_lshl_b32_e32 v1, v2, v3
 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0
 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
@@ -500,10 +500,10 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; GCN-NEXT: v_and_b32_e32 v9, s8, v5
 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v5, v5, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v3, v9, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v4, v4, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v2, v8, v2
+; GCN-NEXT: v_lshl_b32_e32 v5, v7, v5
+; GCN-NEXT: v_lshl_b32_e32 v3, v3, v9
+; GCN-NEXT: v_lshl_b32_e32 v4, v6, v4
+; GCN-NEXT: v_lshl_b32_e32 v2, v2, v8
 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
 ; GCN-NEXT: v_and_b32_e32 v3, s8, v3
 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index deafd6f7fe1..7cfc8b18d21 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -123,8 +123,8 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: v_and_b32_e32 v5, s8, v3
 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_lshlrev_b32_e32 v3, v3, v4
-; CI-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
+; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
@@ -491,10 +491,10 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT: v_and_b32_e32 v9, s8, v5
 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; CI-NEXT: v_lshlrev_b32_e32 v5, v5, v7
-; CI-NEXT: v_lshlrev_b32_e32 v3, v9, v3
-; CI-NEXT: v_lshlrev_b32_e32 v4, v4, v6
-; CI-NEXT: v_lshlrev_b32_e32 v2, v8, v2
+; CI-NEXT: v_lshl_b32_e32 v5, v7, v5
+; CI-NEXT: v_lshl_b32_e32 v3, v3, v9
+; CI-NEXT: v_lshl_b32_e32 v4, v6, v4
+; CI-NEXT: v_lshl_b32_e32 v2, v2, v8
 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-- 
2.11.0