From c9aa843961d2c3cb34e7cb2dc843b93d723e0692 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 23 Aug 2019 17:53:05 +0200 Subject: [PATCH] radv/gfx10: fix implementation of exclusive scans This implementation is loosely based on ROCm. https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/ockl/src/wfredscan.cl This fixes dEQP-VK.subgroups.arithmetic.*.subgroupexclusive* on GFX10. Fixes: 227c29a80de ("amd/common/gfx10: implement scan & reduce operations") Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/llvm/ac_llvm_build.c | 76 ++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 1cd687366d7..54513d79922 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -4049,7 +4049,37 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu if (inclusive) { result = src; } else if (ctx->chip_class >= GFX10) { - result = identity; + /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + + tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + + tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); + + if (maxprefix > 32) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp2 = LLVMBuildSelect(ctx->builder, active, + ac_build_readlane(ctx, src, + LLVMConstInt(ctx->i32, 31, false)), + tmp2, ""); + + active = LLVMBuildOr(ctx->builder, active, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 0x1f, false), ""), + LLVMConstInt(ctx->i32, 0x10, false), ""), ""); + src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } else if (maxprefix > 16) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 16, false), ""); + + src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } + + result = src; } else if (ctx->chip_class >= GFX8) { src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); result = src; @@ -4148,33 +4178,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu return result; if (ctx->chip_class >= GFX10) { - /* dpp_row_bcast{15,31} are not supported on gfx10. */ - LLVMBuilderRef builder = ctx->builder; LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef cc; - /* TODO-GFX10: Can we get better code-gen by putting this into - * a branch so that LLVM generates EXEC mask manipulations? */ - if (inclusive) - tmp = result; - else - tmp = ac_build_alu_op(ctx, result, src, op); - tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false); - tmp = ac_build_alu_op(ctx, result, tmp, op); - cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), ""); - cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, ""); - result = LLVMBuildSelect(builder, cc, tmp, result, ""); + LLVMValueRef active; + + tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); + + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 16, false), ""), + ctx->i32_0, ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) return result; - if (inclusive) - tmp = result; - else - tmp = ac_build_alu_op(ctx, result, src, op); - tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false)); - tmp = ac_build_alu_op(ctx, result, tmp, op); - cc = LLVMBuildICmp(builder, LLVMIntUGE, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - result = LLVMBuildSelect(builder, cc, tmp, result, ""); + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + + active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); return result; } -- 2.11.0