From 295605c930270a5b90f847b79474507d8b0c9e9c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 24 Aug 2017 11:40:31 -0700 Subject: [PATCH] intel/cs: Push subgroup ID instead of base thread ID We're going to want subgroup ID for SPIR-V subgroups eventually anyway. We really only want to push one and calculate the other from it. It makes a bit more sense to push the subgroup ID because it's simpler to calculate and because it's a real API thing. The only advantage to pushing the base thread ID is to avoid a single SHL in the shader. Reviewed-by: Iago Toral Quiroga --- src/compiler/nir/nir_intrinsics.h | 4 +--- src/intel/compiler/brw_compiler.h | 2 +- src/intel/compiler/brw_fs.cpp | 30 ++++++++++++------------ src/intel/compiler/brw_fs.h | 2 +- src/intel/compiler/brw_fs_nir.cpp | 8 +++---- src/intel/compiler/brw_nir.h | 3 ++- src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 15 ++++++++---- src/intel/vulkan/anv_cmd_buffer.c | 6 ++--- src/mesa/drivers/dri/i965/gen6_constant_state.c | 6 ++--- 9 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h index 47022dd135b..bb8cfac6620 100644 --- a/src/compiler/nir/nir_intrinsics.h +++ b/src/compiler/nir/nir_intrinsics.h @@ -355,6 +355,7 @@ SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx) SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx) SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx) SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx) +SYSTEM_VALUE(subgroup_id, 1, 0, xx, xx, xx) /* Blend constant color values. Float values are clamped. */ SYSTEM_VALUE(blend_const_color_r_float, 1, 0, xx, xx, xx) @@ -364,9 +365,6 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_rgba8888_unorm, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_aaaa8888_unorm, 1, 0, xx, xx, xx) -/* Intel specific system values */ -SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx) - /** * Barycentric coordinate intrinsics. * diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 662f645e183..df6ee018546 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -552,7 +552,7 @@ enum brw_param_builtin { BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X, BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y, - BRW_PARAM_BUILTIN_THREAD_LOCAL_ID, + BRW_PARAM_BUILTIN_SUBGROUP_ID, }; #define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \ diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 006b72b19e1..40e64a48201 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -996,7 +996,7 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; - this->thread_local_id = v->thread_local_id; + this->subgroup_id = v->subgroup_id; } void @@ -1931,14 +1931,14 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, } static int -get_thread_local_id_param_index(const brw_stage_prog_data *prog_data) +get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) { if (prog_data->nr_params == 0) return -1; /* The local thread id is always the last parameter in the list */ uint32_t last_param = prog_data->param[prog_data->nr_params - 1]; - if (last_param == BRW_PARAM_BUILTIN_THREAD_LOCAL_ID) + if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID) return prog_data->nr_params - 1; return -1; @@ -2019,7 +2019,7 @@ fs_visitor::assign_constant_locations() } } - int thread_local_id_index = get_thread_local_id_param_index(stage_prog_data); + int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); /* Only allow 16 registers (128 uniform components) as push constants. * @@ -2030,7 +2030,7 @@ fs_visitor::assign_constant_locations() * brw_curbe.c. */ unsigned int max_push_components = 16 * 8; - if (thread_local_id_index >= 0) + if (subgroup_id_index >= 0) max_push_components--; /* Save a slot for the thread ID */ /* We push small arrays, but no bigger than 16 floats. This is big enough @@ -2075,8 +2075,8 @@ fs_visitor::assign_constant_locations() if (!is_live[u]) continue; - /* Skip thread_local_id_index to put it in the last push register. */ - if (thread_local_id_index == (int)u) + /* Skip subgroup_id_index to put it in the last push register. */ + if (subgroup_id_index == (int)u) continue; set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, @@ -2090,8 +2090,8 @@ fs_visitor::assign_constant_locations() } /* Add the CS local thread ID uniform at the end of the push constants */ - if (thread_local_id_index >= 0) - push_constant_loc[thread_local_id_index] = num_push_constants++; + if (subgroup_id_index >= 0) + push_constant_loc[subgroup_id_index] = num_push_constants++; /* As the uniforms are going to be reordered, stash the old array and * create two new arrays for push/pull params. @@ -6778,20 +6778,20 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, struct brw_cs_prog_data *cs_prog_data) { const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - int thread_local_id_index = get_thread_local_id_param_index(prog_data); + int subgroup_id_index = get_subgroup_id_param_index(prog_data); bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell; /* The thread ID should be stored in the last param dword */ - assert(thread_local_id_index == -1 || - thread_local_id_index == (int)prog_data->nr_params - 1); + assert(subgroup_id_index == -1 || + subgroup_id_index == (int)prog_data->nr_params - 1); unsigned cross_thread_dwords, per_thread_dwords; if (!cross_thread_supported) { cross_thread_dwords = 0u; per_thread_dwords = prog_data->nr_params; - } else if (thread_local_id_index >= 0) { + } else if (subgroup_id_index >= 0) { /* Fill all but the last register with cross-thread payload */ - cross_thread_dwords = 8 * (thread_local_id_index / 8); + cross_thread_dwords = 8 * (subgroup_id_index / 8); per_thread_dwords = prog_data->nr_params - cross_thread_dwords; assert(per_thread_dwords > 0 && per_thread_dwords <= 8); } else { @@ -6834,7 +6834,7 @@ compile_cs_to_nir(const struct brw_compiler *compiler, { nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); - brw_nir_lower_cs_intrinsics(shader); + brw_nir_lower_cs_intrinsics(shader, dispatch_width); return brw_postprocess_nir(shader, compiler, true); } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index f51a4d8889b..40dd83f45e4 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -315,7 +315,7 @@ public: */ int *push_constant_loc; - fs_reg thread_local_id; + fs_reg subgroup_id; fs_reg frag_depth; fs_reg frag_stencil; fs_reg sample_mask; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 77d8bae4db6..39e7e692874 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -95,8 +95,8 @@ fs_visitor::nir_setup_uniforms() */ assert(uniforms == prog_data->nr_params); uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); - *param = BRW_PARAM_BUILTIN_THREAD_LOCAL_ID; - thread_local_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); + *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; + subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); } } @@ -3422,8 +3422,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, cs_prog_data->uses_barrier = true; break; - case nir_intrinsic_load_intel_thread_local_id: - bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), thread_local_id); + case nir_intrinsic_load_subgroup_id: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); break; case nir_intrinsic_load_local_invocation_id: diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 3e407122681..0118cfadc1f 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -95,7 +95,8 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir); nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir); -bool brw_nir_lower_cs_intrinsics(nir_shader *nir); +bool brw_nir_lower_cs_intrinsics(nir_shader *nir, + unsigned dispatch_width); void brw_nir_lower_vs_inputs(nir_shader *nir, bool use_legacy_snorm_formula, const uint8_t *vs_attrib_wa_flags); diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c index 07d2dccd041..66eef6be0a6 100644 --- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c +++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c @@ -26,6 +26,7 @@ struct lower_intrinsics_state { nir_shader *nir; + unsigned dispatch_width; nir_function_impl *impl; bool progress; nir_builder builder; @@ -56,12 +57,14 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, * gl_LocalInvocationIndex = * cs_thread_local_id + subgroup_invocation; */ - nir_ssa_def *thread_local_id; - if (state->local_workgroup_size <= 8) - thread_local_id = nir_imm_int(b, 0); + nir_ssa_def *subgroup_id; + if (state->local_workgroup_size <= state->dispatch_width) + subgroup_id = nir_imm_int(b, 0); else - thread_local_id = nir_load_intel_thread_local_id(b); + subgroup_id = nir_load_subgroup_id(b); + nir_ssa_def *thread_local_id = + nir_imul(b, subgroup_id, nir_imm_int(b, state->dispatch_width)); nir_ssa_def *channel = nir_load_subgroup_invocation(b); sysval = nir_iadd(b, channel, thread_local_id); break; @@ -127,7 +130,8 @@ lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state) } bool -brw_nir_lower_cs_intrinsics(nir_shader *nir) +brw_nir_lower_cs_intrinsics(nir_shader *nir, + unsigned dispatch_width) { assert(nir->info.stage == MESA_SHADER_COMPUTE); @@ -135,6 +139,7 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir) struct lower_intrinsics_state state; memset(&state, 0, sizeof(state)); state.nir = nir; + state.dispatch_width = dispatch_width; state.local_workgroup_size = nir->info.cs.local_size[0] * nir->info.cs.local_size[1] * nir->info.cs.local_size[2]; diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index b45f8f83757..69acafaae26 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -710,7 +710,7 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) for (unsigned i = 0; i < cs_prog_data->push.cross_thread.dwords; i++) { - assert(prog_data->param[i] != BRW_PARAM_BUILTIN_THREAD_LOCAL_ID); + assert(prog_data->param[i] != BRW_PARAM_BUILTIN_SUBGROUP_ID); u32_map[i] = anv_push_constant_value(data, prog_data->param[i]); } } @@ -722,8 +722,8 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) cs_prog_data->push.cross_thread.regs); unsigned src = cs_prog_data->push.cross_thread.dwords; for ( ; src < prog_data->nr_params; src++, dst++) { - if (prog_data->param[src] == BRW_PARAM_BUILTIN_THREAD_LOCAL_ID) { - u32_map[dst] = t * cs_prog_data->simd_size; + if (prog_data->param[src] == BRW_PARAM_BUILTIN_SUBGROUP_ID) { + u32_map[dst] = t; } else { u32_map[dst] = anv_push_constant_value(data, prog_data->param[src]); diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/drivers/dri/i965/gen6_constant_state.c index acf7454cef5..d89e7bde24b 100644 --- a/src/mesa/drivers/dri/i965/gen6_constant_state.c +++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c @@ -317,7 +317,7 @@ brw_upload_cs_push_constants(struct brw_context *brw, for (unsigned i = 0; i < cs_prog_data->push.cross_thread.dwords; i++) { - assert(prog_data->param[i] != BRW_PARAM_BUILTIN_THREAD_LOCAL_ID); + assert(prog_data->param[i] != BRW_PARAM_BUILTIN_SUBGROUP_ID); param_copy[i] = brw_param_value(brw, prog, stage_state, prog_data->param[i]); } @@ -330,8 +330,8 @@ brw_upload_cs_push_constants(struct brw_context *brw, cs_prog_data->push.cross_thread.regs); unsigned src = cs_prog_data->push.cross_thread.dwords; for ( ; src < prog_data->nr_params; src++, dst++) { - if (prog_data->param[src] == BRW_PARAM_BUILTIN_THREAD_LOCAL_ID) { - param[dst] = t * cs_prog_data->simd_size; + if (prog_data->param[src] == BRW_PARAM_BUILTIN_SUBGROUP_ID) { + param[dst] = t; } else { param[dst] = brw_param_value(brw, prog, stage_state, prog_data->param[src]); -- 2.11.0