From 36fd65381756ed1b8f774f7fcdd555941a3d39e1 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 11 Mar 2015 23:14:31 -0700 Subject: [PATCH] i965: Add scalar geometry shader support. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This is hidden behind INTEL_SCALAR_GS=1 for now, as we don't yet support instanced geometry shaders, and Orbital Explorer's shader spills like crazy. But the infrastructure is in place, and it's largely working. v2: Lots of rebasing. v3: (feedback from Kristian Høgsberg) - Handle stride and subreg_offset correctly for ATTRs; use a helper. - Fix missing emit_shader_time_end() call. - Delete dead code after early EOT in static vertex case to avoid tripping asserts in emit_shader_time_end(). - Use proper D/UD type in intexp2(). - Fix "EndPrimitve" and "to that" typos. - Assert that invocations == 1 so we know this is missing. Signed-off-by: Kenneth Graunke Reviewed-by: Kristian Høgsberg --- src/mesa/drivers/dri/i965/brw_fs.cpp | 208 +++++++++++- src/mesa/drivers/dri/i965/brw_fs.h | 17 +- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 391 ++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 49 ++- src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 25 ++ 5 files changed, 666 insertions(+), 24 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 5ab8c15bc0c..4cc962613b3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -43,6 +43,7 @@ #include "brw_wm.h" #include "brw_fs.h" #include "brw_cs.h" +#include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" #include "brw_dead_control_flow.h" #include "main/uniforms.h" @@ -1361,6 +1362,57 @@ fs_visitor::emit_discard_jump() } void +fs_visitor::emit_gs_thread_end() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + if (gs_compile->control_data_header_size_bits > 0) { + emit_gs_control_data_bits(this->final_gs_vertex_count); + } + + const fs_builder abld = bld.annotate("thread end"); + fs_inst *inst; + + if (gs_prog_data->static_vertex_count != -1) { + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { + if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) { + prev->eot = true; + + /* Delete now dead instructions. */ + foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { + if (dead == prev) + break; + dead->remove(); + } + return; + } else if (prev->is_control_flow() || prev->has_side_effects()) { + break; + } + } + fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr); + inst->mlen = 1; + } else { + fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); + sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + sources[1] = this->final_gs_vertex_count; + abld.LOAD_PAYLOAD(payload, sources, 2, 2); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); + inst->mlen = 2; + } + inst->eot = true; + inst->offset = 0; +} + +void fs_visitor::assign_curb_setup() { if (dispatch_width == 8) { @@ -1532,6 +1584,26 @@ fs_visitor::assign_urb_setup() } void +fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) +{ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + int grf = payload.num_regs + + prog_data->curb_read_length + + inst->src[i].reg + + inst->src[i].reg_offset; + + inst->src[i].file = HW_REG; + inst->src[i].fixed_hw_reg = + stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + inst->src[i].subreg_offset), + inst->exec_size * inst->src[i].stride, + inst->exec_size, inst->src[i].stride); + } + } +} + +void fs_visitor::assign_vs_urb_setup() { brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; @@ -1548,24 +1620,44 @@ fs_visitor::assign_vs_urb_setup() /* Rewrite all ATTR file references to the hw grf that they land in. */ foreach_block_and_inst(block, fs_inst, inst, cfg) { - for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == ATTR) { - int grf = payload.num_regs + - prog_data->curb_read_length + - inst->src[i].reg + - inst->src[i].reg_offset; - - inst->src[i].file = HW_REG; - inst->src[i].fixed_hw_reg = - stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), - inst->src[i].subreg_offset), - inst->exec_size * inst->src[i].stride, - inst->exec_size, inst->src[i].stride); - } + convert_attr_sources_to_hw_regs(inst); + } +} + +void +fs_visitor::assign_gs_urb_setup() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; + + first_non_payload_grf += + 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in; + + const unsigned first_icp_handle = payload.num_regs - + (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0); + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* Lower URB_READ_SIMD8 opcodes into real messages. */ + if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) { + assert(inst->src[0].file == IMM); + inst->src[0] = retype(brw_vec8_grf(first_icp_handle + + inst->src[0].fixed_hw_reg.dw1.ud, + 0), BRW_REGISTER_TYPE_UD); + /* for now, assume constant - we can do per-slot offsets later */ + assert(inst->src[1].file == IMM); + inst->offset = inst->src[1].fixed_hw_reg.dw1.ud; + inst->src[1] = fs_reg(); + inst->mlen = 1; + inst->base_mrf = -1; } + + /* Rewrite all ATTR file references to HW_REGs. */ + convert_attr_sources_to_hw_regs(inst); } } + /** * Split large virtual GRFs into separate components if we can. * @@ -4763,6 +4855,45 @@ fs_visitor::setup_vs_payload() * */ void +fs_visitor::setup_gs_payload() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + struct brw_vue_prog_data *vue_prog_data = + (struct brw_vue_prog_data *) prog_data; + + /* R0: thread header, R1: output URB handles */ + payload.num_regs = 2; + + if (gs_prog_data->include_primitive_id) { + /* R2: Primitive ID 0..7 */ + payload.num_regs++; + } + + /* Use a maximum of 32 registers for push-model inputs. */ + const unsigned max_push_components = 32; + + /* If pushing our inputs would take too many registers, reduce the URB read + * length (which is in HWords, or 8 registers), and resort to pulling. + * + * Note that the GS reads HWords for every vertex - so we + * have to multiply by VerticesIn to obtain the total storage requirement. + */ + if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in > + max_push_components) { + gs_prog_data->base.include_vue_handles = true; + + /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */ + payload.num_regs += nir->info.gs.vertices_in; + + vue_prog_data->urb_read_length = + ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8; + } +} + +void fs_visitor::setup_cs_payload() { assert(devinfo->gen >= 7); @@ -5019,6 +5150,55 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) } bool +fs_visitor::run_gs() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + setup_gs_payload(); + + this->final_gs_vertex_count = vgrf(glsl_type::uint_type); + + if (gs_compile->control_data_header_size_bits > 0) { + /* Create a VGRF to store accumulated control data bits. */ + this->control_data_bits = vgrf(glsl_type::uint_type); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. + */ + if (gs_compile->control_data_header_size_bits <= 32) { + const fs_builder abld = bld.annotate("initialize control data bits"); + abld.MOV(this->control_data_bits, fs_reg(0u)); + } + } + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + emit_gs_thread_end(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_gs_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(); + + return !failed; +} + +bool fs_visitor::run_fs(bool do_rep_send) { brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index caf56555981..2dfcab1c51a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -132,18 +132,22 @@ public: bool run_fs(bool do_rep_send); bool run_vs(gl_clip_plane *clip_planes); + bool run_gs(); bool run_cs(); void optimize(); void allocate_registers(); void setup_payload_gen4(); void setup_payload_gen6(); void setup_vs_payload(); + void setup_gs_payload(); void setup_cs_payload(); void fixup_3src_null_dest(); void assign_curb_setup(); void calculate_urb_setup(); void assign_urb_setup(); + void convert_attr_sources_to_hw_regs(fs_inst *inst); void assign_vs_urb_setup(); + void assign_gs_urb_setup(); bool assign_regs(bool allow_spilling); void assign_regs_trivial(); void calculate_payload_ranges(int payload_node_count, @@ -281,7 +285,16 @@ public: fs_reg color1, fs_reg color2, fs_reg src0_alpha, unsigned components); void emit_fb_writes(); - void emit_urb_writes(); + void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg()); + void set_gs_stream_control_data_bits(const fs_reg &vertex_count, + unsigned stream_id); + void emit_gs_control_data_bits(const fs_reg &vertex_count); + void emit_gs_end_primitive(const nir_src &vertex_count_nir_src); + void emit_gs_vertex(const nir_src &vertex_count_nir_src, + unsigned stream_id); + void emit_gs_thread_end(); + void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src, + unsigned offset, unsigned num_components); void emit_cs_terminate(); fs_reg *emit_cs_local_invocation_id_setup(); fs_reg *emit_cs_work_group_id_setup(); @@ -389,6 +402,8 @@ public: fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; fs_reg shader_start_time; fs_reg userplane[MAX_CLIP_PLANES]; + fs_reg final_gs_vertex_count; + fs_reg control_data_bits; unsigned grf_used; bool spilled_any_registers; diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 7eeff93e465..b6eab069a1f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -28,6 +28,7 @@ #include "program/prog_to_nir.h" #include "brw_fs.h" #include "brw_fs_surface_builder.h" +#include "brw_vec4_gs_visitor.h" #include "brw_nir.h" #include "brw_fs_surface_builder.h" #include "brw_vec4_gs_visitor.h" @@ -102,6 +103,7 @@ fs_visitor::nir_setup_outputs() switch (stage) { case MESA_SHADER_VERTEX: + case MESA_SHADER_GEOMETRY: for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) { int output = var->data.location + i; this->outputs[output] = offset(reg, bld, 4 * i); @@ -1194,6 +1196,375 @@ emit_pixel_interpolater_send(const fs_builder &bld, return inst; } +/** + * Computes 1 << x, given a D/UD register containing some value x. + */ +static fs_reg +intexp2(const fs_builder &bld, const fs_reg &x) +{ + assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); + + fs_reg result = bld.vgrf(x.type, 1); + fs_reg one = bld.vgrf(x.type, 1); + + bld.MOV(one, retype(fs_reg(1), one.type)); + bld.SHL(result, one, x); + return result; +} + +void +fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + /* We can only do EndPrimitive() functionality when the control data + * consists of cut bits. Fortunately, the only time it isn't is when the + * output type is points, in which case EndPrimitive() is a no-op. + */ + if (gs_prog_data->control_data_format != + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { + return; + } + + /* Cut bits use one bit per vertex. */ + assert(gs_compile->control_data_bits_per_vertex == 1); + + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting + * vertex n, 0 otherwise. So all we need to do here is mark bit + * (vertex_count - 1) % 32 in the cut_bits register to indicate that + * EndPrimitive() was called after emitting vertex (vertex_count - 1); + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. + * + * Note that if EndPrimitive() is called before emitting any vertices, this + * will cause us to set bit 31 of the control_data_bits register to 1. + * That's fine because: + * + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be + * output, so the hardware will ignore cut bit 31. + * + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the + * last vertex, so setting cut bit 31 has no effect (since the primitive + * is automatically ended when the GS terminates). + * + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the + * control_data_bits register to 0 when the first vertex is emitted. + */ + + const fs_builder abld = bld.annotate("end primitive"); + + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu)); + fs_reg mask = intexp2(abld, prev_count); + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << + * ((vertex_count - 1) % 32). + */ + abld.OR(this->control_data_bits, this->control_data_bits, mask); +} + +void +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) +{ + assert(stage == MESA_SHADER_GEOMETRY); + assert(gs_compile->control_data_bits_per_vertex != 0); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + const fs_builder abld = bld.annotate("emit control data bits"); + const fs_builder fwa_bld = bld.exec_all(); + + /* We use a single UD register to accumulate control data bits (32 bits + * for each of the SIMD8 channels). So we need to write a DWord (32 bits) + * at a time. + * + * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. + * We have select a 128-bit group via the Global and Per-Slot Offsets, then + * use the Channel Mask phase to enable/disable which DWord within that + * group to write. (Remember, different SIMD8 channels may have emitted + * different numbers of vertices, so we may need per-slot offsets.) + * + * Channel masking presents an annoying problem: we may have to replicate + * the data up to 4 times: + * + * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. + * + * To avoid penalizing shaders that emit a small number of vertices, we + * can avoid these sometimes: if the size of the control data header is + * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land + * land in the same 128-bit group, so we can skip per-slot offsets. + * + * Similarly, if the control data header is <= 32 bits, there is only one + * DWord, so we can skip channel masks. + */ + enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; + + fs_reg channel_mask, per_slot_offset; + + if (gs_compile->control_data_header_size_bits > 32) { + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; + channel_mask = vgrf(glsl_type::uint_type); + } + + if (gs_compile->control_data_header_size_bits > 128) { + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; + per_slot_offset = vgrf(glsl_type::uint_type); + } + + /* Figure out which DWord we're trying to write to using the formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) + */ + if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { + fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu)); + unsigned log2_bits_per_vertex = + _mesa_fls(gs_compile->control_data_bits_per_vertex); + abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex)); + + if (per_slot_offset.file != BAD_FILE) { + /* Set the per-slot offset to dword_index / 4, so that we'll write to + * the appropriate OWord within the control data header. + */ + abld.SHR(per_slot_offset, dword_index, fs_reg(2u)); + } + + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. + */ + fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fwa_bld.AND(channel, dword_index, fs_reg(3u)); + channel_mask = intexp2(fwa_bld, channel); + /* Then the channel masks need to be in bits 23:16. */ + fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u)); + } + + /* Store the control data bits in the message payload and send it. */ + int mlen = 2; + if (channel_mask.file != BAD_FILE) + mlen += 4; /* channel masks, plus 3 extra copies of the data */ + if (per_slot_offset.file != BAD_FILE) + mlen++; + + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); + int i = 0; + sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + if (per_slot_offset.file != BAD_FILE) + sources[i++] = per_slot_offset; + if (channel_mask.file != BAD_FILE) + sources[i++] = channel_mask; + while (i < mlen) { + sources[i++] = this->control_data_bits; + } + + abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); + fs_inst *inst = abld.emit(opcode, reg_undef, payload); + inst->mlen = mlen; + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. + */ + if (gs_prog_data->static_vertex_count == -1) + inst->offset = 2; +} + +void +fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, + unsigned stream_id) +{ + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ + + /* Note: we are calling this *before* increasing vertex_count, so + * this->vertex_count == vertex_count - 1 in the formula above. + */ + + /* Stream mode uses 2 bits per vertex */ + assert(gs_compile->control_data_bits_per_vertex == 2); + + /* Must be a valid stream */ + assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS); + + /* Control data bits are initialized to 0 so we don't have to set any + * bits when sending vertices to stream 0. + */ + if (stream_id == 0) + return; + + const fs_builder abld = bld.annotate("set stream control data bits", NULL); + + /* reg::sid = stream_id */ + fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(sid, fs_reg(stream_id)); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(shift_count, vertex_count, fs_reg(1u)); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). + */ + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(mask, sid, shift_count); + abld.OR(this->control_data_bits, this->control_data_bits, mask); +} + +void +fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, + unsigned stream_id) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. + * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. + * + * Since the only purpose of primives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) + return; + + /* If we're outputting 32 control data bits or less, then we can wait + * until the shader is over to output them all. Otherwise we need to + * output them as we go. Now is the time to do it, since we're about to + * output the vertex_count'th vertex, so it's guaranteed that the + * control data bits associated with the (vertex_count - 1)th vertex are + * correct. + */ + if (gs_compile->control_data_header_size_bits > 32) { + const fs_builder abld = + bld.annotate("emit vertex: emit control data bits"); + + /* Only emit control data bits if we've finished accumulating a batch + * of 32 bits. This is the case when: + * + * (vertex_count * bits_per_vertex) % 32 == 0 + * + * (in other words, when the last 5 bits of vertex_count * + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some + * integer n (which is always the case, since bits_per_vertex is + * always 1 or 2), this is equivalent to requiring that the last 5-n + * bits of vertex_count are 0: + * + * vertex_count & (2^(5-n) - 1) == 0 + * + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is + * equivalent to: + * + * vertex_count & (32 / bits_per_vertex - 1) == 0 + * + * TODO: If vertex_count is an immediate, we could do some of this math + * at compile time... + */ + fs_inst *inst = + abld.AND(bld.null_reg_d(), vertex_count, + fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u)); + inst->conditional_mod = BRW_CONDITIONAL_Z; + + abld.IF(BRW_PREDICATE_NORMAL); + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we can skip emitting them. + */ + abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u), + BRW_CONDITIONAL_NEQ); + abld.IF(BRW_PREDICATE_NORMAL); + emit_gs_control_data_bits(vertex_count); + abld.emit(BRW_OPCODE_ENDIF); + + /* Reset control_data_bits to 0 so we can start accumulating a new + * batch. + * + * Note: in the case where vertex_count == 0, this neutralizes the + * effect of any call to EndPrimitive() that the shader may have + * made before outputting its first vertex. + */ + inst = abld.MOV(this->control_data_bits, fs_reg(0u)); + inst->force_writemask_all = true; + abld.emit(BRW_OPCODE_ENDIF); + } + + emit_urb_writes(vertex_count); + + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we do + * do for GL_POINTS outputs that don't use streams). + */ + if (gs_compile->control_data_header_size_bits > 0 && + gs_prog_data->control_data_format == + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + set_gs_stream_control_data_bits(vertex_count, stream_id); + } +} + +void +fs_visitor::emit_gs_input_load(const fs_reg &dst, + const nir_src &vertex_src, + unsigned input_offset, + unsigned num_components) +{ + const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data; + const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0]; + + const unsigned array_stride = vue_prog_data->urb_read_length * 8; + + const bool pushed = 4 * input_offset < array_stride; + + if (input_offset == 0) { + /* This is the VUE header, containing VARYING_SLOT_LAYER [.y], + * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. + * Only gl_PointSize is available as a GS input, so they must + * be asking for that input. + */ + if (pushed) { + bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type)); + } else { + fs_reg tmp = bld.vgrf(dst.type, 4); + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, + fs_reg(vertex), fs_reg(0)); + inst->regs_written = 4; + bld.MOV(dst, offset(tmp, bld, 3)); + } + } else { + if (pushed) { + int index = vertex * array_stride + 4 * input_offset; + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type)); + } + } else { + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, + fs_reg(vertex), fs_reg(input_offset)); + inst->regs_written = num_components; + } + } +} + void fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { @@ -1579,6 +1950,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_load_per_vertex_input_indirect: + assert(!"Not allowed"); + /* fallthrough */ + case nir_intrinsic_load_per_vertex_input: + emit_gs_input_load(dest, instr->src[0], instr->const_index[0], + instr->num_components); + break; + /* Handle ARB_gpu_shader5 interpolation intrinsics * * It's worth a quick word of explanation as to why we handle the full @@ -1929,6 +2308,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_emit_vertex_with_counter: + emit_gs_vertex(instr->src[0], instr->const_index[0]); + break; + + case nir_intrinsic_end_primitive_with_counter: + emit_gs_end_primitive(instr->src[0]); + break; + + case nir_intrinsic_set_vertex_count: + bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); + break; + default: unreachable("unknown intrinsic"); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index b6d1c3b6d4a..ef92098286c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -880,7 +880,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes) } void -fs_visitor::emit_urb_writes() +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) { int slot, urb_offset, length; int starting_urb_offset = 0; @@ -916,9 +916,13 @@ fs_visitor::emit_urb_writes() return; } + opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; + int header_size = 1; + fs_reg per_slot_offsets; + if (stage == MESA_SHADER_GEOMETRY) { const struct brw_gs_prog_data *gs_prog_data = - (const struct brw_gs_prog_data *) prog_data; + (const struct brw_gs_prog_data *) this->prog_data; /* We need to increment the Global Offset to skip over the control data * header and the extra "Vertex Count" field (1 HWord) at the beginning @@ -927,6 +931,27 @@ fs_visitor::emit_urb_writes() starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords; if (gs_prog_data->static_vertex_count == -1) starting_urb_offset += 2; + + /* We also need to use per-slot offsets. The per-slot offset is the + * Vertex Count. SIMD8 mode processes 8 different primitives at a + * time; each may output a different number of vertices. + */ + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT; + header_size++; + + /* The URB offset is in 128-bit units, so we need to multiply by 2 */ + const int output_vertex_size_owords = + gs_prog_data->output_vertex_size_hwords * 2; + + fs_reg offset; + if (gs_vertex_count.file == IMM) { + per_slot_offsets = fs_reg(output_vertex_size_owords * + gs_vertex_count.fixed_hw_reg.dw1.ud); + } else { + per_slot_offsets = vgrf(glsl_type::int_type); + bld.MUL(per_slot_offsets, gs_vertex_count, + fs_reg(output_vertex_size_owords)); + } } length = 0; @@ -1023,19 +1048,25 @@ fs_visitor::emit_urb_writes() if (length == 8 || last) flush = true; if (flush) { - fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1); - fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1), + fs_reg *payload_sources = + ralloc_array(mem_ctx, fs_reg, length + header_size); + fs_reg payload = fs_reg(GRF, alloc.allocate(length + header_size), BRW_REGISTER_TYPE_F); payload_sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); - memcpy(&payload_sources[1], sources, length * sizeof sources[0]); - abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1); + if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT) + payload_sources[1] = per_slot_offsets; + + memcpy(&payload_sources[header_size], sources, + length * sizeof sources[0]); + + abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size, + header_size); - fs_inst *inst = - abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); + fs_inst *inst = abld.emit(opcode, reg_undef, payload); inst->eot = last && stage == MESA_SHADER_VERTEX; - inst->mlen = length + 1; + inst->mlen = length + header_size; inst->offset = urb_offset; urb_offset = starting_urb_offset + slot + 1; length = 0; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index cfb5cd95cb1..49c10837334 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -29,6 +29,7 @@ #include "brw_vec4_gs_visitor.h" #include "gen6_gs_visitor.h" +#include "brw_fs.h" namespace brw { @@ -812,6 +813,30 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, * program. */ + if (compiler->scalar_gs) { + /* TODO: Support instanced GS. We have basically no tests... */ + assert(prog_data->invocations == 1); + + fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader, + shader_time_index); + if (v.run_gs()) { + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, log_data, mem_ctx, &c.key, + &prog_data->base.base, v.promoted_constants, + false, "GS"); + if (unlikely(INTEL_DEBUG & DEBUG_GS)) { + const char *label = + shader->info.label ? shader->info.label : "unnamed"; + char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s", + label, shader->info.name); + g.enable_debug(name); + } + g.generate_code(v.cfg, 8); + return g.get_assembly(final_assembly_size); + } + } + if (compiler->devinfo->gen >= 7) { /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do * so without spilling. If the GS invocations count > 1, then we can't use -- 2.11.0