From 0fe0320dc074023489e2852771edc487c0142927 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 4 Aug 2017 17:38:57 +0200 Subject: [PATCH] radeonsi: use optimal packet order when doing a pipeline sync MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Process most new SET packets in parallel with previous draw calls, then flush caches and wait, start the draw, and do L2 prefetches last. This decreases the [CP busy / SPI busy] ratio (verified with GRBM perf counters). In other words, the time window when shaders are idle (between (the wait and the draw) is much shorter now. Tested-by: Dieter Nützel Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_state_draw.c | 117 +++++++++++++++++++-------- 1 file changed, 83 insertions(+), 34 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 9df5b7a5886..23e9778fa54 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1162,14 +1162,49 @@ void si_ce_post_draw_synchronization(struct si_context *sctx) } } +static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned skip_atom_mask) +{ + /* Emit state atoms. */ + unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; + while (mask) { + struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; + + atom->emit(&sctx->b, atom); + } + sctx->dirty_atoms &= skip_atom_mask; + + /* Emit states. */ + mask = sctx->dirty_states; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct si_pm4_state *state = sctx->queued.array[i]; + + if (!state || sctx->emitted.array[i] == state) + continue; + + si_pm4_emit(sctx, state); + sctx->emitted.array[i] = state; + } + sctx->dirty_states = 0; + + /* Emit draw states. */ + unsigned num_patches = 0; + + si_emit_rasterizer_prim_state(sctx); + if (sctx->tes_shader.cso) + si_emit_derived_tess_state(sctx, info, &num_patches); + si_emit_vs_state(sctx, info); + si_emit_draw_registers(sctx, info, num_patches); +} + void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct pipe_resource *indexbuf = info->index.resource; - unsigned mask, dirty_tex_counter; + unsigned dirty_tex_counter; enum pipe_prim_type rast_prim; - unsigned num_patches = 0; unsigned index_size = info->index_size; unsigned index_offset = info->indirect ? info->start * index_size : 0; @@ -1251,9 +1286,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (sctx->do_update_shaders && !si_update_shaders(sctx)) return; - if (!si_upload_graphics_shader_descriptors(sctx)) - return; - if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on VI. */ @@ -1342,44 +1374,61 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - /* Flush caches before the first state atom, which does L2 prefetches. */ - if (sctx->b.flags) + /* Use optimal packet order based on whether we need to sync the pipeline. */ + if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* If we have to wait for idle, set all states first, so that all + * SET packets are processed in parallel with previous draw calls. + * Then upload descriptors, set shader pointers, and draw, and + * prefetch at the end. This ensures that the time the CUs + * are idle is very short. (there are only SET_SH packets between + * the wait and the draw) + */ + struct r600_atom *shader_pointers = &sctx->shader_pointers.atom; + + /* Emit all states except shader pointers. */ + si_emit_all_states(sctx, info, 1 << shader_pointers->id); si_emit_cache_flush(sctx); - if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx); + /* <-- CUs are idle here. */ + if (!si_upload_graphics_shader_descriptors(sctx)) + return; - /* Emit state atoms. */ - mask = sctx->dirty_atoms; - while (mask) { - struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; + /* Set shader pointers after descriptors are uploaded. */ + if (si_is_atom_dirty(sctx, shader_pointers)) { + shader_pointers->emit(&sctx->b, NULL); + sctx->dirty_atoms = 0; + } - atom->emit(&sctx->b, atom); - } - sctx->dirty_atoms = 0; + si_ce_pre_draw_synchronization(sctx); + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + /* <-- CUs are busy here. */ - /* Emit states. */ - mask = sctx->dirty_states; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct si_pm4_state *state = sctx->queued.array[i]; + /* Start prefetches after the draw has been started. Both will run + * in parallel, but starting the draw first is more important. + */ + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx); + } else { + /* If we don't wait for idle, start prefetches first, then set + * states, and draw at the end. + */ + if (sctx->b.flags) + si_emit_cache_flush(sctx); - if (!state || sctx->emitted.array[i] == state) - continue; + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx); - si_pm4_emit(sctx, state); - sctx->emitted.array[i] = state; - } - sctx->dirty_states = 0; + if (!si_upload_graphics_shader_descriptors(sctx)) + return; - si_emit_rasterizer_prim_state(sctx); - if (sctx->tes_shader.cso) - si_emit_derived_tess_state(sctx, info, &num_patches); - si_emit_vs_state(sctx, info); - si_emit_draw_registers(sctx, info, num_patches); + si_emit_all_states(sctx, info, 0); + si_ce_pre_draw_synchronization(sctx); + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + } - si_ce_pre_draw_synchronization(sctx); - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); si_ce_post_draw_synchronization(sctx); if (sctx->trace_buf) -- 2.11.0