From 093c94456bc99308bd80bcc952d1f77ea71a831c Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 18 Dec 2019 11:57:41 -0800 Subject: [PATCH] freedreno/ir3: move nop padding to legalize This way we can deal with it in one place, *after* all the blocks have been scheduled. Which will simplify life for a post-RA sched pass. This has the benefit of already taking into account nop's that legalize has to insert for non-delay related reasons. Signed-off-by: Rob Clark Part-of: --- src/freedreno/ir3/ir3.h | 1 + src/freedreno/ir3/ir3_a6xx.c | 13 -------- src/freedreno/ir3/ir3_delay.c | 21 ++++++++++++ src/freedreno/ir3/ir3_legalize.c | 72 +++++++++++++++++++++++++++++----------- src/freedreno/ir3/ir3_sched.c | 52 ----------------------------- 5 files changed, 74 insertions(+), 85 deletions(-) diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 03abaafa393..ac294934133 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1120,6 +1120,7 @@ unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd, bool pred); unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, bool soft, bool pred); +void ir3_remove_nops(struct ir3 *ir); /* depth calculation: */ struct ir3_shader_variant; diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c index b75489b6b6a..fd18fc3aa3c 100644 --- a/src/freedreno/ir3/ir3_a6xx.c +++ b/src/freedreno/ir3/ir3_a6xx.c @@ -365,19 +365,6 @@ get_atomic_dest_mov(struct ir3_instruction *atomic) list_delinit(&mov->node); list_add(&mov->node, &atomic->node); - /* And because this is after instruction scheduling, we don't really - * have a good way to know if extra delay slots are needed. For - * example, if the result is consumed by an stib (storeImage()) there - * would be no extra delay slots in place already, but 5 are needed. 
- * Just plan for the worst and hope nobody looks at the resulting - * code that is generated :-( - */ - struct ir3_instruction *nop = ir3_NOP(atomic->block); - nop->repeat = 5; - - list_delinit(&nop->node); - list_add(&nop->node, &mov->node); - return atomic->data = mov; } diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c index 506e2969326..207c8cb91cc 100644 --- a/src/freedreno/ir3/ir3_delay.c +++ b/src/freedreno/ir3/ir3_delay.c @@ -335,3 +335,24 @@ ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, return delay; } + +/** + * Remove nop instructions. The scheduler can insert placeholder nop's + * so that ir3_delay_calc() can account for nop's that won't be needed + * due to nop's triggered by a previous instruction. However, before + * legalize, we want to remove these. The legalize pass can insert + * some nop's if needed to hold (for example) sync flags. The final + * remaining nops are inserted by legalize after this. + */ +void +ir3_remove_nops(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_NOP) { + list_del(&instr->node); + } + } + } + +} diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index db21507181c..4b95b905e20 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -211,26 +211,6 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) ir3_NOP(block); - if (is_nop(n) && !list_is_empty(&block->instr_list)) { - struct ir3_instruction *last = list_last_entry(&block->instr_list, - struct ir3_instruction, node); - if (is_nop(last) && (last->repeat < 5)) { - last->repeat++; - last->flags |= n->flags; - continue; - } - - /* NOTE: I think the nopN encoding works for a5xx and - * probably a4xx, but not a3xx. So far only tested on - * a6xx. 
- */ - if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) && - ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) { - last->nop++; - continue; - } - } - if (ctx->compiler->samgq_workaround && ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) { struct ir3_instruction *samgp; @@ -573,6 +553,54 @@ mark_xvergence_points(struct ir3 *ir) } } +/* Insert nop's required to make this a legal/valid shader program: */ +static void +nop_sched(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + struct ir3_instruction *last = NULL; + struct list_head instr_list; + + /* remove all the instructions from the list, we'll be adding + * them back in as we go + */ + list_replace(&block->instr_list, &instr_list); + list_inithead(&block->instr_list); + + foreach_instr_safe (instr, &instr_list) { + unsigned delay = ir3_delay_calc(block, instr, false, true); + + /* NOTE: I think the nopN encoding works for a5xx and + * probably a4xx, but not a3xx. So far only tested on + * a6xx. 
+ */ + + if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last && + ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) { + /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ + unsigned transfer = MIN2(delay, 3 - last->nop); + last->nop += transfer; + delay -= transfer; + } + + if ((delay > 0) && last && (last->opc == OPC_NOP)) { + /* the previous nop can encode at most 5 repeats: */ + unsigned transfer = MIN2(delay, 5 - last->repeat); + last->repeat += transfer; + delay -= transfer; + } + + if (delay > 0) { + debug_assert(delay <= 6); + ir3_NOP(block)->repeat = delay - 1; + } + + list_addtail(&instr->node, &block->instr_list); + last = instr; + } + } +} + void ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) { @@ -589,6 +617,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) block->data = rzalloc(ctx, struct ir3_legalize_block_data); } + ir3_remove_nops(ir); + /* process each block: */ do { progress = false; @@ -599,6 +629,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) *max_bary = ctx->max_bary; + nop_sched(ir); + do { ir3_count_instructions(ir); } while(resolve_jumps(ir)); diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index ec5ad6e872e..13ec6e023ac 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -717,7 +717,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) if (instr) { unsigned delay = ir3_delay_calc(ctx->block, instr, false, false); - d("delay=%u", delay); /* and if we run out of instructions that can be scheduled, @@ -770,18 +769,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) if (block->successors[1]) { /* if/else, conditional branches to "then" or "else": */ struct ir3_instruction *br; - unsigned delay = 6; debug_assert(ctx->pred); debug_assert(block->condition); - delay -= ir3_distance(ctx->block, ctx->pred, delay, false); - - while (delay > 
0) { - ir3_NOP(block); - delay--; - } - /* create "else" branch first (since "then" block should * frequently/always end up being a fall-thru): */ @@ -814,45 +805,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) */ } -/* After scheduling individual blocks, we still could have cases where - * one (or more) paths into a block, a value produced by a previous - * has too few delay slots to be legal. We can't deal with this in the - * first pass, because loops (ie. we can't ensure all predecessor blocks - * are already scheduled in the first pass). All we can really do at - * this point is stuff in extra nop's until things are legal. - */ -static void -sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) -{ - unsigned n = 0; - - ctx->block = block; - - foreach_instr_safe (instr, &block->instr_list) { - unsigned delay = 0; - - set_foreach(block->predecessors, entry) { - struct ir3_block *pred = (struct ir3_block *)entry->key; - unsigned d = ir3_delay_calc(pred, instr, false, true); - delay = MAX2(d, delay); - } - - while (delay > n) { - struct ir3_instruction *nop = ir3_NOP(block); - - /* move to before instr: */ - list_delinit(&nop->node); - list_addtail(&nop->node, &instr->node); - - n++; - } - - /* we can bail once we hit worst case delay: */ - if (++n > 6) - break; - } -} - int ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx ctx = {0}; @@ -865,10 +817,6 @@ int ir3_sched(struct ir3 *ir) sched_block(&ctx, block); } - foreach_block (block, &ir->block_list) { - sched_intra_block(&ctx, block); - } - if (ctx.error) return -1; -- 2.11.0