
freedreno/ir3: move nop padding to legalize
author    Rob Clark <robdclark@chromium.org>
          Wed, 18 Dec 2019 19:57:41 +0000 (11:57 -0800)
committer Marge Bot <eric+marge@anholt.net>
          Sat, 1 Feb 2020 02:40:22 +0000 (02:40 +0000)
This way we can deal with it in one place, *after* all the blocks have
been scheduled, which will simplify life for a post-RA sched pass.

This has the benefit of already taking into account nop's that legalize
has to insert for non-delay-related reasons.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>

src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_a6xx.c
src/freedreno/ir3/ir3_delay.c
src/freedreno/ir3/ir3_legalize.c
src/freedreno/ir3/ir3_sched.c
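
Background, for readers new to ir3: Adreno shader cores do not
interlock on most ALU results, so the compiler itself must guarantee up
to 6 cycles between a producer and its consumer (note the delay = 6 and
debug_assert(delay <= 6) in the hunks below).  The padding can be an
explicit nop, a nop carrying a (rptN) repeat flag that covers N+1
cycles, or, on newer GPUs, a (nopN) field folded into a preceding
cat2/cat3 ALU instruction.  An illustrative instruction stream (the
instruction choice and cycle count here are hypothetical; only the
notation follows ir3 disassembly):

	add.f r0.x, r0.y, r0.z
	(rpt2)nop                 ; 3 cycles of padding, then...
	mul.f r1.x, r0.x, r0.x    ; ...the result is safe to consume

This commit moves responsibility for inserting that padding from the
scheduler (ir3_sched.c) into the legalize pass (ir3_legalize.c), which
runs once all blocks are scheduled.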

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 03abaaf..ac29493 100644
@@ -1120,6 +1120,7 @@ unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
                unsigned maxd, bool pred);
 unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
                bool soft, bool pred);
+void ir3_remove_nops(struct ir3 *ir);
 
 /* depth calculation: */
 struct ir3_shader_variant;
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index b75489b..fd18fc3 100644
@@ -365,19 +365,6 @@ get_atomic_dest_mov(struct ir3_instruction *atomic)
        list_delinit(&mov->node);
        list_add(&mov->node, &atomic->node);
 
-       /* And because this is after instruction scheduling, we don't really
-        * have a good way to know if extra delay slots are needed.  For
-        * example, if the result is consumed by an stib (storeImage()) there
-        * would be no extra delay slots in place already, but 5 are needed.
-        * Just plan for the worst and hope nobody looks at the resulting
-        * code that is generated :-(
-        */
-       struct ir3_instruction *nop = ir3_NOP(atomic->block);
-       nop->repeat = 5;
-
-       list_delinit(&nop->node);
-       list_add(&nop->node, &mov->node);
-
        return atomic->data = mov;
 }
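
The hunk above drops the blanket worst-case padding after an atomic's
result mov.  As the deleted comment says, get_atomic_dest_mov() runs
after scheduling, so it could not know how far away the consumer was
and always emitted a worst-case (rpt5)nop, e.g. (hypothetical
disassembly; the atomic opcode shown is illustrative):

	atomic.add.g ...
	mov.u32u32 r0.x, r1.x
	(rpt5)nop                 ; unconditional worst-case padding

With padding deferred to legalize, nop_sched() (added below) computes
the real delay via ir3_delay_calc(), so this nop shrinks or disappears
whenever the consumer is already far enough away.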
 
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 506e296..207c8cb 100644
@@ -335,3 +335,24 @@ ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 
        return delay;
 }
+
+/**
+ * Remove nop instructions.  The scheduler can insert placeholder nop's
+ * so that ir3_delay_calc() can account for nop's that won't be needed
+ * due to nop's triggered by a previous instruction.  However, before
+ * legalize we want to remove these.  The legalize pass can insert some
+ * nop's of its own if needed to hold (for example) sync flags, and the
+ * final delay nop's are inserted by legalize after this.
+ */
+void
+ir3_remove_nops(struct ir3 *ir)
+{
+       foreach_block (block, &ir->block_list) {
+               foreach_instr_safe (instr, &block->instr_list) {
+                       if (instr->opc == OPC_NOP) {
+                               list_del(&instr->node);
+                       }
+               }
+       }
+
+}
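
One detail worth noting in ir3_remove_nops() above: it unlinks nodes
while iterating, which is only safe because foreach_instr_safe caches
the next pointer before the loop body runs.  A minimal sketch of the
same idiom, assuming foreach_instr_safe wraps the util/list.h helper of
the same shape:

	/* sketch: unlink matching nodes mid-iteration; the _safe variant
	 * reads the next node before the body can unlink the current one
	 */
	list_for_each_entry_safe(struct ir3_instruction, instr,
			&block->instr_list, node) {
		if (instr->opc == OPC_NOP)
			list_del(&instr->node);
	}
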
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index db21507..4b95b90 100644
@@ -211,26 +211,6 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
                        ir3_NOP(block);
 
-               if (is_nop(n) && !list_is_empty(&block->instr_list)) {
-                       struct ir3_instruction *last = list_last_entry(&block->instr_list,
-                                       struct ir3_instruction, node);
-                       if (is_nop(last) && (last->repeat < 5)) {
-                               last->repeat++;
-                               last->flags |= n->flags;
-                               continue;
-                       }
-
-                       /* NOTE: I think the nopN encoding works for a5xx and
-                        * probably a4xx, but not a3xx.  So far only tested on
-                        * a6xx.
-                        */
-                       if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) &&
-                                       ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
-                               last->nop++;
-                               continue;
-                       }
-               }
-
                if (ctx->compiler->samgq_workaround &&
                        ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
                        struct ir3_instruction *samgp;
@@ -573,6 +553,54 @@ mark_xvergence_points(struct ir3 *ir)
        }
 }
 
+/* Insert nop's required to make this a legal/valid shader program: */
+static void
+nop_sched(struct ir3 *ir)
+{
+       foreach_block (block, &ir->block_list) {
+               struct ir3_instruction *last = NULL;
+               struct list_head instr_list;
+
+               /* remove all the instructions from the list, we'll be adding
+                * them back in as we go
+                */
+               list_replace(&block->instr_list, &instr_list);
+               list_inithead(&block->instr_list);
+
+               foreach_instr_safe (instr, &instr_list) {
+                       unsigned delay = ir3_delay_calc(block, instr, false, true);
+
+                       /* NOTE: I think the nopN encoding works for a5xx and
+                        * probably a4xx, but not a3xx.  So far only tested on
+                        * a6xx.
+                        */
+
+                       if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
+                                       ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
+                               /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+                               unsigned transfer = MIN2(delay, 3 - last->nop);
+                               last->nop += transfer;
+                               delay -= transfer;
+                       }
+
+                       if ((delay > 0) && last && (last->opc == OPC_NOP)) {
+                               /* the previous nop can encode at most 5 repeats: */
+                               unsigned transfer = MIN2(delay, 5 - last->repeat);
+                               last->repeat += transfer;
+                               delay -= transfer;
+                       }
+
+                       if (delay > 0) {
+                               debug_assert(delay <= 6);
+                               ir3_NOP(block)->repeat = delay - 1;
+                       }
+
+                       list_addtail(&instr->node, &block->instr_list);
+                       last = instr;
+               }
+       }
+}
+
 void
 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 {
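
A worked example of the folding cascade in nop_sched() above: suppose,
on a6xx, the last emitted instruction is a cat2 ALU op with no (nopN)
bits used yet, and the next instruction needs delay = 6.  The first
branch folds MIN2(6, 3 - 0) = 3 cycles into the cat2's (nopN) field,
leaving delay = 3; the second branch is skipped, since last is not a
nop; the final branch emits one nop with repeat = 2, covering the
remaining 3 cycles.  Hypothetical disassembly of the result:

	(nop3)add.f r0.x, r0.y, r0.z   ; 3 cycles folded into the ALU op
	(rpt2)nop                      ; the remaining 3 cycles
	sam (f32)(xyzw)r1.x, r0.x, s#0, t#0   ; the consumer
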
@@ -589,6 +617,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
                block->data = rzalloc(ctx, struct ir3_legalize_block_data);
        }
 
+       ir3_remove_nops(ir);
+
        /* process each block: */
        do {
                progress = false;
@@ -599,6 +629,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 
        *max_bary = ctx->max_bary;
 
+       nop_sched(ir);
+
        do {
                ir3_count_instructions(ir);
        } while(resolve_jumps(ir));
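
Taken together, the two hunks above give ir3_legalize() the following
ordering (a sketch; the steps elided by the diff context are omitted):

	ir3_remove_nops(ir);     /* drop the scheduler's placeholder nop's */
	do {
		/* legalize each block; may insert nop's for sync flags */
	} while (progress);
	nop_sched(ir);           /* insert/fold the real delay nop's */
	do {
		ir3_count_instructions(ir);
	} while (resolve_jumps(ir));

Removing the placeholders first and padding last is what lets the delay
calculation see the final instruction order, including any nop's that
legalize itself had to add.
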
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index ec5ad6e..13ec6e0 100644
@@ -717,7 +717,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
                if (instr) {
                        unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
-
                        d("delay=%u", delay);
 
                        /* and if we run out of instructions that can be scheduled,
@@ -770,18 +769,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        if (block->successors[1]) {
                /* if/else, conditional branches to "then" or "else": */
                struct ir3_instruction *br;
-               unsigned delay = 6;
 
                debug_assert(ctx->pred);
                debug_assert(block->condition);
 
-               delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
-
-               while (delay > 0) {
-                       ir3_NOP(block);
-                       delay--;
-               }
-
                /* create "else" branch first (since "then" block should
                 * frequently/always end up being a fall-thru):
                 */
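
The code removed above special-cased conditional branches: the
instruction writing the predicate register needed up to 6 cycles before
the branch, and the scheduler topped up whatever distance ir3_distance()
reported with individual 1-cycle nop's, roughly (hypothetical
disassembly, assuming two instructions already sit between the compare
and the end of the block):

	cmps.f.gt p0.x, r0.x, r0.y
	add.f r1.x, r1.y, r1.z    ; two unrelated instructions already
	mul.f r2.x, r2.y, r2.z    ;   count toward the required distance
	nop
	nop
	nop
	nop                       ; 6 - 2 = 4 explicit single-cycle nop's
	br !p0.x, #block1

nop_sched() now covers branches like any other instruction, and can
fold the same padding into (rptN)/(nopN) encodings instead.
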
@@ -814,45 +805,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
         */
 }
 
-/* After scheduling individual blocks, we still could have cases where,
- * on one (or more) paths into a block, a value produced by a previous
- * block has too few delay slots to be legal.  We can't deal with this
- * in the first pass, because of loops (ie. we can't ensure all
- * predecessor blocks are already scheduled in the first pass).  All we
- * can really do at this point is stuff in extra nop's until things are legal.
- */
-static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-       unsigned n = 0;
-
-       ctx->block = block;
-
-       foreach_instr_safe (instr, &block->instr_list) {
-               unsigned delay = 0;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned d = ir3_delay_calc(pred, instr, false, true);
-                       delay = MAX2(d, delay);
-               }
-
-               while (delay > n) {
-                       struct ir3_instruction *nop = ir3_NOP(block);
-
-                       /* move to before instr: */
-                       list_delinit(&nop->node);
-                       list_addtail(&nop->node, &instr->node);
-
-                       n++;
-               }
-
-               /* we can bail once we hit worst case delay: */
-               if (++n > 6)
-                       break;
-       }
-}
-
 int ir3_sched(struct ir3 *ir)
 {
        struct ir3_sched_ctx ctx = {0};
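
The deleted sched_intra_block() existed for cross-block hazards like
the following (hypothetical ir3 pseudo-assembly), where a value is
produced at the very end of one block and consumed at the top of a
successor:

	block0:
		add.f r0.x, r0.y, r0.z   ; producer is the last instruction
		br !p0.x, #block1
	block1:
		mul.f r1.x, r0.x, r0.x   ; needs delay slots across the edge

It could only run after every block was scheduled, which is exactly why
the padding now lives in legalize: nop_sched() calls ir3_delay_calc()
with pred = true, so the distance walk crosses predecessor blocks and
only the genuinely missing cycles get padded.
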
@@ -865,10 +817,6 @@ int ir3_sched(struct ir3 *ir)
                sched_block(&ctx, block);
        }
 
-       foreach_block (block, &ir->block_list) {
-               sched_intra_block(&ctx, block);
-       }
-
        if (ctx.error)
                return -1;