
freedreno/ir3: move nop padding to legalize
author    Rob Clark <robdclark@chromium.org>
          Wed, 18 Dec 2019 19:57:41 +0000 (11:57 -0800)
committer Marge Bot <eric+marge@anholt.net>
          Sat, 1 Feb 2020 02:40:22 +0000 (02:40 +0000)
This way we can deal with it in one place, *after* all the blocks have
been scheduled, which will simplify life for a post-RA sched pass.

This has the benefit of already taking into account nop's that legalize
has to insert for non-delay-related reasons.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>

src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_a6xx.c
src/freedreno/ir3/ir3_delay.c
src/freedreno/ir3/ir3_legalize.c
src/freedreno/ir3/ir3_sched.c
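
Background, for readers new to ir3: Adreno shader cores do not
interlock on most ALU results, so the compiler itself must guarantee up
to 6 cycles between a producer and its consumer (note the delay = 6 and
debug_assert(delay <= 6) in the hunks below).  The padding can be an
explicit nop, a nop carrying a (rptN) repeat flag that covers N+1
cycles, or, on newer GPUs, a (nopN) field folded into a preceding
cat2/cat3 ALU instruction.  An illustrative instruction stream (the
instruction choice and cycle count here are hypothetical; only the
notation follows ir3 disassembly):

	add.f r0.x, r0.y, r0.z
	(rpt2)nop                 ; 3 cycles of padding, then...
	mul.f r1.x, r0.x, r0.x    ; ...the result is safe to consume

This commit moves responsibility for inserting that padding from the
scheduler (ir3_sched.c) into the legalize pass (ir3_legalize.c), which
runs once all blocks are scheduled.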

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 03abaaf..ac29493 100644
@@ -1120,6 +1120,7 @@ unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
                unsigned maxd, bool pred);
 unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
                bool soft, bool pred);
+void ir3_remove_nops(struct ir3 *ir);
 
 /* depth calculation: */
 struct ir3_shader_variant;
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index b75489b..fd18fc3 100644
@@ -365,19 +365,6 @@ get_atomic_dest_mov(struct ir3_instruction *atomic)
        list_delinit(&mov->node);
        list_add(&mov->node, &atomic->node);
 
-       /* And because this is after instruction scheduling, we don't really
-        * have a good way to know if extra delay slots are needed.  For
-        * example, if the result is consumed by an stib (storeImage()) there
-        * would be no extra delay slots in place already, but 5 are needed.
-        * Just plan for the worst and hope nobody looks at the resulting
-        * code that is generated :-(
-        */
-       struct ir3_instruction *nop = ir3_NOP(atomic->block);
-       nop->repeat = 5;
-
-       list_delinit(&nop->node);
-       list_add(&nop->node, &mov->node);
-
        return atomic->data = mov;
 }
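
The hunk above drops the blanket worst-case padding after an atomic's
result mov.  As the deleted comment says, get_atomic_dest_mov() runs
after scheduling, so it could not know how far away the consumer was
and always emitted a worst-case (rpt5)nop, e.g. (hypothetical
disassembly; the atomic opcode shown is illustrative):

	atomic.add.g ...
	mov.u32u32 r0.x, r1.x
	(rpt5)nop                 ; unconditional worst-case padding

With padding deferred to legalize, nop_sched() (added below) computes
the real delay via ir3_delay_calc(), so this nop shrinks or disappears
whenever the consumer is already far enough away.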
 
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 506e296..207c8cb 100644
@@ -335,3 +335,24 @@ ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 
        return delay;
 }
+
+/**
+ * Remove nop instructions.  The scheduler can insert placeholder nop's
+ * so that ir3_delay_calc() can account for nop's that won't be needed
+ * due to nop's triggered by a previous instruction.  However, before
+ * legalize we want to remove these.  The legalize pass can insert some
+ * nop's of its own if needed to hold (for example) sync flags, and the
+ * final delay nop's are inserted by legalize after this.
+ */
+void
+ir3_remove_nops(struct ir3 *ir)
+{
+       foreach_block (block, &ir->block_list) {
+               foreach_instr_safe (instr, &block->instr_list) {
+                       if (instr->opc == OPC_NOP) {
+                               list_del(&instr->node);
+                       }
+               }
+       }
+
+}
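
One detail worth noting in ir3_remove_nops() above: it unlinks nodes
while iterating, which is only safe because foreach_instr_safe caches
the next pointer before the loop body runs.  A minimal sketch of the
same idiom, assuming foreach_instr_safe wraps the util/list.h helper of
the same shape:

	/* sketch: unlink matching nodes mid-iteration; the _safe variant
	 * reads the next node before the body can unlink the current one
	 */
	list_for_each_entry_safe(struct ir3_instruction, instr,
			&block->instr_list, node) {
		if (instr->opc == OPC_NOP)
			list_del(&instr->node);
	}
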
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index db21507..4b95b90 100644
@@ -211,26 +211,6 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
                        ir3_NOP(block);
 
-               if (is_nop(n) && !list_is_empty(&block->instr_list)) {
-                       struct ir3_instruction *last = list_last_entry(&block->instr_list,
-                                       struct ir3_instruction, node);
-                       if (is_nop(last) && (last->repeat < 5)) {
-                               last->repeat++;
-                               last->flags |= n->flags;
-                               continue;
-                       }
-
-                       /* NOTE: I think the nopN encoding works for a5xx and
-                        * probably a4xx, but not a3xx.  So far only tested on
-                        * a6xx.
-                        */
-                       if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) &&
-                                       ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
-                               last->nop++;
-                               continue;
-                       }
-               }
-
                if (ctx->compiler->samgq_workaround &&
                        ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
                        struct ir3_instruction *samgp;
@@ -573,6 +553,54 @@ mark_xvergence_points(struct ir3 *ir)
        }
 }
 
+/* Insert nop's required to make this a legal/valid shader program: */
+static void
+nop_sched(struct ir3 *ir)
+{
+       foreach_block (block, &ir->block_list) {
+               struct ir3_instruction *last = NULL;
+               struct list_head instr_list;
+
+               /* remove all the instructions from the list, we'll be adding
+                * them back in as we go
+                */
+               list_replace(&block->instr_list, &instr_list);
+               list_inithead(&block->instr_list);
+
+               foreach_instr_safe (instr, &instr_list) {
+                       unsigned delay = ir3_delay_calc(block, instr, false, true);
+
+                       /* NOTE: I think the nopN encoding works for a5xx and
+                        * probably a4xx, but not a3xx.  So far only tested on
+                        * a6xx.
+                        */
+
+                       if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
+                                       ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
+                               /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+                               unsigned transfer = MIN2(delay, 3 - last->nop);
+                               last->nop += transfer;
+                               delay -= transfer;
+                       }
+
+                       if ((delay > 0) && last && (last->opc == OPC_NOP)) {
+                               /* the previous nop can encode at most 5 repeats: */
+                               unsigned transfer = MIN2(delay, 5 - last->repeat);
+                               last->repeat += transfer;
+                               delay -= transfer;
+                       }
+
+                       if (delay > 0) {
+                               debug_assert(delay <= 6);
+                               ir3_NOP(block)->repeat = delay - 1;
+                       }
+
+                       list_addtail(&instr->node, &block->instr_list);
+                       last = instr;
+               }
+       }
+}
+
 void
 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 {
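
A worked example of the folding cascade in nop_sched() above: suppose,
on a6xx, the last emitted instruction is a cat2 ALU op with no (nopN)
bits used yet, and the next instruction needs delay = 6.  The first
branch folds MIN2(6, 3 - 0) = 3 cycles into the cat2's (nopN) field,
leaving delay = 3; the second branch is skipped, since last is not a
nop; the final branch emits one nop with repeat = 2, covering the
remaining 3 cycles.  Hypothetical disassembly of the result:

	(nop3)add.f r0.x, r0.y, r0.z   ; 3 cycles folded into the ALU op
	(rpt2)nop                      ; the remaining 3 cycles
	sam (f32)(xyzw)r1.x, r0.x, s#0, t#0   ; the consumer
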
@@ -589,6 +617,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
                block->data = rzalloc(ctx, struct ir3_legalize_block_data);
        }
 
+       ir3_remove_nops(ir);
+
        /* process each block: */
        do {
                progress = false;
@@ -599,6 +629,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 
        *max_bary = ctx->max_bary;
 
+       nop_sched(ir);
+
        do {
                ir3_count_instructions(ir);
        } while(resolve_jumps(ir));
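
Taken together, the two hunks above give ir3_legalize() the following
ordering (a sketch; the steps elided by the diff context are omitted):

	ir3_remove_nops(ir);     /* drop the scheduler's placeholder nop's */
	do {
		/* legalize each block; may insert nop's for sync flags */
	} while (progress);
	nop_sched(ir);           /* insert/fold the real delay nop's */
	do {
		ir3_count_instructions(ir);
	} while (resolve_jumps(ir));

Removing the placeholders first and padding last is what lets the delay
calculation see the final instruction order, including any nop's that
legalize itself had to add.
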
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index ec5ad6e..13ec6e0 100644
@@ -717,7 +717,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
                if (instr) {
                        unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
-
                        d("delay=%u", delay);
 
                        /* and if we run out of instructions that can be scheduled,
@@ -770,18 +769,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        if (block->successors[1]) {
                /* if/else, conditional branches to "then" or "else": */
                struct ir3_instruction *br;
-               unsigned delay = 6;
 
                debug_assert(ctx->pred);
                debug_assert(block->condition);
 
-               delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
-
-               while (delay > 0) {
-                       ir3_NOP(block);
-                       delay--;
-               }
-
                /* create "else" branch first (since "then" block should
                 * frequently/always end up being a fall-thru):
                 */
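
The code removed above special-cased conditional branches: the
instruction writing the predicate register needed up to 6 cycles before
the branch, and the scheduler topped up whatever distance ir3_distance()
reported with individual 1-cycle nop's, roughly (hypothetical
disassembly, assuming two instructions already sit between the compare
and the end of the block):

	cmps.f.gt p0.x, r0.x, r0.y
	add.f r1.x, r1.y, r1.z    ; two unrelated instructions already
	mul.f r2.x, r2.y, r2.z    ;   count toward the required distance
	nop
	nop
	nop
	nop                       ; 6 - 2 = 4 explicit single-cycle nop's
	br !p0.x, #block1

nop_sched() now covers branches like any other instruction, and can
fold the same padding into (rptN)/(nopN) encodings instead.
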
@@ -814,45 +805,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
         */
 }
 
-/* After scheduling individual blocks, we still could have cases where,
- * on one (or more) paths into a block, a value produced by a previous
- * block has too few delay slots to be legal.  We can't deal with this
- * in the first pass, because of loops (ie. we can't ensure all
- * predecessor blocks are already scheduled in the first pass).  All we
- * can really do at this point is stuff in extra nop's until things are legal.
- */
-static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-       unsigned n = 0;
-
-       ctx->block = block;
-
-       foreach_instr_safe (instr, &block->instr_list) {
-               unsigned delay = 0;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned d = ir3_delay_calc(pred, instr, false, true);
-                       delay = MAX2(d, delay);
-               }
-
-               while (delay > n) {
-                       struct ir3_instruction *nop = ir3_NOP(block);
-
-                       /* move to before instr: */
-                       list_delinit(&nop->node);
-                       list_addtail(&nop->node, &instr->node);
-
-                       n++;
-               }
-
-               /* we can bail once we hit worst case delay: */
-               if (++n > 6)
-                       break;
-       }
-}
-
 int ir3_sched(struct ir3 *ir)
 {
        struct ir3_sched_ctx ctx = {0};
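
The deleted sched_intra_block() existed for cross-block hazards like
the following (hypothetical ir3 pseudo-assembly), where a value is
produced at the very end of one block and consumed at the top of a
successor:

	block0:
		add.f r0.x, r0.y, r0.z   ; producer is the last instruction
		br !p0.x, #block1
	block1:
		mul.f r1.x, r0.x, r0.x   ; needs delay slots across the edge

It could only run after every block was scheduled, which is exactly why
the padding now lives in legalize: nop_sched() calls ir3_delay_calc()
with pred = true, so the distance walk crosses predecessor blocks and
only the genuinely missing cycles get padded.
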
@@ -865,10 +817,6 @@ int ir3_sched(struct ir3 *ir)
                sched_block(&ctx, block);
        }
 
-       foreach_block (block, &ir->block_list) {
-               sched_intra_block(&ctx, block);
-       }
-
        if (ctx.error)
                return -1;