From 093c94456bc99308bd80bcc952d1f77ea71a831c Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Wed, 18 Dec 2019 11:57:41 -0800
Subject: [PATCH] freedreno/ir3: move nop padding to legalize

This way we can deal with it in one place, *after* all the blocks have
been scheduled, which will simplify life for a post-RA sched pass.

This has the benefit of already taking into account nop's that legalize
has to insert for non-delay related reasons.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>
---
 src/freedreno/ir3/ir3.h          |  1 +
 src/freedreno/ir3/ir3_a6xx.c     | 13 --------
 src/freedreno/ir3/ir3_delay.c    | 21 ++++++++++++
 src/freedreno/ir3/ir3_legalize.c | 72 +++++++++++++++++++++++++++++-----------
 src/freedreno/ir3/ir3_sched.c    | 52 -----------------------------
 5 files changed, 74 insertions(+), 85 deletions(-)

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 03abaafa393..ac294934133 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1120,6 +1120,7 @@ unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
 		unsigned maxd, bool pred);
 unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 		bool soft, bool pred);
+void ir3_remove_nops(struct ir3 *ir);
 
 /* depth calculation: */
 struct ir3_shader_variant;
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index b75489b6b6a..fd18fc3aa3c 100644
--- a/src/freedreno/ir3/ir3_a6xx.c
+++ b/src/freedreno/ir3/ir3_a6xx.c
@@ -365,19 +365,6 @@ get_atomic_dest_mov(struct ir3_instruction *atomic)
 	list_delinit(&mov->node);
 	list_add(&mov->node, &atomic->node);
 
-	/* And because this is after instruction scheduling, we don't really
-	 * have a good way to know if extra delay slots are needed.  For
-	 * example, if the result is consumed by an stib (storeImage()) there
-	 * would be no extra delay slots in place already, but 5 are needed.
-	 * Just plan for the worst and hope nobody looks at the resulting
-	 * code that is generated :-(
-	 */
-	struct ir3_instruction *nop = ir3_NOP(atomic->block);
-	nop->repeat = 5;
-
-	list_delinit(&nop->node);
-	list_add(&nop->node, &mov->node);
-
 	return atomic->data = mov;
 }
 
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 506e2969326..207c8cb91cc 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -335,3 +335,24 @@ ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 
 	return delay;
 }
+
+/**
+ * Remove every OPC_NOP instruction from every block of the shader.  The
+ * scheduler can insert placeholder nop's so that ir3_delay_calc() can
+ * account for nop's that won't be needed due to nop's triggered by a
+ * previous instruction.  Those are removed before legalize, which may
+ * insert its own nop's (for example to hold sync flags) and afterwards
+ * inserts the final, remaining delay nop's.
+ */
+void
+ir3_remove_nops(struct ir3 *ir)
+{
+	foreach_block (block, &ir->block_list) {
+		foreach_instr_safe (instr, &block->instr_list) {
+			if (instr->opc == OPC_NOP) {
+				list_del(&instr->node);  /* only unlinked here, nothing is freed */
+			}
+		}
+	}
+
+}
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index db21507181c..4b95b905e20 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -211,26 +211,6 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
 			ir3_NOP(block);
 
-		if (is_nop(n) && !list_is_empty(&block->instr_list)) {
-			struct ir3_instruction *last = list_last_entry(&block->instr_list,
-					struct ir3_instruction, node);
-			if (is_nop(last) && (last->repeat < 5)) {
-				last->repeat++;
-				last->flags |= n->flags;
-				continue;
-			}
-
-			/* NOTE: I think the nopN encoding works for a5xx and
-			 * probably a4xx, but not a3xx.  So far only tested on
-			 * a6xx.
-			 */
-			if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) &&
-					((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
-				last->nop++;
-				continue;
-			}
-		}
-
 		if (ctx->compiler->samgq_workaround &&
 			ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
 			struct ir3_instruction *samgp;
@@ -573,6 +553,54 @@ mark_xvergence_points(struct ir3 *ir)
 	}
 }
 
+/* Insert delay nop's required to make this a legal/valid shader program: */
+static void
+nop_sched(struct ir3 *ir)
+{
+	foreach_block (block, &ir->block_list) {
+		struct ir3_instruction *last = NULL;  /* previous instr re-added to this block */
+		struct list_head instr_list;
+
+		/* move all the instructions to a temporary list, we'll be adding
+		 * them back in as we go (each one after the nop's it requires)
+		 */
+		list_replace(&block->instr_list, &instr_list);
+		list_inithead(&block->instr_list);
+
+		foreach_instr_safe (instr, &instr_list) {
+			unsigned delay = ir3_delay_calc(block, instr, false, true);
+
+			/* Fold delay into the preceding cat2/cat3's nop field first.
+			 * NOTE: I think the nopN encoding works for a5xx and
+			 * probably a4xx, but not a3xx.  So far only tested on
+			 * a6xx.
+			 */
+			if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
+					((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
+				/* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+				unsigned transfer = MIN2(delay, 3 - last->nop);
+				last->nop += transfer;
+				delay -= transfer;
+			}
+
+			if ((delay > 0) && last && (last->opc == OPC_NOP)) {
+				/* the previous nop can encode at most 5 repeats: */
+				unsigned transfer = MIN2(delay, 5 - last->repeat);
+				last->repeat += transfer;
+				delay -= transfer;
+			}
+
+			if (delay > 0) {
+				debug_assert(delay <= 6);  /* one nop plus at most 5 repeats */
+				ir3_NOP(block)->repeat = delay - 1;  /* repeat N fills N+1 slots */
+			}
+
+			list_addtail(&instr->node, &block->instr_list);  /* re-add after its nop's */
+			last = instr;
+		}
+	}
+}
+
 void
 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 {
@@ -589,6 +617,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 		block->data = rzalloc(ctx, struct ir3_legalize_block_data);
 	}
 
+	ir3_remove_nops(ir);
+
 	/* process each block: */
 	do {
 		progress = false;
@@ -599,6 +629,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 
 	*max_bary = ctx->max_bary;
 
+	nop_sched(ir);
+
 	do {
 		ir3_count_instructions(ir);
 	} while(resolve_jumps(ir));
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index ec5ad6e872e..13ec6e023ac 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -717,7 +717,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
 		if (instr) {
 			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
-
 			d("delay=%u", delay);
 
 			/* and if we run out of instructions that can be scheduled,
@@ -770,18 +769,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	if (block->successors[1]) {
 		/* if/else, conditional branches to "then" or "else": */
 		struct ir3_instruction *br;
-		unsigned delay = 6;
 
 		debug_assert(ctx->pred);
 		debug_assert(block->condition);
 
-		delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
-
-		while (delay > 0) {
-			ir3_NOP(block);
-			delay--;
-		}
-
 		/* create "else" branch first (since "then" block should
 		 * frequently/always end up being a fall-thru):
 		 */
@@ -814,45 +805,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	 */
 }
 
-/* After scheduling individual blocks, we still could have cases where
- * one (or more) paths into a block, a value produced by a previous
- * has too few delay slots to be legal.  We can't deal with this in the
- * first pass, because loops (ie. we can't ensure all predecessor blocks
- * are already scheduled in the first pass).  All we can really do at
- * this point is stuff in extra nop's until things are legal.
- */
-static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-	unsigned n = 0;
-
-	ctx->block = block;
-
-	foreach_instr_safe (instr, &block->instr_list) {
-		unsigned delay = 0;
-
-		set_foreach(block->predecessors, entry) {
-			struct ir3_block *pred = (struct ir3_block *)entry->key;
-			unsigned d = ir3_delay_calc(pred, instr, false, true);
-			delay = MAX2(d, delay);
-		}
-
-		while (delay > n) {
-			struct ir3_instruction *nop = ir3_NOP(block);
-
-			/* move to before instr: */
-			list_delinit(&nop->node);
-			list_addtail(&nop->node, &instr->node);
-
-			n++;
-		}
-
-		/* we can bail once we hit worst case delay: */
-		if (++n > 6)
-			break;
-	}
-}
-
 int ir3_sched(struct ir3 *ir)
 {
 	struct ir3_sched_ctx ctx = {0};
@@ -865,10 +817,6 @@ int ir3_sched(struct ir3 *ir)
 		sched_block(&ctx, block);
 	}
 
-	foreach_block (block, &ir->block_list) {
-		sched_intra_block(&ctx, block);
-	}
-
 	if (ctx.error)
 		return -1;
 
-- 
2.11.0