From 2cf59861a8128a91bfdd6fe62bf69cb4593373e3 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 22 May 2018 18:19:16 -0700
Subject: [PATCH] nir: Add partial redundancy elimination for compares

This pass attempts to dectect code sequences like

    if (x < y) {
        z = y - x;
	...
    }

and replace them with sequences like

    t = x - y;
    if (t < 0) {
        z = -t;
	...
    }

On architectures where the subtract can generate the flags used by the
if-statement, this saves an instruction.  It's also possible that moving
an instruction out of the if-statement will allow
nir_opt_peephole_select to convert the whole thing to a bcsel.

Currently only floating point compares and adds are supported.  Adding
support for integer will be a challenge due to integer overflow.  There
are a couple possible solutions, but they may not apply to all
architectures.

v2: Fix a typo in the commit message and a couple typos in comments.
Fix possible NULL pointer deref from result of push_block().  Add
missing (-A + B) case.  Suggested by Caio.

v3: Fix is_not_const_zero to work correctly with types other than
nir_type_float32.  Suggested by Ken.

v4: Add some comments explaining how this works.  Suggested by Ken.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/compiler/Makefile.sources             |   1 +
 src/compiler/nir/meson.build              |   1 +
 src/compiler/nir/nir.h                    |   2 +
 src/compiler/nir/nir_opt_comparison_pre.c | 383 ++++++++++++++++++++++++++++++
 src/compiler/nir/nir_search_helpers.h     |  27 +++
 5 files changed, 414 insertions(+)
 create mode 100644 src/compiler/nir/nir_opt_comparison_pre.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index e542d86a37a..5fddb6d4db2 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -278,6 +278,7 @@ NIR_FILES = \
 	nir/nir_move_vec_src_uses_to_dest.c \
 	nir/nir_normalize_cubemap_coords.c \
 	nir/nir_opt_combine_stores.c \
+	nir/nir_opt_comparison_pre.c \
 	nir/nir_opt_conditional_discard.c \
 	nir/nir_opt_constant_folding.c \
 	nir/nir_opt_copy_prop_vars.c \
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 0900f56648e..c65f2ff62ff 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -160,6 +160,7 @@ files_libnir = files(
   'nir_move_vec_src_uses_to_dest.c',
   'nir_normalize_cubemap_coords.c',
   'nir_opt_combine_stores.c',
+  'nir_opt_comparison_pre.c',
   'nir_opt_conditional_discard.c',
   'nir_opt_constant_folding.c',
   'nir_opt_copy_prop_vars.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 3ddf97bb12c..806e47dd7bb 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3401,6 +3401,8 @@ bool nir_lower_phis_to_regs_block(nir_block *block);
 bool nir_lower_ssa_defs_to_regs_block(nir_block *block);
 bool nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl);
 
+bool nir_opt_comparison_pre(nir_shader *shader);
+
 bool nir_opt_algebraic(nir_shader *shader);
 bool nir_opt_algebraic_before_ffma(nir_shader *shader);
 bool nir_opt_algebraic_late(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_comparison_pre.c b/src/compiler/nir/nir_opt_comparison_pre.c
new file mode 100644
index 00000000000..ab31a2bf554
--- /dev/null
+++ b/src/compiler/nir/nir_opt_comparison_pre.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright Â© 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_instr_set.h"
+#include "nir_search_helpers.h"
+#include "nir_builder.h"
+#include "util/u_vector.h"
+
+/* Partial redundancy elimination of compares
+ *
+ * Seaches for comparisons of the form 'a cmp b' that dominate arithmetic
+ * instructions like 'b - a'.  The comparison is replaced by the arithmetic
+ * instruction, and the result is compared with zero.  For example,
+ *
+ *       vec1 32 ssa_111 = flt 0.37, ssa_110.w
+ *       if ssa_111 {
+ *               block block_1:
+ *              vec1 32 ssa_112 = fadd ssa_110.w, -0.37
+ *              ...
+ *
+ * becomes
+ *
+ *       vec1 32 ssa_111 = fadd ssa_110.w, -0.37
+ *       vec1 32 ssa_112 = flt 0.0, ssa_111
+ *       if ssa_112 {
+ *               block block_1:
+ *              ...
+ */
+
+struct block_queue {
+   /**
+    * Stack of blocks from the current location in the CFG to the entry point
+    * of the function.
+    *
+    * This is sort of a poor man's dominator tree.
+    */
+   struct exec_list blocks;
+
+   /** List of freed block_instructions structures that can be reused. */
+   struct exec_list reusable_blocks;
+};
+
+struct block_instructions {
+   struct exec_node node;
+
+   /**
+    * Set of comparison instructions from the block that are candidates for
+    * being replaced by add instructions.
+    */
+   struct u_vector instructions;
+};
+
+static void
+block_queue_init(struct block_queue *bq)
+{
+   exec_list_make_empty(&bq->blocks);
+   exec_list_make_empty(&bq->reusable_blocks);
+}
+
+static void
+block_queue_finish(struct block_queue *bq)
+{
+   struct block_instructions *n;
+
+   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
+      u_vector_finish(&n->instructions);
+      free(n);
+   }
+
+   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
+      free(n);
+   }
+}
+
+static struct block_instructions *
+push_block(struct block_queue *bq)
+{
+   struct block_instructions *bi =
+      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);
+
+   if (bi == NULL) {
+      bi = calloc(1, sizeof(struct block_instructions));
+
+      if (bi == NULL)
+         return NULL;
+   }
+
+   if (!u_vector_init(&bi->instructions,
+                      sizeof(struct nir_alu_instr *),
+                      8 * sizeof(struct nir_alu_instr *)))
+      return NULL;
+
+   exec_list_push_tail(&bq->blocks, &bi->node);
+
+   return bi;
+}
+
+static void
+pop_block(struct block_queue *bq, struct block_instructions *bi)
+{
+   u_vector_finish(&bi->instructions);
+   exec_node_remove(&bi->node);
+   exec_list_push_head(&bq->reusable_blocks, &bi->node);
+}
+
+static void
+add_instruction_for_block(struct block_instructions *bi,
+                          struct nir_alu_instr *alu)
+{
+   struct nir_alu_instr **data =
+      u_vector_add(&bi->instructions);
+
+   *data = alu;
+}
+
+static void
+rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
+                            nir_alu_instr *orig_add, bool zero_on_left)
+{
+   void *const mem_ctx = ralloc_parent(orig_cmp);
+
+   bld->cursor = nir_before_instr(&orig_cmp->instr);
+
+   /* This is somewhat tricky.  The compare instruction may be something like
+    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
+    * b).  This is problematic because the SSA value for the fneg(a) may not
+    * exist yet at the compare instruction.
+    *
+    * We fabricate the operands of the new add.  This is done using
+    * information provided by zero_on_left.  If zero_on_left is true, we know
+    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
+    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
+    * zero_on_left is false, the resulting compare instruction is (fcmp,
+    * (fadd, x, y), 0.0) and x = a and y = -b.
+    */
+   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
+   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);
+
+   nir_ssa_def *const fadd = zero_on_left
+      ? nir_fadd(bld, b, nir_fneg(bld, a))
+      : nir_fadd(bld, a, nir_fneg(bld, b));
+
+   nir_ssa_def *const zero =
+      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);
+
+   nir_ssa_def *const cmp = zero_on_left
+      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
+      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);
+
+   /* Generating extra moves of the results is the easy way to make sure the
+    * writemasks match the original instructions.  Later optimization passes
+    * will clean these up.  This is similar to nir_replace_instr (in
+    * nir_search.c).
+    */
+   nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_imov);
+   mov_add->dest.write_mask = orig_add->dest.write_mask;
+   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
+                     orig_add->dest.dest.ssa.num_components,
+                     orig_add->dest.dest.ssa.bit_size, NULL);
+   mov_add->src[0].src = nir_src_for_ssa(fadd);
+
+   nir_builder_instr_insert(bld, &mov_add->instr);
+
+   nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_imov);
+   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
+   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
+                     orig_cmp->dest.dest.ssa.num_components,
+                     orig_cmp->dest.dest.ssa.bit_size, NULL);
+   mov_cmp->src[0].src = nir_src_for_ssa(cmp);
+
+   nir_builder_instr_insert(bld, &mov_cmp->instr);
+
+   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
+                            nir_src_for_ssa(&mov_cmp->dest.dest.ssa));
+   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
+                            nir_src_for_ssa(&mov_add->dest.dest.ssa));
+
+   /* We know these have no more uses because we just rewrote them all, so we
+    * can remove them.
+    */
+   nir_instr_remove(&orig_cmp->instr);
+   nir_instr_remove(&orig_add->instr);
+}
+
+static bool
+comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
+{
+   bool progress = false;
+
+   struct block_instructions *bi = push_block(bq);
+   if (bi == NULL)
+      return false;
+
+   /* Starting with the current block, examine each instruction.  If the
+    * instruction is a comparison that matches the 'Â±a cmp Â±b' pattern, add it
+    * to the block_instructions::instructions set.  If the instruction is an
+    * add instruction, walk up the block queue looking at the stored
+    * instructions.  If a matching comparison is found, move the addition and
+    * replace the comparison with a different comparison based on the result
+    * of the addition.  All of the blocks in the queue are guaranteed to be
+    * dominators of the current block.
+    *
+    * After processing the current block, recurse into the blocks dominated by
+    * the current block.
+    */
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_alu)
+         continue;
+
+      struct nir_alu_instr *const alu = nir_instr_as_alu(instr);
+
+      if (alu->dest.dest.ssa.num_components != 1)
+         continue;
+
+      if (alu->dest.saturate)
+         continue;
+
+      static const uint8_t swizzle[4] = { 0, 0, 0, 0 };
+
+      switch (alu->op) {
+      case nir_op_fadd: {
+         /* If the instruction is fadd, check it against comparison
+          * instructions that dominate it.
+          */
+         struct block_instructions *b =
+            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);
+
+         while (b->node.next != NULL) {
+            nir_alu_instr **a;
+            bool rewrote_compare = false;
+
+            u_vector_foreach(a, &b->instructions) {
+               nir_alu_instr *const cmp = *a;
+
+               if (cmp == NULL)
+                  continue;
+
+               /* The operands of both instructions are, with some liberty,
+                * commutative.  Check all four permutations.  The third and
+                * fourth permutations are negations of the first two.
+                */
+               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
+                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
+                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
+                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
+                  /* These are the cases where (A cmp B) matches either (A +
+                   * -B) or (-B + A)
+                   *
+                   *    A cmp B <=> A + -B cmp 0
+                   */
+                  rewrite_compare_instruction(bld, cmp, alu, false);
+
+                  *a = NULL;
+                  rewrote_compare = true;
+                  break;
+               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
+                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
+                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
+                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
+                  /* This is the case where (A cmp B) matches (B + -A) or (-A
+                   * + B).
+                   *
+                   *    A cmp B <=> 0 cmp B + -A
+                   */
+                  rewrite_compare_instruction(bld, cmp, alu, true);
+
+                  *a = NULL;
+                  rewrote_compare = true;
+                  break;
+               }
+            }
+
+            /* Bail after a compare in the most dominating block is found.
+             * This is necessary because 'alu' has been removed from the
+             * instruction stream.  Should there be a matching compare in
+             * another block, calling rewrite_compare_instruction again will
+             * try to operate on a node that is not in the list as if it were
+             * in the list.
+             *
+             * FINISHME: There may be opportunity for additional optimization
+             * here.  I discovered this problem due to a shader in Guacamelee.
+             * It may be possible to rewrite the matching compares that are
+             * encountered later to reuse the result from the compare that was
+             * first rewritten.  It's also possible that this is just taken
+             * care of by calling the optimization pass repeatedly.
+             */
+            if (rewrote_compare) {
+               progress = true;
+               break;
+            }
+
+            b = (struct block_instructions *) b->node.next;
+         }
+
+         break;
+      }
+
+      case nir_op_flt:
+      case nir_op_fge:
+      case nir_op_fne:
+      case nir_op_feq:
+         /* If the instruction is a comparison that is used by an if-statement
+          * and neither operand is immediate value 0, add it to the set.
+          */
+         if (is_used_by_if(alu) &&
+             is_not_const_zero(alu, 0, 1, swizzle) &&
+             is_not_const_zero(alu, 1, 1, swizzle))
+            add_instruction_for_block(bi, alu);
+
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   for (unsigned i = 0; i < block->num_dom_children; i++) {
+      nir_block *child = block->dom_children[i];
+
+      if (comparison_pre_block(child, bq, bld))
+         progress = true;
+   }
+
+   pop_block(bq, bi);
+
+   return progress;
+}
+
+static bool
+nir_opt_comparison_pre_impl(nir_function_impl *impl)
+{
+   struct block_queue bq;
+   nir_builder bld;
+
+   block_queue_init(&bq);
+   nir_builder_init(&bld, impl);
+
+   nir_metadata_require(impl, nir_metadata_dominance);
+
+   const bool progress =
+      comparison_pre_block(nir_start_block(impl), &bq, &bld);
+
+   block_queue_finish(&bq);
+
+   if (progress)
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+
+   return progress;
+}
+
+bool
+nir_opt_comparison_pre(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (function->impl)
+         progress |= nir_opt_comparison_pre_impl(function->impl);
+   }
+
+   return progress;
+}
diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h
index 456de81e175..1624508993d 100644
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -110,6 +110,33 @@ is_zero_to_one(nir_alu_instr *instr, unsigned src, unsigned num_components,
 }
 
 static inline bool
+is_not_const_zero(nir_alu_instr *instr, unsigned src, unsigned num_components,
+                  const uint8_t *swizzle)
+{
+   if (nir_src_as_const_value(instr->src[src].src) == NULL)
+      return true;
+
+   for (unsigned i = 0; i < num_components; i++) {
+      switch (nir_op_infos[instr->op].input_types[src]) {
+      case nir_type_float:
+         if (nir_src_comp_as_float(instr->src[src].src, swizzle[i]) == 0.0)
+            return false;
+         break;
+      case nir_type_bool:
+      case nir_type_int:
+      case nir_type_uint:
+         if (nir_src_comp_as_uint(instr->src[src].src, swizzle[i]) == 0)
+            return false;
+         break;
+      default:
+         return false;
+      }
+   }
+
+   return true;
+}
+
+static inline bool
 is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components,
              UNUSED const uint8_t *swizzle)
 {
-- 
2.11.0