From d56ae2d1605fc1b5a3fdf5aba9aefc3c7692a4ba Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 13 Jan 2016 20:33:15 -0800
Subject: [PATCH] i965: Apply VS attribute workarounds in NIR.

This patch re-implements the pre-Haswell VS attribute workarounds.
Instead of emitting shader code in the vec4 backend, we now simply
call a NIR pass to emit the necessary code.

This simplifies the vec4 backend.  Beyond deleting code, it removes
the primary use of ATTR as a destination.  It also eliminates the
requirement that the vec4 VS backend express the ATTR file in terms
of VERT_ATTRIB_* locations, giving us a bit more flexibility.

This approach is a little different: rather than munging the attributes
at the top, we emit code to fix them up when they're accessed.  However,
we run the optimizer afterwards, so CSE should eliminate the redundant
math.  It may even be able to fuse it with other calculations based on
the input value.

shader-db does not handle non-default NOS settings, so I have no
statistics about this patch.

Note that the scalar backend does not implement VS attribute
workarounds, as they are unnecessary on hardware which allows SIMD8 VS.

v2: Do one multiply for FIXED rescaling and select components from
    either the original or scaled copy, rather than multiplying each
    component separately (suggested by Matt Turner).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/mesa/drivers/dri/i965/Makefile.sources         |   1 +
 src/mesa/drivers/dri/i965/brw_nir.c                |  19 ++-
 src/mesa/drivers/dri/i965/brw_nir.h                |   7 +-
 .../dri/i965/brw_nir_attribute_workarounds.c       | 176 +++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp           |   2 +-
 src/mesa/drivers/dri/i965/brw_vec4.cpp             |   3 +
 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp         |   2 +-
 src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp  | 109 -------------
 8 files changed, 202 insertions(+), 117 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index caabb0decfb..300c13909d4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -43,6 +43,7 @@ i965_compiler_FILES = \
 	brw_nir.h \
 	brw_nir.c \
 	brw_nir_analyze_boolean_resolves.c \
+	brw_nir_attribute_workarounds.c \
 	brw_nir_opt_peephole_ffma.c \
 	brw_nir_uniforms.cpp \
 	brw_packed_float.c \
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 46b51163579..41059b3227e 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -205,7 +205,9 @@ remap_patch_urb_offsets(nir_block *block, void *closure)
 static void
 brw_nir_lower_inputs(nir_shader *nir,
                      const struct brw_device_info *devinfo,
-                     bool is_scalar)
+                     bool is_scalar,
+                     bool use_legacy_snorm_formula,
+                     const uint8_t *vs_attrib_wa_flags)
 {
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
@@ -225,6 +227,9 @@ brw_nir_lower_inputs(nir_shader *nir,
 
       add_const_offset_to_base(nir, nir_var_shader_in);
 
+      brw_nir_apply_attribute_workarounds(nir, use_legacy_snorm_formula,
+                                          vs_attrib_wa_flags);
+
       if (is_scalar) {
          /* Finally, translate VERT_ATTRIB_* values into the actual registers.
           *
@@ -501,12 +506,15 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar)
 nir_shader *
 brw_nir_lower_io(nir_shader *nir,
                  const struct brw_device_info *devinfo,
-                 bool is_scalar)
+                 bool is_scalar,
+                 bool use_legacy_snorm_formula,
+                 const uint8_t *vs_attrib_wa_flags)
 {
    bool progress; /* Written by OPT and OPT_V */
    (void)progress;
 
-   OPT_V(brw_nir_lower_inputs, devinfo, is_scalar);
+   OPT_V(brw_nir_lower_inputs, devinfo, is_scalar,
+         use_legacy_snorm_formula, vs_attrib_wa_flags);
    OPT_V(brw_nir_lower_outputs, devinfo, is_scalar);
    OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4);
 
@@ -617,9 +625,10 @@ brw_create_nir(struct brw_context *brw,
       OPT_V(nir_lower_atomics, shader_prog);
    }
 
-   if (nir->stage != MESA_SHADER_TESS_CTRL &&
+   if (nir->stage != MESA_SHADER_VERTEX &&
+       nir->stage != MESA_SHADER_TESS_CTRL &&
        nir->stage != MESA_SHADER_TESS_EVAL) {
-      nir = brw_nir_lower_io(nir, devinfo, is_scalar);
+      nir = brw_nir_lower_io(nir, devinfo, is_scalar, false, NULL);
    }
 
    return nir;
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 079d8b25174..9a90e36964b 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -84,11 +84,16 @@ nir_shader *brw_create_nir(struct brw_context *brw,
 nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar);
 nir_shader *brw_nir_lower_io(nir_shader *nir,
                             const struct brw_device_info *devinfo,
-                            bool is_scalar);
+                            bool is_scalar,
+                            bool use_legacy_snorm_formula,
+                            const uint8_t *vs_attrib_wa_flags);
 nir_shader *brw_postprocess_nir(nir_shader *nir,
                                 const struct brw_device_info *devinfo,
                                 bool is_scalar);
 
+bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
+                                         bool use_legacy_snorm_formula,
+                                         const uint8_t *attrib_wa_flags);
 
 nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
                                       const struct brw_device_info *devinfo,
diff --git a/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c b/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
new file mode 100644
index 00000000000..9c65e540d79
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+#include "brw_vs.h"
+
+/**
+ * Prior to Haswell, the hardware can't natively support GL_FIXED or
+ * 2_10_10_10_REV vertex formats.  This pass inserts extra shader code
+ * to produce the correct values.
+ */
+
+struct attr_wa_state { /* state handed to apply_attr_wa_block */
+   nir_builder builder;            /* re-initialized per function impl */
+   bool impl_progress;             /* set when a load_input is rewritten */
+   bool use_legacy_snorm_formula;  /* GL 3.2 SNORM formula, not ES 3.0 */
+   const uint8_t *wa_flags;        /* indexed by load_input const_index[0] */
+};
+
+static bool
+apply_attr_wa_block(nir_block *block, void *void_state)
+{
+   struct attr_wa_state *state = void_state; /* from nir_foreach_block */
+   nir_builder *b = &state->builder;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic != nir_intrinsic_load_input)
+         continue; /* only input loads are fixed up */
+
+      uint8_t wa_flags = state->wa_flags[intrin->const_index[0]];
+      if (wa_flags == 0)
+         continue; /* no workaround flagged for this input */
+
+      b->cursor = nir_after_instr(instr); /* emit fixups after the load */
+
+      nir_ssa_def *val = &intrin->dest.ssa; /* running fixed-up value */
+
+      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
+       * come in as floating point conversions of the integer values.
+       */
+      if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
+         nir_ssa_def *scaled =
+            nir_fmul(b, val, nir_imm_float(b, 1.0f / 65536.0f));
+         nir_ssa_def *comps[4];
+         for (int i = 0; i < val->num_components; i++) {
+            bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
+            comps[i] = nir_channel(b, rescale ? scaled : val, i);
+         }
+         val = nir_vec(b, comps, val->num_components);
+      }
+
+      /* Do sign recovery for 2101010 formats if required. */
+      if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+         /* sign recovery shift: <22, 22, 22, 30> */
+         nir_ssa_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
+         val = nir_ishr(b, nir_ishl(b, val, shift), shift);
+      }
+
+      /* Apply BGRA swizzle if required. */
+      if (wa_flags & BRW_ATTRIB_WA_BGRA) {
+         val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4, true);
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) { /* int -> normalized float */
+         /* ES 3.0 has different rules for converting signed normalized
+          * fixed-point numbers than desktop GL.
+          */
+         if ((wa_flags & BRW_ATTRIB_WA_SIGN) &&
+             !state->use_legacy_snorm_formula) {
+            /* According to equation 2.2 of the ES 3.0 specification,
+             * signed normalization conversion is done by:
+             *
+             * f = c / (2^(b-1)-1)
+             */
+            nir_ssa_def *es3_normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
+                               1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
+            val = nir_fmax(b,
+                           nir_fmul(b, nir_i2f(b, val), es3_normalize_factor),
+                           nir_imm_float(b, -1.0f));
+         } else {
+            /* The following equations are from the OpenGL 3.2 specification:
+             *
+             * 2.1 unsigned normalization
+             * f = c/(2^n-1)
+             *
+             * 2.2 signed normalization
+             * f = (2c+1)/(2^n-1)
+             *
+             * Both of these share a common divisor, which we handle by
+             * multiplying by 1 / (2^b - 1) for b = <10, 10, 10, 2>.
+             */
+            nir_ssa_def *normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
+                               1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2)  - 1));
+
+            if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+               /* For signed normalization, the numerator is 2c+1. */
+               nir_ssa_def *two = nir_imm_float(b, 2.0f);
+               nir_ssa_def *one = nir_imm_float(b, 1.0f);
+               val = nir_fadd(b, nir_fmul(b, nir_i2f(b, val), two), one);
+            } else {
+               /* For unsigned normalization, the numerator is just c. */
+               val = nir_u2f(b, val);
+            }
+            val = nir_fmul(b, val, normalize_factor);
+         }
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_SCALE) { /* int -> float, unnormalized */
+         val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f(b, val)
+                                               : nir_u2f(b, val);
+      }
+
+      nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, nir_src_for_ssa(val),
+                                     val->parent_instr);
+      state->impl_progress = true;
+   }
+
+   return true;
+}
+
+bool
+brw_nir_apply_attribute_workarounds(nir_shader *shader,
+                                    bool use_legacy_snorm_formula,
+                                    const uint8_t *attrib_wa_flags)
+{
+   bool progress = false;
+   struct attr_wa_state state = {
+      .use_legacy_snorm_formula = use_legacy_snorm_formula,
+      .wa_flags = attrib_wa_flags,
+   };
+
+   nir_foreach_function(shader, func) {
+      if (!func->impl)
+         continue; /* no implementation to walk */
+
+      nir_builder_init(&state.builder, func->impl);
+      state.impl_progress = false; /* tracked per function impl */
+
+      nir_foreach_block(func->impl, apply_attr_wa_block, &state);
+
+      if (state.impl_progress) { /* a load_input was rewritten in this impl */
+         nir_metadata_preserve(func->impl, nir_metadata_block_index |
+                                           nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress; /* true if any function impl was changed */
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 6a6efa9aea2..8518622c0b6 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -1229,7 +1229,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    nir->info.inputs_read = key->inputs_read;
    nir->info.patch_inputs_read = key->patch_inputs_read;
-   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar, false, NULL);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
    brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index e8bc2ec241f..109080af9a6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1992,6 +1992,9 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       is_scalar);
+   shader = brw_nir_lower_io(shader, compiler->devinfo, is_scalar,
+                             use_legacy_snorm_formula,
+                             key->gl_attrib_wa_flags);
    shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);
 
    const unsigned *assembly = NULL;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 0d56356a016..d8bb00f5d03 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -516,7 +516,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    nir->info.outputs_written = key->outputs_written;
    nir->info.patch_outputs_written = key->patch_outputs_written;
-   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar, false, NULL);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
    prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 1d6914902b3..f3cfc8892d3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -30,115 +30,6 @@ namespace brw {
 void
 vec4_vs_visitor::emit_prolog()
 {
-   dst_reg sign_recovery_shift;
-   dst_reg normalize_factor;
-   dst_reg es3_normalize_factor;
-
-   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
-      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
-         uint8_t wa_flags = key->gl_attrib_wa_flags[i];
-         dst_reg reg(ATTR, i);
-         dst_reg reg_d = reg;
-         reg_d.type = BRW_REGISTER_TYPE_D;
-         dst_reg reg_ud = reg;
-         reg_ud.type = BRW_REGISTER_TYPE_UD;
-
-         /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
-          * come in as floating point conversions of the integer values.
-          */
-         if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
-            dst_reg dst = reg;
-            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-            dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
-            emit(MUL(dst, src_reg(dst), brw_imm_f(1.0f / 65536.0f)));
-         }
-
-         /* Do sign recovery for 2101010 formats if required. */
-         if (wa_flags & BRW_ATTRIB_WA_SIGN) {
-            if (sign_recovery_shift.file == BAD_FILE) {
-               /* shift constant: <22,22,22,30> */
-               sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
-               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), brw_imm_ud(22u)));
-               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), brw_imm_ud(30u)));
-            }
-
-            emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
-            emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
-         }
-
-         /* Apply BGRA swizzle if required. */
-         if (wa_flags & BRW_ATTRIB_WA_BGRA) {
-            src_reg temp = src_reg(reg);
-            temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
-            emit(MOV(reg, temp));
-         }
-
-         if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
-            /* ES 3.0 has different rules for converting signed normalized
-             * fixed-point numbers than desktop GL.
-             */
-            if ((wa_flags & BRW_ATTRIB_WA_SIGN) && !use_legacy_snorm_formula) {
-               /* According to equation 2.2 of the ES 3.0 specification,
-                * signed normalization conversion is done by:
-                *
-                * f = c / (2^(b-1)-1)
-                */
-               if (es3_normalize_factor.file == BAD_FILE) {
-                  /* mul constant: 1 / (2^(b-1) - 1) */
-                  es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
-                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
-                           brw_imm_f(1.0f / ((1<<9) - 1))));
-                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
-                           brw_imm_f(1.0f / ((1<<1) - 1))));
-               }
-
-               dst_reg dst = reg;
-               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-               emit(MOV(dst, src_reg(reg_d)));
-               emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
-               emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), brw_imm_f(-1.0f));
-            } else {
-               /* The following equations are from the OpenGL 3.2 specification:
-                *
-                * 2.1 unsigned normalization
-                * f = c/(2^n-1)
-                *
-                * 2.2 signed normalization
-                * f = (2c+1)/(2^n-1)
-                *
-                * Both of these share a common divisor, which is represented by
-                * "normalize_factor" in the code below.
-                */
-               if (normalize_factor.file == BAD_FILE) {
-                  /* 1 / (2^b - 1) for b=<10,10,10,2> */
-                  normalize_factor = dst_reg(this, glsl_type::vec4_type);
-                  emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
-                           brw_imm_f(1.0f / ((1<<10) - 1))));
-                  emit(MOV(writemask(normalize_factor, WRITEMASK_W),
-                           brw_imm_f(1.0f / ((1<<2) - 1))));
-               }
-
-               dst_reg dst = reg;
-               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-               emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
-
-               /* For signed normalization, we want the numerator to be 2c+1. */
-               if (wa_flags & BRW_ATTRIB_WA_SIGN) {
-                  emit(MUL(dst, src_reg(dst), brw_imm_f(2.0f)));
-                  emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f)));
-               }
-
-               emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
-            }
-         }
-
-         if (wa_flags & BRW_ATTRIB_WA_SCALE) {
-            dst_reg dst = reg;
-            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-            emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
-         }
-      }
-   }
 }
 
 
-- 
2.11.0