From 662f1b48bd1a02907bb42ecda889a3aa52a5755d Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 11 Mar 2011 19:19:01 -0800 Subject: [PATCH] i965/fs: Add initial support for 16-wide dispatch on gen6. At this point it doesn't do uniforms, which have to be laid out the same between 8 and 16. Other than that, it supports everything but flow control, which was the thing that forced us to choose 8-wide for general GLSL support. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_context.h | 3 + src/mesa/drivers/dri/i965/brw_fs.cpp | 242 +++++++++++++++++++++--------- src/mesa/drivers/dri/i965/brw_fs.h | 13 ++ src/mesa/drivers/dri/i965/brw_wm.c | 13 +- src/mesa/drivers/dri/i965/brw_wm.h | 4 +- src/mesa/drivers/dri/i965/gen6_wm_state.c | 16 +- 6 files changed, 210 insertions(+), 81 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 1daa49abfb3..6bf8a1c83c7 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -204,13 +204,16 @@ struct brw_wm_prog_data { GLuint urb_read_length; GLuint first_curbe_grf; + GLuint first_curbe_grf_16; GLuint total_grf; + GLuint total_grf_16; GLuint total_scratch; GLuint nr_params; /**< number of float params/constants */ GLuint nr_pull_params; GLboolean error; int dispatch_width; + uint32_t prog_offset_16; /* Pointer to tracked values (only valid once * _mesa_load_state_parameters has been called at runtime). diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index bb71463bebc..8785957b6e6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -194,6 +194,32 @@ fs_visitor::fail(const char *format, ...) } } +void +fs_visitor::push_force_uncompressed() +{ + force_uncompressed_stack++; +} + +void +fs_visitor::pop_force_uncompressed() +{ + force_uncompressed_stack--; + assert(force_uncompressed_stack >= 0); +} + +void +fs_visitor::push_force_sechalf() +{ + force_sechalf_stack++; +} + +void +fs_visitor::pop_force_sechalf() +{ + force_sechalf_stack--; + assert(force_sechalf_stack >= 0); +} + /** * Returns how many MRFs an FS opcode will write over. * @@ -1738,6 +1764,10 @@ fs_visitor::visit(ir_if *ir) { fs_inst *inst; + if (c->dispatch_width == 16) { + fail("Can't support (non-uniform) control flow on 16-wide\n"); + } + /* Don't point the annotation at the if statement, because then it plus * the then and else blocks get printed. 
*/ @@ -1778,6 +1808,10 @@ fs_visitor::visit(ir_loop *ir) { fs_reg counter = reg_undef; + if (c->dispatch_width == 16) { + fail("Can't support (non-uniform) control flow on 16-wide\n"); + } + if (ir->counter) { this->base_ir = ir->counter; ir->counter->accept(this); @@ -1881,6 +1915,11 @@ fs_visitor::emit(fs_inst inst) fs_inst *list_inst = new(mem_ctx) fs_inst; *list_inst = inst; + if (force_uncompressed_stack > 0) + list_inst->force_uncompressed = true; + else if (force_sechalf_stack > 0) + list_inst->force_sechalf = true; + list_inst->annotation = this->current_annotation; list_inst->ir = this->base_ir; @@ -2006,6 +2045,7 @@ fs_visitor::emit_fb_writes() this->current_annotation = "FB write header"; GLboolean header_present = GL_TRUE; int nr = 0; + int reg_width = c->dispatch_width / 8; if (intel->gen >= 6 && !this->kill_emitted && @@ -2019,31 +2059,44 @@ fs_visitor::emit_fb_writes() } if (c->aa_dest_stencil_reg) { + push_force_uncompressed(); emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))); + pop_force_uncompressed(); } /* Reserve space for color. It'll be filled in per MRT below. */ int color_mrf = nr; - nr += 4; + nr += 4 * reg_width; if (c->source_depth_to_render_target) { + if (intel->gen == 6 && c->dispatch_width == 16) { + /* For outputting oDepth on gen6, SIMD8 writes have to be + * used. This would require 8-wide moves of each half to + * message regs, kind of like pre-gen5 SIMD16 FB writes. + * Just bail on doing so for now. + */ + fail("Missing support for simd16 depth writes on gen6\n"); + } + if (c->computes_depth) { /* Hand over gl_FragDepth. */ assert(this->frag_depth); fs_reg depth = *(variable_storage(this->frag_depth)); - emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth); + emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth); } else { /* Pass through the payload depth. */ - emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), + emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); } + nr += reg_width; } if (c->dest_depth_reg) { - emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), + emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))); + nr += reg_width; } fs_reg color = reg_undef; @@ -2060,7 +2113,7 @@ fs_visitor::emit_fb_writes() target); if (this->frag_color || this->frag_data) { for (int i = 0; i < 4; i++) { - emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color); + emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color); color.reg_offset++; } } @@ -2144,7 +2197,7 @@ fs_visitor::generate_fb_write(fs_inst *inst) brw_pop_insn_state(p); brw_fb_WRITE(p, - 8, /* dispatch_width */ + c->dispatch_width, inst->base_mrf, implied_header, inst->target, @@ -2608,8 +2661,12 @@ fs_visitor::setup_paramvalues_refs() void fs_visitor::assign_curb_setup() { - c->prog_data.first_curbe_grf = c->nr_payload_regs; c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; + if (c->dispatch_width == 8) { + c->prog_data.first_curbe_grf = c->nr_payload_regs; + } else { + c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; + } /* Map the offsets in the UNIFORM file to fixed HW regs. 
*/ foreach_iter(exec_list_iterator, iter, this->instructions) { @@ -2618,7 +2675,7 @@ fs_visitor::assign_curb_setup() for (unsigned int i = 0; i < 3; i++) { if (inst->src[i].file == UNIFORM) { int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; - struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf + + struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + constant_nr / 8, constant_nr % 8); @@ -2670,7 +2727,7 @@ fs_visitor::calculate_urb_setup() void fs_visitor::assign_urb_setup() { - int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length; + int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; /* Offset all the urb_setup[] index by the actual position of the * setup regs, now that the location of the constants has been chosen. @@ -3516,7 +3573,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) void fs_visitor::generate_code() { - int last_native_inst = 0; + int last_native_inst = p->nr_insn; const char *last_annotation_string = NULL; ir_instruction *last_annotation_ir = NULL; @@ -3532,8 +3589,8 @@ fs_visitor::generate_code() if (unlikely(INTEL_DEBUG & DEBUG_WM)) { - printf("Native code for fragment shader %d:\n", - ctx->Shader.CurrentFragmentProgram->Name); + printf("Native code for fragment shader %d (%d-wide dispatch):\n", + ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width); } foreach_iter(exec_list_iterator, iter, this->instructions) { @@ -3566,6 +3623,14 @@ fs_visitor::generate_code() brw_set_predicate_inverse(p, inst->predicate_inverse); brw_set_saturate(p, inst->saturate); + if (inst->force_uncompressed || c->dispatch_width == 8) { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + } else if (inst->force_sechalf) { + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + } else { + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + } + switch (inst->opcode) { case BRW_OPCODE_MOV: brw_MOV(p, dst, src[0]); @@ -3804,108 +3869,149 @@ fs_visitor::generate_code() } } -GLboolean -brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) +bool +fs_visitor::run() { - struct intel_context *intel = &brw->intel; - struct gl_context *ctx = &intel->ctx; - struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; + uint32_t prog_offset_16 = 0; - if (!prog) - return GL_FALSE; + brw_wm_payload_setup(brw, c); - struct brw_shader *shader = - (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; - if (!shader) - return GL_FALSE; + if (c->dispatch_width == 16) { + if (c->prog_data.curb_read_length) { + /* Haven't hooked in support for uniforms through the 16-wide + * version yet. + */ + return GL_FALSE; + } - /* We always use 8-wide mode, at least for now. For one, flow - * control only works in 8-wide. Also, when we're fragment shader - * bound, we're almost always under register pressure as well, so - * 8-wide would save us from the performance cliff of spilling - * regs. - */ - c->dispatch_width = 8; + /* align to 64 byte boundary. */ + while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { + brw_NOP(p); + } - if (unlikely(INTEL_DEBUG & DEBUG_WM)) { - printf("GLSL IR for native fragment shader %d:\n", prog->Name); - _mesa_print_ir(shader->ir, NULL); - printf("\n"); - } + /* Save off the start of this 16-wide program in case we succeed. */ + prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); - /* Now the main event: Visit the shader IR and generate our FS IR for it. 
- */ - fs_visitor v(c, shader); + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + } if (0) { - v.emit_dummy_fs(); + emit_dummy_fs(); } else { - v.calculate_urb_setup(); + calculate_urb_setup(); if (intel->gen < 6) - v.emit_interpolation_setup_gen4(); + emit_interpolation_setup_gen4(); else - v.emit_interpolation_setup_gen6(); + emit_interpolation_setup_gen6(); /* Generate FS IR for main(). (the visitor only descends into * functions called "main"). */ foreach_iter(exec_list_iterator, iter, *shader->ir) { ir_instruction *ir = (ir_instruction *)iter.get(); - v.base_ir = ir; - ir->accept(&v); + base_ir = ir; + ir->accept(this); } - v.emit_fb_writes(); + emit_fb_writes(); - v.split_virtual_grfs(); + split_virtual_grfs(); - v.setup_paramvalues_refs(); - v.setup_pull_constants(); + setup_paramvalues_refs(); + setup_pull_constants(); bool progress; do { progress = false; - progress = v.remove_duplicate_mrf_writes() || progress; + progress = remove_duplicate_mrf_writes() || progress; - progress = v.propagate_constants() || progress; - progress = v.register_coalesce() || progress; - progress = v.compute_to_mrf() || progress; - progress = v.dead_code_eliminate() || progress; + progress = propagate_constants() || progress; + progress = register_coalesce() || progress; + progress = compute_to_mrf() || progress; + progress = dead_code_eliminate() || progress; } while (progress); - v.schedule_instructions(); + schedule_instructions(); - v.assign_curb_setup(); - v.assign_urb_setup(); + assign_curb_setup(); + assign_urb_setup(); if (0) { /* Debug of register spilling: Go spill everything. */ - int virtual_grf_count = v.virtual_grf_next; + int virtual_grf_count = virtual_grf_next; for (int i = 1; i < virtual_grf_count; i++) { - v.spill_reg(i); + spill_reg(i); } } if (0) - v.assign_regs_trivial(); + assign_regs_trivial(); else { - while (!v.assign_regs()) { - if (v.failed) + while (!assign_regs()) { + if (failed) break; } } } + assert(force_uncompressed_stack == 0); + assert(force_sechalf_stack == 0); - if (!v.failed) - v.generate_code(); - - assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */ + if (!failed) + generate_code(); - if (v.failed) + if (failed) return GL_FALSE; - c->prog_data.total_grf = v.grf_used; + if (c->dispatch_width == 8) { + c->prog_data.total_grf = grf_used; + } else { + c->prog_data.total_grf_16 = grf_used; + c->prog_data.prog_offset_16 = prog_offset_16; + } + + return !failed; +} - return GL_TRUE; +bool +brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) +{ + struct intel_context *intel = &brw->intel; + struct gl_context *ctx = &intel->ctx; + struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; + + if (!prog) + return false; + + struct brw_shader *shader = + (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; + if (!shader) + return false; + + if (unlikely(INTEL_DEBUG & DEBUG_WM)) { + printf("GLSL IR for native fragment shader %d:\n", prog->Name); + _mesa_print_ir(shader->ir, NULL); + printf("\n"); + } + + /* Now the main event: Visit the shader IR and generate our FS IR for it. + */ + c->dispatch_width = 8; + + fs_visitor v(c, shader); + if (!v.run()) { + /* FINISHME: Cleanly fail, test at link time, etc. 
*/ + assert(!"not reached"); + return false; + } + + if (intel->gen >= 6) { + c->dispatch_width = 16; + fs_visitor v2(c, shader); + v2.run(); + } + + c->prog_data.dispatch_width = 8; + + return true; } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index fd83fcb3829..b158992071e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -343,6 +343,8 @@ public: bool eot; bool header_present; bool shadow_compare; + bool force_uncompressed; + bool force_sechalf; uint32_t offset; /* spill/unspill offset */ /** @{ @@ -405,6 +407,8 @@ public: this->live_intervals_valid = false; this->kill_emitted = false; + this->force_uncompressed_stack = 0; + this->force_sechalf_stack = 0; } ~fs_visitor() @@ -461,6 +465,7 @@ public: return emit(fs_inst(opcode, dst, src0, src1, src2)); } + bool run(); void setup_paramvalues_refs(); void assign_curb_setup(); void calculate_urb_setup(); @@ -481,6 +486,11 @@ public: void schedule_instructions(); void fail(const char *msg, ...); + void push_force_uncompressed(); + void pop_force_uncompressed(); + void push_force_sechalf(); + void pop_force_sechalf(); + void generate_code(); void generate_fb_write(fs_inst *inst); void generate_pixel_xy(struct brw_reg dst, bool is_x); @@ -568,6 +578,9 @@ public: fs_reg reg_null_cmp; int grf_used; + + int force_uncompressed_stack; + int force_sechalf_stack; }; GLboolean brw_do_channel_expressions(struct exec_list *instructions); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index c4b2157db55..4564fb6b1ad 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -120,7 +120,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) brw_wm_emit(c); } -static void +void brw_wm_payload_setup(struct brw_context *brw, struct brw_wm_compile *c) { @@ -225,18 +225,13 @@ static void do_wm_prog( struct brw_context *brw, brw_init_compile(brw, &c->func); - brw_wm_payload_setup(brw, c); - if (!brw_wm_fs_emit(brw, c)) { - /* - * Shader which use GLSL features such as flow control are handled - * differently from "simple" shaders. - */ + /* Fallback for fixed function and ARB_fp shaders. */ c->dispatch_width = 16; brw_wm_payload_setup(brw, c); brw_wm_non_glsl_emit(brw, c); + c->prog_data.dispatch_width = 16; } - c->prog_data.dispatch_width = c->dispatch_width; /* Scratch space is used for register spilling */ if (c->last_scratch) { @@ -467,7 +462,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw) struct brw_wm_prog_key key; struct brw_fragment_program *fp = (struct brw_fragment_program *) brw->fragment_program; - + brw_wm_populate_key(brw, &key); /* Make an early check for the key. 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index 5d1e4045928..8e5a9cdb86c 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -314,7 +314,7 @@ void brw_wm_print_program( struct brw_wm_compile *c, void brw_wm_lookup_iz(struct intel_context *intel, struct brw_wm_compile *c); -GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c); +bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c); /* brw_wm_emit.c */ void emit_alu1(struct brw_compile *p, @@ -474,5 +474,7 @@ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint bool brw_color_buffer_write_enabled(struct brw_context *brw); bool brw_render_target_supported(gl_format format); +void brw_wm_payload_setup(struct brw_context *brw, + struct brw_wm_compile *c); #endif diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 8215cb15a9c..d4fca788cb9 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -143,14 +143,19 @@ upload_wm_state(struct brw_context *brw) dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT; dw4 |= (brw->wm.prog_data->first_curbe_grf << GEN6_WM_DISPATCH_START_GRF_SHIFT_0); + dw4 |= (brw->wm.prog_data->first_curbe_grf_16 << + GEN6_WM_DISPATCH_START_GRF_SHIFT_2); dw5 |= (brw->wm_max_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; /* CACHE_NEW_WM_PROG */ - if (brw->wm.prog_data->dispatch_width == 8) + if (brw->wm.prog_data->dispatch_width == 8) { dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - else + if (brw->wm.prog_data->prog_offset_16) + dw5 |= GEN6_WM_16_DISPATCH_ENABLE; + } else { dw5 |= GEN6_WM_16_DISPATCH_ENABLE; + } /* _NEW_LINE */ if (ctx->Line.StippleFlag) @@ -194,7 +199,12 @@ upload_wm_state(struct brw_context *brw) OUT_BATCH(dw5); OUT_BATCH(dw6); OUT_BATCH(0); /* kernel 1 pointer */ - OUT_BATCH(0); /* kernel 2 pointer */ + if (brw->wm.prog_data->prog_offset_16) { + OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, + brw->wm.prog_data->prog_offset_16); + } else { + OUT_BATCH(0); /* kernel 2 pointer */ + } ADVANCE_BATCH(); } -- 2.11.0
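
Editor's note (not part of the patch): the net effect of the changes above is that GLSL shaders always get an 8-wide kernel, a 16-wide kernel is compiled opportunistically on gen6+, and 3DSTATE_WM then enables 16-wide dispatch only when that second compile produced a kernel (prog_offset_16 != 0). The standalone C++ sketch below mirrors just that dispatch-enable decision from upload_wm_state(); the bit values and the wm_dispatch_bits() helper are made-up placeholders for illustration, not the real GEN6_WM_* defines or any Mesa API.

```c++
// Illustrative sketch only: reproduces the dispatch-enable logic the patch
// adds to upload_wm_state() in gen6_wm_state.c.  The bit positions below are
// placeholders standing in for the real GEN6_WM_*_DISPATCH_ENABLE defines.
#include <cstdint>
#include <cstdio>

constexpr uint32_t WM_8_DISPATCH_ENABLE  = 1u << 0;  /* placeholder bit */
constexpr uint32_t WM_16_DISPATCH_ENABLE = 1u << 1;  /* placeholder bit */

/* dispatch_width: width of the primary kernel (8 on the GLSL path).
 * prog_offset_16: nonzero iff a 16-wide kernel was also generated.   */
static uint32_t wm_dispatch_bits(int dispatch_width, uint32_t prog_offset_16)
{
   uint32_t dw5 = 0;

   if (dispatch_width == 8) {
      dw5 |= WM_8_DISPATCH_ENABLE;
      /* The 16-wide GLSL kernel is optional; only advertise it when the
       * second compile (fs_visitor::run() at dispatch_width == 16)
       * actually succeeded. */
      if (prog_offset_16)
         dw5 |= WM_16_DISPATCH_ENABLE;
   } else {
      /* Fixed-function / ARB_fp fallback path is 16-wide only. */
      dw5 |= WM_16_DISPATCH_ENABLE;
   }
   return dw5;
}

int main()
{
   printf("GLSL, 16-wide failed:  0x%x\n", (unsigned) wm_dispatch_bits(8, 0));
   printf("GLSL, 16-wide present: 0x%x\n", (unsigned) wm_dispatch_bits(8, 4096));
   printf("non-GLSL fallback:     0x%x\n", (unsigned) wm_dispatch_bits(16, 256));
   return 0;
}
```

Read alongside the patch, this is why a failed 16-wide compile (flow control, uniforms, or gen6 oDepth writes) is harmless: prog_offset_16 stays 0, only the 8-wide kernel pointer and start GRF are used, and the hardware never sees a 16-wide program it cannot run.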