From e9daead784921e453906853a4a78a2f3135af2e0 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Nov 2013 17:38:23 -0800 Subject: [PATCH] i965/fs: Try a different pre-scheduling heuristic if the first spills. Since LIFO fails on some shaders in one particular way, and non-LIFO systematically fails in another way on different kinds of shaders, try them both, and pick whichever one successfully register allocates first. Slightly prefer non-LIFO in case we produce extra dependencies in register allocation, since it should start out with fewer stalls than LIFO. This is madness, but I haven't come up with another way to get Unigine Tropics to not spill while keeping other programs from spilling and retaining the non-Unigine performance wins from texture-grf. total instructions in shared programs: 1626728 -> 1626288 (-0.03%) instructions in affected programs: 1015 -> 575 (-43.35%) GAINED: 50 LOST: 0 Improves Unigine Tropics performance by 14.5257% +/- 0.241838% (n=38) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445 Cc: "10.0" Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_fs.cpp | 25 +++++-- src/mesa/drivers/dri/i965/brw_fs.h | 4 +- src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 10 +-- .../drivers/dri/i965/brw_schedule_instructions.cpp | 85 ++++++++++++---------- src/mesa/drivers/dri/i965/brw_shader.h | 6 ++ 5 files changed, 76 insertions(+), 54 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index afa82c9abbf..f89390c346c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3286,15 +3286,28 @@ fs_visitor::run() assign_curb_setup(); assign_urb_setup(); - schedule_instructions(false); + schedule_instructions(SCHEDULE_PRE_NON_LIFO); if (0) assign_regs_trivial(); else { - while (!assign_regs()) { - if (failed) - break; - } + if (!assign_regs(false)) { + /* Try a non-spilling register allocation again with a different + * scheduling
heuristic. + */ + schedule_instructions(SCHEDULE_PRE_LIFO); + if (!assign_regs(false)) { + if (dispatch_width == 16) { + fail("Failure to register allocate. Reduce number of " + "live scalar values to avoid this."); + } else { + while (!assign_regs(true)) { + if (failed) + break; + } + } + } + } } } assert(force_uncompressed_stack == 0); @@ -3309,7 +3322,7 @@ fs_visitor::run() if (failed) return false; - schedule_instructions(true); + schedule_instructions(SCHEDULE_POST); if (dispatch_width == 8) { c->prog_data.reg_blocks = brw_register_blocks(grf_used); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index dcd5b19e4d7..529bd3a558a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -291,7 +291,7 @@ public: void assign_curb_setup(); void calculate_urb_setup(); void assign_urb_setup(); - bool assign_regs(); + bool assign_regs(bool allow_spilling); void assign_regs_trivial(); void get_used_mrfs(bool *mrf_used); void setup_payload_interference(struct ra_graph *g, int payload_reg_count, @@ -322,7 +322,7 @@ public: bool remove_dead_constants(); bool remove_duplicate_mrf_writes(); bool virtual_grf_interferes(int a, int b); - void schedule_instructions(bool post_reg_alloc); + void schedule_instructions(instruction_scheduler_mode mode); void insert_gen4_send_dependency_workarounds(); void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst); void insert_gen4_post_send_dependency_workarounds(fs_inst *inst); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index d9e80d07f48..8567afd3c16 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -417,7 +417,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node) } bool -fs_visitor::assign_regs() +fs_visitor::assign_regs(bool allow_spilling) { /* Most of this allocation was written for a 
reg_width of 1 * (dispatch_width == 8). In extending to 16-wide, the code was @@ -496,14 +496,10 @@ fs_visitor::assign_regs() if (reg == -1) { fail("no register to spill:\n"); dump_instructions(); - } else if (dispatch_width == 16) { - fail("Failure to register allocate. Reduce number of live scalar " - "values to avoid this."); - } else { - spill_reg(reg); + } else if (allow_spilling) { + spill_reg(reg); } - ralloc_free(g); return false; diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 5710380f12e..befea0a787d 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -391,14 +391,16 @@ schedule_node::set_latency_gen7(bool is_haswell) class instruction_scheduler { public: - instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc) + instruction_scheduler(backend_visitor *v, int grf_count, + instruction_scheduler_mode mode) { this->bv = v; this->mem_ctx = ralloc_context(NULL); this->grf_count = grf_count; this->instructions.make_empty(); this->instructions_to_schedule = 0; - this->post_reg_alloc = post_reg_alloc; + this->post_reg_alloc = (mode == SCHEDULE_POST); + this->mode = mode; this->time = 0; if (!post_reg_alloc) { this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count); @@ -447,6 +449,8 @@ public: exec_list instructions; backend_visitor *bv; + instruction_scheduler_mode mode; + /** * Number of instructions left to schedule that reference each vgrf. 
* @@ -467,7 +471,8 @@ public: class fs_instruction_scheduler : public instruction_scheduler { public: - fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc); + fs_instruction_scheduler(fs_visitor *v, int grf_count, + instruction_scheduler_mode mode); void calculate_deps(); bool is_compressed(fs_inst *inst); schedule_node *choose_instruction_to_schedule(); @@ -481,8 +486,8 @@ public: fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v, int grf_count, - bool post_reg_alloc) - : instruction_scheduler(v, grf_count, post_reg_alloc), + instruction_scheduler_mode mode) + : instruction_scheduler(v, grf_count, mode), v(v) { } @@ -569,7 +574,7 @@ public: vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v, int grf_count) - : instruction_scheduler(v, grf_count, true), + : instruction_scheduler(v, grf_count, SCHEDULE_POST), v(v) { } @@ -1179,40 +1184,42 @@ fs_instruction_scheduler::choose_instruction_to_schedule() continue; } - /* Prefer instructions that recently became available for scheduling. - * These are the things that are most likely to (eventually) make a - * variable dead and reduce register pressure. Typical register - * pressure estimates don't work for us because most of our pressure - * comes from texturing, where no single instruction to schedule will - * make a vec4 value dead. - */ - if (n->cand_generation > chosen->cand_generation) { - chosen = n; - continue; - } else if (n->cand_generation < chosen->cand_generation) { - continue; - } - - /* On MRF-using chips, prefer non-SEND instructions. If we don't do - * this, then because we prefer instructions that just became - * candidates, we'll end up in a pattern of scheduling a SEND, then - * the MRFs for the next SEND, then the next SEND, then the MRFs, - * etc., without ever consuming the results of a send. 
- */ - if (v->brw->gen < 7) { - fs_inst *chosen_inst = (fs_inst *)chosen->inst; - - /* We use regs_written > 1 as our test for the kind of send - * instruction to avoid -- only sends generate many regs, and a - * single-result send is probably actually reducing register - * pressure. + if (mode == SCHEDULE_PRE_LIFO) { + /* Prefer instructions that recently became available for + * scheduling. These are the things that are most likely to + * (eventually) make a variable dead and reduce register pressure. + * Typical register pressure estimates don't work for us because + * most of our pressure comes from texturing, where no single + * instruction to schedule will make a vec4 value dead. */ - if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) { + if (n->cand_generation > chosen->cand_generation) { chosen = n; continue; - } else if (inst->regs_written > chosen_inst->regs_written) { + } else if (n->cand_generation < chosen->cand_generation) { continue; } + + /* On MRF-using chips, prefer non-SEND instructions. If we don't + * do this, then because we prefer instructions that just became + * candidates, we'll end up in a pattern of scheduling a SEND, + * then the MRFs for the next SEND, then the next SEND, then the + * MRFs, etc., without ever consuming the results of a send. + */ + if (v->brw->gen < 7) { + fs_inst *chosen_inst = (fs_inst *)chosen->inst; + + /* We use regs_written > 1 as our test for the kind of send + * instruction to avoid -- only sends generate many regs, and a + * single-result send is probably actually reducing register + * pressure. 
+ */ + if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) { + chosen = n; + continue; + } else if (inst->regs_written > chosen_inst->regs_written) { + continue; + } + } } /* For instructions pushed on the cands list at the same time, prefer @@ -1407,18 +1414,18 @@ instruction_scheduler::run(exec_list *all_instructions) } void -fs_visitor::schedule_instructions(bool post_reg_alloc) +fs_visitor::schedule_instructions(instruction_scheduler_mode mode) { int grf_count; - if (post_reg_alloc) + if (mode == SCHEDULE_POST) grf_count = grf_used; else grf_count = virtual_grf_count; - fs_instruction_scheduler sched(this, grf_count, post_reg_alloc); + fs_instruction_scheduler sched(this, grf_count, mode); sched.run(&instructions); - if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) { + if (unlikely(INTEL_DEBUG & DEBUG_WM) && mode == SCHEDULE_POST) { printf("fs%d estimated execution time: %d cycles\n", dispatch_width, sched.time); } diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 88c23115e08..aba24c58b62 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -59,6 +59,12 @@ public: bool predicate_inverse; }; +enum instruction_scheduler_mode { + SCHEDULE_PRE_NON_LIFO, + SCHEDULE_PRE_LIFO, + SCHEDULE_POST, +}; + class backend_visitor : public ir_visitor { public: -- 2.11.0