From 514fd1c55e617bb325979cbee4a89f0727c3b567 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 12 Sep 2014 16:17:37 -0700 Subject: [PATCH] i965/fs: Use the GRF for FB writes on gen >= 7 On gen 7, the MRF was removed and we gained the ability to do send instructions directly from the GRF. This commit enables that functionality for FB writes. v2: Make handling of components more sane. i965/fs: Force a high register for the final FB write v2: Renamed the array for the range mappings and added a comment Signed-off-by: Jason Ekstrand Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_fs.cpp | 4 + src/mesa/drivers/dri/i965/brw_fs.h | 2 +- src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 34 ++++- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 162 +++++++++++++--------- src/mesa/drivers/dri/i965/brw_shader.cpp | 1 + src/mesa/drivers/dri/i965/intel_screen.h | 10 ++ 6 files changed, 142 insertions(+), 71 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 3277b586ede..b9bd94c3070 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -512,6 +512,8 @@ fs_inst::is_send_from_grf() const return true; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: return src[1].file == GRF; + case FS_OPCODE_FB_WRITE: + return src[0].file == GRF; default: if (is_tex()) return src[0].file == GRF; @@ -915,6 +917,8 @@ fs_inst::regs_read(fs_visitor *v, int arg) const { if (is_tex() && arg == 0 && src[0].file == GRF) { return mlen; + } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) { + return mlen; } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) { return mlen; } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) { diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 77d76807f37..49024d87849 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -523,7 +523,7 @@ public: const struct 
prog_instruction *fpi, fs_reg dst, fs_reg src0, fs_reg src1, fs_reg one); - void emit_color_write(fs_reg color, int index, int first_color_mrf); + int setup_color_payload(fs_reg *dst, fs_reg color, unsigned components); void emit_alpha_test(); fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2, fs_reg src0_alpha, unsigned components); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index ef5720c5368..63d9c05a575 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -113,6 +113,10 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) class_sizes[class_count++] = 8; } + memset(screen->wm_reg_sets[index].class_to_ra_reg_range, 0, + sizeof(screen->wm_reg_sets[index].class_to_ra_reg_range)); + int *class_to_ra_reg_range = screen->wm_reg_sets[index].class_to_ra_reg_range; + /* Compute the total number of registers across all classes. */ int ra_reg_count = 0; for (int i = 0; i < class_count; i++) { @@ -131,6 +135,14 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) } else { ra_reg_count += base_reg_count - (class_sizes[i] - 1); } + /* Mark the last register. We'll fill in the beginnings later. 
*/ + class_to_ra_reg_range[class_sizes[i]] = ra_reg_count; + } + + /* Fill out the rest of the range markers */ + for (int i = 1; i < 17; ++i) { + if (class_to_ra_reg_range[i] == 0) + class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1]; } uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count); @@ -505,9 +517,29 @@ fs_visitor::assign_regs(bool allow_spilling) } setup_payload_interference(g, payload_node_count, first_payload_node); - if (brw->gen >= 7) + if (brw->gen >= 7) { setup_mrf_hack_interference(g, first_mrf_hack_node); + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* When we do send-from-GRF for FB writes, we need to ensure that + * the last write instruction sends from a high register. This is + * because the vertex fetcher wants to start filling the low + * payload registers while the pixel data port is still working on + * writing out the memory. If we don't do this, we get rendering + * artifacts. + * + * We could just do "something high". Instead, we just pick the + * highest register that works. + */ + if (inst->opcode == FS_OPCODE_FB_WRITE && inst->eot) { + int size = virtual_grf_sizes[inst->src[0].reg]; + int reg = screen->wm_reg_sets[rsi].class_to_ra_reg_range[size] - 1; + ra_set_node_reg(g, inst->src[0].reg, reg); + break; + } + } + } + if (dispatch_width > 8) { /* In 16-wide dispatch we have an issue where a compressed * instruction is actually two instructions executed simultaneiously. 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index c4cc2e9caf8..9f65b1f4cd0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -2967,20 +2967,24 @@ fs_visitor::emit_interpolation_setup_gen6() this->current_annotation = NULL; } -void -fs_visitor::emit_color_write(fs_reg color, int index, int first_color_mrf) +int +fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components) { - assert(stage == MESA_SHADER_FRAGMENT); brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - int reg_width = dispatch_width / 8; fs_inst *inst; - fs_reg mrf; - /* If there's no color data to be written, skip it. */ - if (color.file == BAD_FILE) - return; + if (color.file == BAD_FILE) { + return 4 * (dispatch_width / 8); + } - color = offset(color, index); + uint8_t colors_enabled; + if (components == 0) { + /* We want to write one component to the alpha channel */ + colors_enabled = 0x8; + } else { + /* Enable the first components-many channels */ + colors_enabled = (1 << components) - 1; + } if (dispatch_width == 8 || brw->gen >= 6) { /* SIMD8 write looks like: @@ -2999,10 +3003,20 @@ fs_visitor::emit_color_write(fs_reg color, int index, int first_color_mrf) * m + 6: a0 * m + 7: a1 */ - inst = emit(MOV(fs_reg(MRF, first_color_mrf + index * reg_width, - color.type), - color)); - inst->saturate = key->clamp_fragment_color; + int len = 0; + for (unsigned i = 0; i < 4; ++i) { + if (colors_enabled & (1 << i)) { + dst[len] = fs_reg(GRF, virtual_grf_alloc(color.width / 8), + color.type, color.width); + inst = emit(MOV(dst[len], offset(color, i))); + inst->saturate = key->clamp_fragment_color; + } else if (color.width == 16) { + /* We need two BAD_FILE slots for a 16-wide color */ + len++; + } + len++; + } + return len; } else { /* pre-gen6 SIMD16 single source DP write looks like: * m + 0: r0 @@ -3014,26 +3028,19 @@ fs_visitor::emit_color_write(fs_reg color, 
int index, int first_color_mrf) * m + 6: b1 * m + 7: a1 */ - if (brw->has_compr4) { - /* By setting the high bit of the MRF register number, we - * indicate that we want COMPR4 mode - instead of doing the - * usual destination + 1 for the second half we get - * destination + 4. - */ - inst = emit(MOV(fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index, - color.type), - color)); - inst->saturate = key->clamp_fragment_color; - } else { - inst = emit(MOV(fs_reg(MRF, first_color_mrf + index, color.type), - color)); - inst->saturate = key->clamp_fragment_color; - - inst = emit(MOV(fs_reg(MRF, first_color_mrf + index + 4, color.type), - half(color, 1))); - inst->force_sechalf = true; - inst->saturate = key->clamp_fragment_color; + for (unsigned i = 0; i < 4; ++i) { + if (colors_enabled & (1 << i)) { + dst[i] = fs_reg(GRF, virtual_grf_alloc(1), color.type); + inst = emit(MOV(dst[i], half(offset(color, i), 0))); + inst->saturate = key->clamp_fragment_color; + + dst[i + 4] = fs_reg(GRF, virtual_grf_alloc(1), color.type); + inst = emit(MOV(dst[i + 4], half(offset(color, i), 1))); + inst->saturate = key->clamp_fragment_color; + inst->force_sechalf = true; + } } + return 8; } } @@ -3101,12 +3108,13 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, this->current_annotation = "FB write header"; bool header_present = true; + int reg_size = dispatch_width / 8; + /* We can potentially have a message length of up to 15, so we have to set * base_mrf to either 0 or 1 in order to fit in m0..m15. 
*/ - int base_mrf = 1; - int nr = base_mrf; - int reg_width = dispatch_width / 8; + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15); + int length = 0; /* From the Sandy Bridge PRM, volume 4, page 198: * @@ -3123,12 +3131,14 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, } if (header_present) - /* m2, m3 header */ - nr += 2; + /* Allocate 2 registers for a header */ + length += 2; if (payload.aa_dest_stencil_reg) { - emit(MOV(fs_reg(MRF, nr++), + sources[length] = fs_reg(GRF, virtual_grf_alloc(1)); + emit(MOV(sources[length], fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)))); + length++; } prog_data->uses_omask = @@ -3136,9 +3146,13 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, if (prog_data->uses_omask) { this->current_annotation = "FB write oMask"; assert(this->sample_mask.file != BAD_FILE); - /* Hand over gl_SampleMask. Only lower 16 bits are relevant. */ - emit(FS_OPCODE_SET_OMASK, fs_reg(MRF, nr, BRW_REGISTER_TYPE_UW), this->sample_mask); - nr += 1; + /* Hand over gl_SampleMask. Only lower 16 bits are relevant. Since + * it's unsigned single words, one vgrf is always 16-wide. + */ + sources[length] = fs_reg(GRF, virtual_grf_alloc(1), + BRW_REGISTER_TYPE_UW, 16); + emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask); + length++; } if (color0.file == BAD_FILE) { @@ -3146,28 +3160,20 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, * alpha out the pipeline to our null renderbuffer to support * alpha-testing, alpha-to-coverage, and so on. 
*/ - emit_color_write(this->outputs[0], 3, nr); - nr += 4 * reg_width; + length += setup_color_payload(sources + length, this->outputs[0], 0); } else if (color1.file == BAD_FILE) { if (src0_alpha.file != BAD_FILE) { - fs_inst *inst; - inst = emit(MOV(fs_reg(MRF, nr, src0_alpha.type), src0_alpha)); + sources[length] = fs_reg(GRF, virtual_grf_alloc(reg_size), + src0_alpha.type, src0_alpha.width); + fs_inst *inst = emit(MOV(sources[length], src0_alpha)); inst->saturate = key->clamp_fragment_color; - nr += reg_width; + length++; } - for (unsigned i = 0; i < components; i++) - emit_color_write(color0, i, nr); - - nr += 4 * reg_width; + length += setup_color_payload(sources + length, color0, components); } else { - for (unsigned i = 0; i < components; i++) - emit_color_write(color0, i, nr); - nr += 4 * reg_width; - - for (unsigned i = 0; i < components; i++) - emit_color_write(color1, i, nr); - nr += 4 * reg_width; + length += setup_color_payload(sources + length, color0, components); + length += setup_color_payload(sources + length, color1, components); } if (source_depth_to_render_target) { @@ -3180,33 +3186,51 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, no16("Missing support for simd16 depth writes on gen6\n"); } + sources[length] = fs_reg(this, glsl_type::float_type); if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { /* Hand over gl_FragDepth. */ assert(this->frag_depth.file != BAD_FILE); - emit(MOV(fs_reg(MRF, nr), this->frag_depth)); + emit(MOV(sources[length], this->frag_depth)); } else { /* Pass through the payload depth. 
*/ - emit(MOV(fs_reg(MRF, nr), + emit(MOV(sources[length], fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)))); } - nr += reg_width; + length++; } if (payload.dest_depth_reg) { - emit(MOV(fs_reg(MRF, nr), + sources[length] = fs_reg(this, glsl_type::float_type); + emit(MOV(sources[length], fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)))); - nr += reg_width; + length++; } - fs_inst *inst = emit(FS_OPCODE_FB_WRITE); - inst->base_mrf = base_mrf; - inst->mlen = nr - base_mrf; - inst->header_present = header_present; + fs_inst *load; + fs_inst *write; + if (brw->gen >= 7) { + /* Send from the GRF */ + fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F); + load = emit(LOAD_PAYLOAD(payload, sources, length)); + payload.reg = virtual_grf_alloc(load->regs_written); + load->dst = payload; + write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload); + write->base_mrf = -1; + } else { + /* Send from the MRF */ + load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), + sources, length)); + write = emit(FS_OPCODE_FB_WRITE); + write->base_mrf = 1; + } + + write->mlen = load->regs_written; + write->header_present = header_present; if ((brw->gen >= 8 || brw->is_haswell) && prog_data->uses_kill) { - inst->predicate = BRW_PREDICATE_NORMAL; - inst->flag_subreg = 1; + write->predicate = BRW_PREDICATE_NORMAL; + write->flag_subreg = 1; } - return inst; + return write; } void diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 92089dbf8a7..4f58f2865e1 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -738,6 +738,7 @@ backend_instruction::has_side_effects() const { switch (opcode) { case SHADER_OPCODE_UNTYPED_ATOMIC: + case FS_OPCODE_FB_WRITE: return true; default: return false; diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h index ea0be2c52da..393315ea292 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.h +++ 
b/src/mesa/drivers/dri/i965/intel_screen.h @@ -89,6 +89,16 @@ struct intel_screen int classes[16]; /** + * Mapping from classes to ra_reg ranges. Each of the per-size + * classes corresponds to a range of ra_reg nodes. This array stores + * those ranges in the form of first ra_reg in each class and the + * total number of ra_reg elements in the last array element. This + * way the range of the i'th class is given by: + * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) + */ + int class_to_ra_reg_range[17]; + + /** * Mapping for register-allocated objects in *regs to the first * GRF for that object. */ -- 2.11.0