2 * Copyright © 2010 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
33 #include <sys/types.h>
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "program/prog_parameter.h"
39 #include "program/prog_print.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
47 #include "brw_shader.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
52 #define MAX_INSTRUCTION (1 << 30)
/* Returns the number of scalar components (register slots) a GLSL type
 * occupies: vectors/scalars contribute components(), arrays and structs
 * recurse over their elements, and samplers contribute no register space.
 */
55 fs_visitor::type_size(const struct glsl_type *type)
59 switch (type->base_type) {
   /* Scalar/vector case: one slot per component. */
64 return type->components();
   /* Array case: element size times element count, computed recursively. */
66 return type_size(type->fields.array) * type->length;
67 case GLSL_TYPE_STRUCT:
   /* Struct case: sum the sizes of all fields. */
69 for (i = 0; i < type->length; i++) {
70 size += type_size(type->fields.structure[i].type);
73 case GLSL_TYPE_SAMPLER:
74 /* Samplers take up no register space, since they're baked in at
   /* Any other base type is unexpected here. */
79 assert(!"not reached");
/* Record a fatal compile failure for this shader.
 *
 * Formats the printf-style message, prefixes it with "FS compile failed:",
 * and stores it in fail_msg for the caller to report.  The message is also
 * echoed to stderr when fragment-shader debugging (DEBUG_WM) is enabled.
 */
85 fs_visitor::fail(const char *format, ...)
96 msg = ralloc_vasprintf(mem_ctx, format, va);
98 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
100 this->fail_msg = msg;
102 if (INTEL_DEBUG & DEBUG_WM) {
103 fprintf(stderr, "%s", msg);
108 fs_visitor::push_force_uncompressed()
110 force_uncompressed_stack++;
114 fs_visitor::pop_force_uncompressed()
116 force_uncompressed_stack--;
117 assert(force_uncompressed_stack >= 0);
121 fs_visitor::push_force_sechalf()
123 force_sechalf_stack++;
127 fs_visitor::pop_force_sechalf()
129 force_sechalf_stack--;
130 assert(force_sechalf_stack >= 0);
134 * Returns how many MRFs an FS opcode will write over.
136 * Note that this is not the 0 or 1 implied writes in an actual gen
137 * instruction -- the FS opcodes often generate MOVs in addition.
140 fs_visitor::implied_mrf_writes(fs_inst *inst)
145 switch (inst->opcode) {
   /* Unary math functions: one source-payload MRF per 8 channels of
    * dispatch width.
    */
146 case SHADER_OPCODE_RCP:
147 case SHADER_OPCODE_RSQ:
148 case SHADER_OPCODE_SQRT:
149 case SHADER_OPCODE_EXP2:
150 case SHADER_OPCODE_LOG2:
151 case SHADER_OPCODE_SIN:
152 case SHADER_OPCODE_COS:
153 return 1 * c->dispatch_width / 8;
   /* Binary math functions: two source-payload MRFs per 8 channels. */
154 case SHADER_OPCODE_POW:
155 case SHADER_OPCODE_INT_QUOTIENT:
156 case SHADER_OPCODE_INT_REMAINDER:
157 return 2 * c->dispatch_width / 8;
   /* Texturing, framebuffer writes, pull-constant loads and register
    * spills/unspills also write MRFs; their counts depend on each
    * message's payload layout.
    */
158 case SHADER_OPCODE_TEX:
160 case SHADER_OPCODE_TXD:
161 case SHADER_OPCODE_TXF:
162 case SHADER_OPCODE_TXL:
163 case SHADER_OPCODE_TXS:
165 case FS_OPCODE_FB_WRITE:
167 case FS_OPCODE_PULL_CONSTANT_LOAD:
168 case FS_OPCODE_UNSPILL:
170 case FS_OPCODE_SPILL:
   /* Every MRF-writing opcode must be listed above. */
173 assert(!"not reached");
179 fs_visitor::virtual_grf_alloc(int size)
181 if (virtual_grf_array_size <= virtual_grf_next) {
182 if (virtual_grf_array_size == 0)
183 virtual_grf_array_size = 16;
185 virtual_grf_array_size *= 2;
186 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
187 virtual_grf_array_size);
189 virtual_grf_sizes[virtual_grf_next] = size;
190 return virtual_grf_next++;
193 /** Fixed HW reg constructor. */
194 fs_reg::fs_reg(enum register_file file, int reg)
   /* No type given by the caller, so default to float. */
199 this->type = BRW_REGISTER_TYPE_F;
202 /** Fixed HW reg constructor. */
/* Same as the two-argument form, but the caller supplies the register
 * type explicitly instead of getting the float default.
 */
203 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
211 /** Automatic reg constructor. */
/* Allocates a fresh virtual GRF sized to hold the given GLSL type, at
 * offset 0, with the register type derived from the type's base type.
 */
212 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
217 this->reg = v->virtual_grf_alloc(v->type_size(type));
218 this->reg_offset = 0;
219 this->type = brw_type_for_base_type(type);
223 fs_visitor::variable_storage(ir_variable *var)
225 return (fs_reg *)hash_table_find(this->variable_ht, var);
/* hash_table_call_foreach callback: copies a variable -> fs_reg mapping
 * into the destination hash table passed via `closure`, but only for
 * registers in the UNIFORM file -- everything else is skipped.
 */
229 import_uniforms_callback(const void *key,
233 struct hash_table *dst_ht = (struct hash_table *)closure;
234 const fs_reg *reg = (const fs_reg *)data;
236 if (reg->file != UNIFORM)
239 hash_table_insert(dst_ht, data, key);
242 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
243 * This brings in those uniform definitions
 * (the uniform entries of the variable hash table and the params_remap
 * array) from the 8-wide visitor so both compiles agree on uniform layout.
 */
246 fs_visitor::import_uniforms(fs_visitor *v)
248 hash_table_call_foreach(v->variable_ht,
249 import_uniforms_callback,
251 this->params_remap = v->params_remap;
254 /* Our support for uniforms is piggy-backed on the struct
255 * gl_fragment_program, because that's where the values actually
256 * get stored, rather than in some global gl_shader_program uniform
 *
 * Recursively walks a uniform's type, appending one entry per scalar
 * component to c->prog_data (param count, conversion flags) and recording
 * where each component's value lives (param_index/param_offset).  Returns
 * the number of slots consumed at `loc` -- presumably so callers can lay
 * out aggregate members contiguously; TODO confirm against callers.
 */
260 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
262 unsigned int offset = 0;
   /* Matrices are handled as matrix_columns consecutive column vectors. */
264 if (type->is_matrix()) {
265 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
266 type->vector_elements,
269 for (unsigned int i = 0; i < type->matrix_columns; i++) {
270 offset += setup_uniform_values(loc + offset, column);
276 switch (type->base_type) {
277 case GLSL_TYPE_FLOAT:
   /* Scalar/vector case: one param per component. */
281 for (unsigned int i = 0; i < type->vector_elements; i++) {
282 unsigned int param = c->prog_data.nr_params++;
284 assert(param < ARRAY_SIZE(c->prog_data.param));
   /* With native integer support no conversion is needed; otherwise
    * non-float uniforms are stored as floats and must be converted
    * back to their declared type at upload time.
    */
286 if (ctx->Const.NativeIntegers) {
287 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
289 switch (type->base_type) {
290 case GLSL_TYPE_FLOAT:
291 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
294 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
297 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
300 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
303 assert(!"not reached");
304 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
   /* Remember where this component's value can be found later. */
308 this->param_index[param] = loc;
309 this->param_offset[param] = i;
   /* Structs and arrays recurse over their members/elements. */
313 case GLSL_TYPE_STRUCT:
314 for (unsigned int i = 0; i < type->length; i++) {
315 offset += setup_uniform_values(loc + offset,
316 type->fields.structure[i].type);
320 case GLSL_TYPE_ARRAY:
321 for (unsigned int i = 0; i < type->length; i++) {
322 offset += setup_uniform_values(loc + offset, type->fields.array);
326 case GLSL_TYPE_SAMPLER:
327 /* The sampler takes up a slot, but we don't use any values from it. */
331 assert(!"not reached");
337 /* Our support for builtin uniforms is even scarier than non-builtin.
338 * It sits on top of the PROG_STATE_VAR parameters that are
339 * automatically updated from GL context state.
342 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
344 const ir_state_slot *const slots = ir->state_slots;
345 assert(ir->state_slots != NULL);
347 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
348 /* This state reference has already been setup by ir_to_mesa, but we'll
349 * get the same index back here.
351 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
352 (gl_state_index *)slots[i].tokens);
354 /* Add each of the unique swizzles of the element as a parameter.
355 * This'll end up matching the expected layout of the
356 * array/matrix/structure we're trying to fill in.
   /* last_swiz (presumably initialized before this loop; the line is not
    * visible here) de-duplicates repeated swizzle components.
    */
359 for (unsigned int j = 0; j < 4; j++) {
360 int swiz = GET_SWZ(slots[i].swizzle, j);
361 if (swiz == last_swiz)
   /* One param per unique component: record the state-parameter index
    * and the component within it, then grow the param count.
    */
365 c->prog_data.param_convert[c->prog_data.nr_params] =
367 this->param_index[c->prog_data.nr_params] = index;
368 this->param_offset[c->prog_data.nr_params] = swiz;
369 c->prog_data.nr_params++;
/* Emits the instructions that build gl_FragCoord into a freshly allocated
 * register, honoring the variable's layout qualifiers
 * (origin_upper_left / pixel_center_integer) and whether we are rendering
 * to an FBO (which flips the Y convention).
 */
375 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
377 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* flip is true when the requested origin disagrees with the render
    * target's row order.
    */
379 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
   /* gl_FragCoord.x: integer pixel centers copy pixel_x directly,
    * otherwise add the half-pixel offset.
    */
382 if (ir->pixel_center_integer) {
383 emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
385 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   /* gl_FragCoord.y: as above, but possibly flipped -- flipping negates
    * pixel_y and offsets by (drawable_height - 1) so y runs the other way.
    */
390 if (!flip && ir->pixel_center_integer) {
391 emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
393 fs_reg pixel_y = this->pixel_y;
394 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
397 pixel_y.negate = true;
398 offset += c->key.drawable_height - 1.0;
401 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   /* gl_FragCoord.z: on gen6+ depth arrives in the thread payload;
    * earlier parts interpolate it from the WPOS setup data.
    */
406 if (intel->gen >= 6) {
407 emit(BRW_OPCODE_MOV, wpos,
408 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
410 emit(FS_OPCODE_LINTERP, wpos,
411 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
412 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
413 interp_reg(FRAG_ATTRIB_WPOS, 2));
417 /* gl_FragCoord.w: Already set up in emit_interpolation */
418 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
/* Emits interpolation (or flat-shaded constant fetch) for a generic
 * varying input, walking arrays and matrix columns component by
 * component and writing each into a freshly allocated register.
 */
424 fs_visitor::emit_general_interpolation(ir_variable *ir)
426 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
427 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
430 unsigned int array_elements;
431 const glsl_type *type;
   /* For arrays, iterate per element over the element type; a
    * zero-length array is a compile failure.
    */
433 if (ir->type->is_array()) {
434 array_elements = ir->type->length;
435 if (array_elements == 0) {
436 fail("dereferenced array '%s' has length 0\n", ir->name);
438 type = ir->type->fields.array;
444 glsl_interp_qualifier interpolation_mode =
445 ir->determine_interpolation_mode(c->key.flat_shade);
447 int location = ir->location;
448 for (unsigned int i = 0; i < array_elements; i++) {
449 for (unsigned int j = 0; j < type->matrix_columns; j++) {
450 if (urb_setup[location] == -1) {
451 /* If there's no incoming setup data for this slot, don't
452 * emit interpolation for it.
   /* Still advance past the register space the slot would have used. */
454 attr.reg_offset += type->vector_elements;
459 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
460 /* Constant interpolation (flat shading) case. The SF has
461 * handed us defined values in only the constant offset
462 * field of the setup reg.
464 for (unsigned int k = 0; k < type->vector_elements; k++) {
465 struct brw_reg interp = interp_reg(location, k);
466 interp = suboffset(interp, 3);
467 interp.type = reg->type;
468 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
472 /* Smooth/noperspective interpolation case. */
473 for (unsigned int k = 0; k < type->vector_elements; k++) {
474 /* FINISHME: At some point we probably want to push
475 * this farther by giving similar treatment to the
476 * other potentially constant components of the
477 * attribute, as well as making brw_vs_constval.c
478 * handle varyings other than gl_TexCoord.
   /* Non-projective texcoords have a known-constant 1.0 in .w, so
    * emit a MOV of 1.0 instead of interpolating.
    */
480 if (location >= FRAG_ATTRIB_TEX0 &&
481 location <= FRAG_ATTRIB_TEX7 &&
482 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
483 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
485 struct brw_reg interp = interp_reg(location, k);
   /* Pick the barycentric coordinate set matching the qualifier. */
486 brw_wm_barycentric_interp_mode barycoord_mode;
487 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
488 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
490 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
491 emit(FS_OPCODE_LINTERP, attr,
492 this->delta_x[barycoord_mode],
493 this->delta_y[barycoord_mode], fs_reg(interp));
   /* Pre-gen6 setup data is pre-divided by w, so multiply the
    * interpolated value back by pixel_w.
    */
494 if (intel->gen < 6) {
495 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
/* Emits instructions that compute gl_FrontFacing (a 0/1 value) into a
 * freshly allocated register, from the facing information the hardware
 * delivers in the thread payload.
 */
510 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
512 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
514 /* The frontfacing comes in as a bit in the thread payload. */
515 if (intel->gen >= 6) {
   /* Gen6+: shift the payload bit down as a signed value, invert it,
    * and mask to a single 0/1 bit.
    */
516 emit(BRW_OPCODE_ASR, *reg,
517 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
519 emit(BRW_OPCODE_NOT, *reg, *reg);
520 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   /* Pre-gen6: the back-face flag lives in bit 31 of r1.6, so an
    * unsigned compare against (1 << 31) tests it without shifting.
    */
522 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
523 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
526 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
529 inst->conditional_mod = BRW_CONDITIONAL_L;
530 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
/* Emits a unary math instruction (RCP/RSQ/SQRT/EXP2/LOG2/SIN/COS),
 * working around per-generation restrictions, and returns presumably the
 * emitted instruction (the return statements are not visible here).
 */
537 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
540 case SHADER_OPCODE_RCP:
541 case SHADER_OPCODE_RSQ:
542 case SHADER_OPCODE_SQRT:
543 case SHADER_OPCODE_EXP2:
544 case SHADER_OPCODE_LOG2:
545 case SHADER_OPCODE_SIN:
546 case SHADER_OPCODE_COS:
549 assert(!"not reached: bad math opcode");
553 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
554 * might be able to do better by doing execsize = 1 math and then
555 * expanding that result out, but we would need to be careful with
558 * Gen 6 hardware ignores source modifiers (negate and abs) on math
559 * instructions, so we also move to a temp to set those up.
561 if (intel->gen == 6 && (src.file == UNIFORM ||
   /* Copy the operand into a plain temporary GRF first. */
564 fs_reg expanded = fs_reg(this, glsl_type::float_type);
565 emit(BRW_OPCODE_MOV, expanded, src);
569 fs_inst *inst = emit(opcode, dst, src);
   /* Pre-gen6 math is a send message, so it needs a message length. */
571 if (intel->gen < 6) {
573 inst->mlen = c->dispatch_width / 8;
/* Emits a binary math instruction (POW / INT_QUOTIENT / INT_REMAINDER),
 * with per-generation workarounds: gen7 emits directly, gen6 expands
 * problem operands through temporaries, and pre-gen6 sends the second
 * operand through an MRF.
 */
580 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
586 case SHADER_OPCODE_POW:
587 case SHADER_OPCODE_INT_QUOTIENT:
588 case SHADER_OPCODE_INT_REMAINDER:
591 assert(!"not reached: unsupported binary math opcode.");
595 if (intel->gen >= 7) {
596 inst = emit(opcode, dst, src0, src1);
597 } else if (intel->gen == 6) {
598 /* Can't do hstride == 0 args to gen6 math, so expand it out.
600 * The hardware ignores source modifiers (negate and abs) on math
601 * instructions, so we also move to a temp to set those up.
603 if (src0.file == UNIFORM || src0.abs || src0.negate) {
604 fs_reg expanded = fs_reg(this, glsl_type::float_type);
605 expanded.type = src0.type;
606 emit(BRW_OPCODE_MOV, expanded, src0);
610 if (src1.file == UNIFORM || src1.abs || src1.negate) {
611 fs_reg expanded = fs_reg(this, glsl_type::float_type);
612 expanded.type = src1.type;
613 emit(BRW_OPCODE_MOV, expanded, src1);
617 inst = emit(opcode, dst, src0, src1);
619 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
622 * "Operand0[7]. For the INT DIV functions, this operand is the
625 * "Operand1[7]. For the INT DIV functions, this operand is the
   /* INT DIV takes its operands in the opposite order from POW, so
    * swap them before building the message.
    */
628 bool is_int_div = opcode != SHADER_OPCODE_POW;
629 fs_reg &op0 = is_int_div ? src1 : src0;
630 fs_reg &op1 = is_int_div ? src0 : src1;
   /* Second operand goes through the MRF; two payload registers per
    * 8 channels of dispatch.
    */
632 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
633 inst = emit(opcode, dst, op0, reg_null_f);
635 inst->base_mrf = base_mrf;
636 inst->mlen = 2 * c->dispatch_width / 8;
642 * To be called after the last _mesa_add_state_reference() call, to
643 * set up prog_data.param[] for assign_curb_setup() and
644 * setup_pull_constants().
647 fs_visitor::setup_paramvalues_refs()
   /* Only the 8-wide compile builds this table; 16-wide shares it. */
649 if (c->dispatch_width != 8)
652 /* Set up the pointers to ParamValues now that that array is finalized. */
653 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
654 c->prog_data.param[i] =
655 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
656 this->param_offset[i];
/* Assigns push-constant (CURB) layout: computes how many registers of
 * constants the thread reads, records where they land in the payload,
 * and rewrites UNIFORM-file sources to the fixed hardware registers
 * holding those constants.
 */
661 fs_visitor::assign_curb_setup()
   /* Constants are read in whole 8-register units. */
663 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   /* Each dispatch width records its own first-constant-GRF position. */
664 if (c->dispatch_width == 8) {
665 c->prog_data.first_curbe_grf = c->nr_payload_regs;
667 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
670 /* Map the offsets in the UNIFORM file to fixed HW regs. */
671 foreach_list(node, &this->instructions) {
672 fs_inst *inst = (fs_inst *)node;
674 for (unsigned int i = 0; i < 3; i++) {
675 if (inst->src[i].file == UNIFORM) {
676 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
677 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
   /* Retype so the fixed register matches the source's type. */
681 inst->src[i].file = FIXED_HW_REG;
682 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
/* Fills in urb_setup[]: for each fragment attribute, which incoming URB
 * slot (if any) holds its setup data, and computes the resulting URB
 * read length.
 */
689 fs_visitor::calculate_urb_setup()
   /* Start with every attribute unassigned (the -1 initializer is on an
    * elided line in this loop body).
    */
691 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
696 /* Figure out where each of the incoming setup attributes lands. */
697 if (intel->gen >= 6) {
   /* Gen6+: slots simply follow the shader's InputsRead bits in order. */
698 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
699 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
700 urb_setup[i] = urb_next++;
704 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
   /* Pre-gen6: walk the VS outputs the SF passes through and map each
    * to its corresponding fragment attribute.
    */
705 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
706 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
707 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
710 urb_setup[fp_index] = urb_next++;
715 * It's a FS only attribute, and we did interpolation for this attribute
716 * in SF thread. So, count it here, too.
718 * See compile_sf_prog() for more info.
720 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
721 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
724 /* Each attribute is 4 setup channels, each of which is half a reg. */
725 c->prog_data.urb_read_length = urb_next * 2;
/* Offsets the interpolation setup register references by the actual
 * start of the URB data in the payload (after the fixed payload
 * registers and the push constants), and records the first GRF free for
 * register allocation.
 */
729 fs_visitor::assign_urb_setup()
731 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
733 /* Offset all the urb_setup[] index by the actual position of the
734 * setup regs, now that the location of the constants has been chosen.
736 foreach_list(node, &this->instructions) {
737 fs_inst *inst = (fs_inst *)node;
   /* LINTERP keeps its setup data in src[2], CINTERP in src[0]. */
739 if (inst->opcode == FS_OPCODE_LINTERP) {
740 assert(inst->src[2].file == FIXED_HW_REG);
741 inst->src[2].fixed_hw_reg.nr += urb_start;
744 if (inst->opcode == FS_OPCODE_CINTERP) {
745 assert(inst->src[0].file == FIXED_HW_REG);
746 inst->src[0].fixed_hw_reg.nr += urb_start;
750 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
754 * Split large virtual GRFs into separate components if we can.
756 * This is mostly duplicated with what brw_fs_vector_splitting does,
757 * but that's really conservative because it's afraid of doing
758 * splitting that doesn't result in real progress after the rest of
759 * the optimization phases, which would cause infinite looping in
760 * optimization. We can do it once here, safely. This also has the
761 * opportunity to split interpolated values, or maybe even uniforms,
762 * which we don't have at the IR level.
764 * We want to split, because virtual GRFs are what we register
765 * allocate and spill (due to contiguousness requirements for some
766 * instructions), and they're what we naturally generate in the
767 * codegen process, but most virtual GRFs don't actually need to be
768 * contiguous sets of GRFs. If we split, we'll end up with reduced
769 * live intervals and better dead code elimination and coalescing.
772 fs_visitor::split_virtual_grfs()
774 int num_vars = this->virtual_grf_next;
775 bool split_grf[num_vars];
776 int new_virtual_grf[num_vars];
778 /* Try to split anything > 0 sized. */
779 for (int i = 0; i < num_vars; i++) {
780 if (this->virtual_grf_sizes[i] != 1)
783 split_grf[i] = false;
787 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
788 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
789 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
790 * Gen6, that was the only supported interpolation mode, and since Gen6,
791 * delta_x and delta_y are in fixed hardware registers.
793 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
797 foreach_list(node, &this->instructions) {
798 fs_inst *inst = (fs_inst *)node;
800 /* Texturing produces 4 contiguous registers, so no splitting. */
801 if (inst->is_tex()) {
802 split_grf[inst->dst.reg] = false;
806 /* Allocate new space for split regs. Note that the virtual
807 * numbers will be contiguous.
809 for (int i = 0; i < num_vars; i++) {
   /* The original register keeps its number (shrunk to size 1); the
    * remaining components get fresh consecutive registers.
    */
811 new_virtual_grf[i] = virtual_grf_alloc(1);
812 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
813 int reg = virtual_grf_alloc(1);
814 assert(reg == new_virtual_grf[i] + j - 1);
817 this->virtual_grf_sizes[i] = 1;
   /* Rewrite every reference with a nonzero reg_offset to point at the
    * newly allocated single-register GRF for that component.
    */
821 foreach_list(node, &this->instructions) {
822 fs_inst *inst = (fs_inst *)node;
824 if (inst->dst.file == GRF &&
825 split_grf[inst->dst.reg] &&
826 inst->dst.reg_offset != 0) {
827 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
828 inst->dst.reg_offset - 1);
829 inst->dst.reg_offset = 0;
831 for (int i = 0; i < 3; i++) {
832 if (inst->src[i].file == GRF &&
833 split_grf[inst->src[i].reg] &&
834 inst->src[i].reg_offset != 0) {
835 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
836 inst->src[i].reg_offset - 1);
837 inst->src[i].reg_offset = 0;
   /* Register numbering changed, so live intervals must be recomputed. */
841 this->live_intervals_valid = false;
/* Removes uniform constants no instruction references, compacting
 * prog_data.param[] and renumbering UNIFORM sources.  The 8-wide compile
 * builds the remap table; the 16-wide compile reuses it (imported via
 * import_uniforms) so both dispatch widths agree on the layout.
 */
845 fs_visitor::remove_dead_constants()
847 if (c->dispatch_width == 8) {
   /* -1 marks a param as unused until we see a reference to it. */
848 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
850 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
851 this->params_remap[i] = -1;
853 /* Find which params are still in use. */
854 foreach_list(node, &this->instructions) {
855 fs_inst *inst = (fs_inst *)node;
857 for (int i = 0; i < 3; i++) {
858 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
860 if (inst->src[i].file != UNIFORM)
863 assert(constant_nr < (int)c->prog_data.nr_params);
865 /* For now, set this to non-negative. We'll give it the
866 * actual new number in a moment, in order to keep the
867 * register numbers nicely ordered.
869 this->params_remap[constant_nr] = 0;
873 /* Figure out what the new numbers for the params will be. At some
874 * point when we're doing uniform array access, we're going to want
875 * to keep the distinction between .reg and .reg_offset, but for
878 unsigned int new_nr_params = 0;
879 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
880 if (this->params_remap[i] != -1) {
881 this->params_remap[i] = new_nr_params++;
885 /* Update the list of params to be uploaded to match our new numbering. */
886 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
887 int remapped = this->params_remap[i];
892 /* We've already done setup_paramvalues_refs() so no need to worry
893 * about param_index and param_offset.
895 c->prog_data.param[remapped] = c->prog_data.param[i];
896 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
899 c->prog_data.nr_params = new_nr_params;
901 /* This should have been generated in the 8-wide pass already. */
902 assert(this->params_remap);
905 /* Now do the renumbering of the shader to remove unused params. */
906 foreach_list(node, &this->instructions) {
907 fs_inst *inst = (fs_inst *)node;
909 for (int i = 0; i < 3; i++) {
910 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
912 if (inst->src[i].file != UNIFORM)
   /* Every referenced uniform must have received a new number. */
915 assert(this->params_remap[constant_nr] != -1);
916 inst->src[i].reg = this->params_remap[constant_nr];
917 inst->src[i].reg_offset = 0;
925 * Choose accesses from the UNIFORM file to demote to using the pull
928 * We allow a fragment shader to have more than the specified minimum
929 * maximum number of fragment shader uniform components (64). If
930 * there are too many of these, they'd fill up all of register space.
931 * So, this will push some of them out to the pull constant buffer and
932 * update the program to load them.
935 fs_visitor::setup_pull_constants()
937 /* Only allow 16 registers (128 uniform components) as push constants. */
938 unsigned int max_uniform_components = 16 * 8;
939 if (c->prog_data.nr_params <= max_uniform_components)
942 if (c->dispatch_width == 16) {
943 fail("Pull constants not supported in 16-wide\n");
947 /* Just demote the end of the list. We could probably do better
948 * here, demoting things that are rarely used in the program first.
950 int pull_uniform_base = max_uniform_components;
951 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
   /* Replace each use of a demoted uniform with a load into a temporary
    * GRF, inserted just before the using instruction.
    */
953 foreach_list(node, &this->instructions) {
954 fs_inst *inst = (fs_inst *)node;
956 for (int i = 0; i < 3; i++) {
957 if (inst->src[i].file != UNIFORM)
960 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
961 if (uniform_nr < pull_uniform_base)
964 fs_reg dst = fs_reg(this, glsl_type::float_type);
965 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
   /* The load fetches an aligned group of four floats; smear (below)
    * picks the wanted component out of it.
    */
967 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
969 pull->annotation = inst->annotation;
973 inst->insert_before(pull);
975 inst->src[i].file = GRF;
976 inst->src[i].reg = dst.reg;
977 inst->src[i].reg_offset = 0;
978 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
   /* Move the demoted params from the push list to the pull list. */
982 for (int i = 0; i < pull_uniform_count; i++) {
983 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
984 c->prog_data.pull_param_convert[i] =
985 c->prog_data.param_convert[pull_uniform_base + i];
987 c->prog_data.nr_params -= pull_uniform_count;
988 c->prog_data.nr_pull_params = pull_uniform_count;
/* Computes a conservative def/use interval (first definition, last use,
 * by instruction index) for every virtual GRF, extending intervals
 * across loops for values that are live into a loop.  Results land in
 * virtual_grf_def[] / virtual_grf_use[]; cached until an optimization
 * pass invalidates live_intervals_valid.
 */
992 fs_visitor::calculate_live_intervals()
994 int num_vars = this->virtual_grf_next;
995 int *def = ralloc_array(mem_ctx, int, num_vars);
996 int *use = ralloc_array(mem_ctx, int, num_vars);
   /* Reuse the cached results when nothing has changed. */
1000 if (this->live_intervals_valid)
1003 for (int i = 0; i < num_vars; i++) {
1004 def[i] = MAX_INSTRUCTION;
1009 foreach_list(node, &this->instructions) {
1010 fs_inst *inst = (fs_inst *)node;
   /* Track loop nesting; loop_start remembers where the outermost
    * loop began.
    */
1012 if (inst->opcode == BRW_OPCODE_DO) {
1013 if (loop_depth++ == 0)
1015 } else if (inst->opcode == BRW_OPCODE_WHILE) {
1018 if (loop_depth == 0) {
1019 /* Patches up the use of vars marked for being live across
1022 for (int i = 0; i < num_vars; i++) {
1023 if (use[i] == loop_start) {
   /* Sources: values read inside a loop but defined before it must
    * stay live for the whole loop.
    */
1029 for (unsigned int i = 0; i < 3; i++) {
1030 if (inst->src[i].file == GRF) {
1031 int reg = inst->src[i].reg;
1036 def[reg] = MIN2(loop_start, def[reg]);
1037 use[reg] = loop_start;
1039 /* Nobody else is going to go smash our start to
1040 * later in the loop now, because def[reg] now
1041 * points before the bb header.
   /* Destinations: a write inside a loop may execute on any
    * iteration, so treat the def as happening at the loop start.
    */
1046 if (inst->dst.file == GRF) {
1047 int reg = inst->dst.reg;
1050 def[reg] = MIN2(def[reg], ip);
1052 def[reg] = MIN2(def[reg], loop_start);
   /* Publish the freshly computed arrays, replacing any old ones. */
1060 ralloc_free(this->virtual_grf_def);
1061 ralloc_free(this->virtual_grf_use);
1062 this->virtual_grf_def = def;
1063 this->virtual_grf_use = use;
1065 this->live_intervals_valid = true;
1069 * Attempts to move immediate constants into the immediate
1070 * constant slot of following instructions.
1072 * Immediate constants are a bit tricky -- they have to be in the last
1073 * operand slot, you can't do abs/negate on them,
 *
 * Returns whether any instruction was changed, so the caller can loop
 * the optimization passes to a fixed point.
1077 fs_visitor::propagate_constants()
1079 bool progress = false;
1081 calculate_live_intervals();
1083 foreach_list(node, &this->instructions) {
1084 fs_inst *inst = (fs_inst *)node;
   /* Only consider a plain MOV of an immediate into a GRF, with
    * matching types; in 16-wide, half-only MOVs don't dominate both
    * instruction halves, so skip them.
    */
1086 if (inst->opcode != BRW_OPCODE_MOV ||
1088 inst->dst.file != GRF || inst->src[0].file != IMM ||
1089 inst->dst.type != inst->src[0].type ||
1090 (c->dispatch_width == 16 &&
1091 (inst->force_uncompressed || inst->force_sechalf)))
1094 /* Don't bother with cases where we should have had the
1095 * operation on the constant folded in GLSL already.
1100 /* Found a move of a constant to a GRF. Find anything else using the GRF
1101 * before it's written, and replace it with the constant if we can.
1103 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1104 !scan_inst->is_tail_sentinel();
1105 scan_inst = (fs_inst *)scan_inst->next) {
   /* Stop at control flow: past it we no longer dominate the uses. */
1106 if (scan_inst->opcode == BRW_OPCODE_DO ||
1107 scan_inst->opcode == BRW_OPCODE_WHILE ||
1108 scan_inst->opcode == BRW_OPCODE_ELSE ||
1109 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1113 for (int i = 2; i >= 0; i--) {
1114 if (scan_inst->src[i].file != GRF ||
1115 scan_inst->src[i].reg != inst->dst.reg ||
1116 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
1119 /* Don't bother with cases where we should have had the
1120 * operation on the constant folded in GLSL already.
1122 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
1125 switch (scan_inst->opcode) {
1126 case BRW_OPCODE_MOV:
   /* MOV accepts an immediate source directly. */
1127 scan_inst->src[i] = inst->src[0];
1131 case BRW_OPCODE_MUL:
1132 case BRW_OPCODE_ADD:
1134 scan_inst->src[i] = inst->src[0];
1136 } else if (i == 0 && scan_inst->src[1].file != IMM) {
1137 /* Fit this constant in by commuting the operands.
1138 * Exception: we can't do this for 32-bit integer MUL
1139 * because it's asymmetric.
1141 if (scan_inst->opcode == BRW_OPCODE_MUL &&
1142 (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
1143 scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
1145 scan_inst->src[0] = scan_inst->src[1];
1146 scan_inst->src[1] = inst->src[0];
1151 case BRW_OPCODE_CMP:
1154 scan_inst->src[i] = inst->src[0];
1156 } else if (i == 0 && scan_inst->src[1].file != IMM) {
   /* Swapping CMP operands requires mirroring the condition. */
1159 new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
1160 if (new_cmod != ~0u) {
1161 /* Fit this constant in by swapping the operands and
1164 scan_inst->src[0] = scan_inst->src[1];
1165 scan_inst->src[1] = inst->src[0];
1166 scan_inst->conditional_mod = new_cmod;
1172 case BRW_OPCODE_SEL:
1174 scan_inst->src[i] = inst->src[0];
1176 } else if (i == 0 && scan_inst->src[1].file != IMM) {
1177 scan_inst->src[0] = scan_inst->src[1];
1178 scan_inst->src[1] = inst->src[0];
1180 /* If this was predicated, flipping operands means
1181 * we also need to flip the predicate.
1183 if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
1184 scan_inst->predicate_inverse =
1185 !scan_inst->predicate_inverse;
1191 case SHADER_OPCODE_RCP:
1192 /* The hardware doesn't do math on immediate values
1193 * (because why are you doing that, seriously?), but
1194 * the correct answer is to just constant fold it
   /* Guard against folding 1/0 at compile time. */
1198 if (inst->src[0].imm.f != 0.0f) {
1199 scan_inst->opcode = BRW_OPCODE_MOV;
1200 scan_inst->src[0] = inst->src[0];
1201 scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
   /* Once the GRF is overwritten (tex writes cover all offsets),
    * later readers see a different value -- stop scanning.
    */
1211 if (scan_inst->dst.file == GRF &&
1212 scan_inst->dst.reg == inst->dst.reg &&
1213 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
1214 scan_inst->is_tex())) {
   /* Register usage changed, so intervals must be recomputed. */
1221 this->live_intervals_valid = false;
1228 * Applies simple algebraic simplifications to individual instructions,
1229 * e.g. turning a multiply by an immediate 1.0 into a plain MOV.
1231 * Returns whether any instruction was changed, so the caller can loop
1232 * the optimization passes to a fixed point.
1236 fs_visitor::opt_algebraic()
1238 bool progress = false;
1240 calculate_live_intervals();
1242 foreach_list(node, &this->instructions) {
1243 fs_inst *inst = (fs_inst *)node;
1245 switch (inst->opcode) {
1246 case BRW_OPCODE_MUL:
   /* Only immediates in the second operand are recognized here. */
1247 if (inst->src[1].file != IMM)
   /* a * 1.0 = a: demote the MUL to a MOV and drop the operand. */
1251 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1252 inst->src[1].imm.f == 1.0) {
1253 inst->opcode = BRW_OPCODE_MOV;
1254 inst->src[1] = reg_undef;
1269 * Must be called after calculate_live_intervales() to remove unused
1270 * writes to registers -- register allocation will fail otherwise
1271 * because something deffed but not used won't be considered to
1272 * interfere with other regs.
1275 fs_visitor::dead_code_eliminate()
1277 bool progress = false;
1280 calculate_live_intervals();
1282 foreach_list_safe(node, &this->instructions) {
1283 fs_inst *inst = (fs_inst *)node;
   /* A GRF write whose register's last use is at or before this point
    * can never be read again, so the instruction is dead.
    */
1285 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
   /* Removing instructions shifts indices, invalidating intervals. */
1294 live_intervals_valid = false;
/* Eliminates GRF-to-GRF (or UNIFORM-to-GRF) copies by rewriting later
 * readers of the destination to read the source directly, when nothing
 * interferes.  Returns whether any copy was removed.
 */
1300 fs_visitor::register_coalesce()
1302 bool progress = false;
1306 foreach_list_safe(node, &this->instructions) {
1307 fs_inst *inst = (fs_inst *)node;
1309 /* Make sure that we dominate the instructions we're going to
1310 * scan for interfering with our coalescing, or we won't have
1311 * scanned enough to see if anything interferes with our
1312 * coalescing. We don't dominate the following instructions if
1313 * we're in a loop or an if block.
1315 switch (inst->opcode) {
1319 case BRW_OPCODE_WHILE:
1325 case BRW_OPCODE_ENDIF:
   /* Inside a loop or conditional, skip -- see comment above. */
1331 if (loop_depth || if_depth)
   /* Only a plain same-typed MOV from a GRF or uniform into a GRF is a
    * coalescing candidate.
    */
1334 if (inst->opcode != BRW_OPCODE_MOV ||
1337 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1338 inst->src[0].file != UNIFORM)||
1339 inst->dst.type != inst->src[0].type)
1342 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1344 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1345 * them: check for no writes to either one until the exit of the
1348 bool interfered = false;
1350 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1351 !scan_inst->is_tail_sentinel();
1352 scan_inst = (fs_inst *)scan_inst->next) {
   /* A write to either register (tex writes cover all offsets)
    * breaks the copy relationship.
    */
1353 if (scan_inst->dst.file == GRF) {
1354 if (scan_inst->dst.reg == inst->dst.reg &&
1355 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
1356 scan_inst->is_tex())) {
1360 if (inst->src[0].file == GRF &&
1361 scan_inst->dst.reg == inst->src[0].reg &&
1362 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
1363 scan_inst->is_tex())) {
1369 /* The gen6 MATH instruction can't handle source modifiers or
1370 * unusual register regions, so avoid coalescing those for
1371 * now. We should do something more specific.
1373 if (intel->gen >= 6 &&
1374 scan_inst->is_math() &&
1375 (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1380 /* The accumulator result appears to get used for the
1381 * conditional modifier generation. When negating a UD
1382 * value, there is a 33rd bit generated for the sign in the
1383 * accumulator value, so now you can't check, for example,
1384 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1386 if (scan_inst->conditional_mod &&
1387 inst->src[0].negate &&
1388 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1397 /* Rewrite the later usage to point at the source of the move to
1400 for (fs_inst *scan_inst = inst;
1401 !scan_inst->is_tail_sentinel();
1402 scan_inst = (fs_inst *)scan_inst->next) {
1403 for (int i = 0; i < 3; i++) {
1404 if (scan_inst->src[i].file == GRF &&
1405 scan_inst->src[i].reg == inst->dst.reg &&
1406 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
   /* Combine the reader's source modifiers with the copy's. */
1407 fs_reg new_src = inst->src[0];
1408 if (scan_inst->src[i].abs) {
1412 new_src.negate ^= scan_inst->src[i].negate;
1413 scan_inst->src[i] = new_src;
   /* Register usage changed, so intervals must be recomputed. */
1423 live_intervals_valid = false;
1430 fs_visitor::compute_to_mrf()
1432 bool progress = false;
1435 calculate_live_intervals();
1437 foreach_list_safe(node, &this->instructions) {
1438 fs_inst *inst = (fs_inst *)node;
1443 if (inst->opcode != BRW_OPCODE_MOV ||
1445 inst->dst.file != MRF || inst->src[0].file != GRF ||
1446 inst->dst.type != inst->src[0].type ||
1447 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1450 /* Work out which hardware MRF registers are written by this
1453 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1455 if (inst->dst.reg & BRW_MRF_COMPR4) {
1456 mrf_high = mrf_low + 4;
1457 } else if (c->dispatch_width == 16 &&
1458 (!inst->force_uncompressed && !inst->force_sechalf)) {
1459 mrf_high = mrf_low + 1;
1464 /* Can't compute-to-MRF this GRF if someone else was going to
1467 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1470 /* Found a move of a GRF to a MRF. Let's see if we can go
1471 * rewrite the thing that made this GRF to write into the MRF.
1474 for (scan_inst = (fs_inst *)inst->prev;
1475 scan_inst->prev != NULL;
1476 scan_inst = (fs_inst *)scan_inst->prev) {
1477 if (scan_inst->dst.file == GRF &&
1478 scan_inst->dst.reg == inst->src[0].reg) {
1479 /* Found the last thing to write our reg we want to turn
1480 * into a compute-to-MRF.
1483 if (scan_inst->is_tex()) {
1484 /* texturing writes several continuous regs, so we can't
1485 * compute-to-mrf that.
1490 /* If it's predicated, it (probably) didn't populate all
1491 * the channels. We might be able to rewrite everything
1492 * that writes that reg, but it would require smarter
1493 * tracking to delay the rewriting until complete success.
1495 if (scan_inst->predicated)
1498 /* If it's half of register setup and not the same half as
1499 * our MOV we're trying to remove, bail for now.
1501 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1502 scan_inst->force_sechalf != inst->force_sechalf) {
1506 /* SEND instructions can't have MRF as a destination. */
1507 if (scan_inst->mlen)
1510 if (intel->gen >= 6) {
1511 /* gen6 math instructions must have the destination be
1512 * GRF, so no compute-to-MRF for them.
1514 if (scan_inst->is_math()) {
1519 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1520 /* Found the creator of our MRF's source value. */
1521 scan_inst->dst.file = MRF;
1522 scan_inst->dst.reg = inst->dst.reg;
1523 scan_inst->saturate |= inst->saturate;
1530 /* We don't handle flow control here. Most computation of
1531 * values that end up in MRFs are shortly before the MRF
1534 if (scan_inst->opcode == BRW_OPCODE_DO ||
1535 scan_inst->opcode == BRW_OPCODE_WHILE ||
1536 scan_inst->opcode == BRW_OPCODE_ELSE ||
1537 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1541 /* You can't read from an MRF, so if someone else reads our
1542 * MRF's source GRF that we wanted to rewrite, that stops us.
1544 bool interfered = false;
1545 for (int i = 0; i < 3; i++) {
1546 if (scan_inst->src[i].file == GRF &&
1547 scan_inst->src[i].reg == inst->src[0].reg &&
1548 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1555 if (scan_inst->dst.file == MRF) {
1556 /* If somebody else writes our MRF here, we can't
1557 * compute-to-MRF before that.
1559 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1562 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1563 scan_mrf_high = scan_mrf_low + 4;
1564 } else if (c->dispatch_width == 16 &&
1565 (!scan_inst->force_uncompressed &&
1566 !scan_inst->force_sechalf)) {
1567 scan_mrf_high = scan_mrf_low + 1;
1569 scan_mrf_high = scan_mrf_low;
1572 if (mrf_low == scan_mrf_low ||
1573 mrf_low == scan_mrf_high ||
1574 mrf_high == scan_mrf_low ||
1575 mrf_high == scan_mrf_high) {
1580 if (scan_inst->mlen > 0) {
1581 /* Found a SEND instruction, which means that there are
1582 * live values in MRFs from base_mrf to base_mrf +
1583 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1586 if (mrf_low >= scan_inst->base_mrf &&
1587 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1590 if (mrf_high >= scan_inst->base_mrf &&
1591 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1602 * Walks through basic blocks, looking for repeated MRF writes and
1603 * removing the later ones.
1606 fs_visitor::remove_duplicate_mrf_writes()
1608 fs_inst *last_mrf_move[16];
1609 bool progress = false;
1611 /* Need to update the MRF tracking for compressed instructions. */
1612 if (c->dispatch_width == 16)
1615 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1617 foreach_list_safe(node, &this->instructions) {
1618 fs_inst *inst = (fs_inst *)node;
1620 switch (inst->opcode) {
1622 case BRW_OPCODE_WHILE:
1624 case BRW_OPCODE_ELSE:
1625 case BRW_OPCODE_ENDIF:
1626 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1632 if (inst->opcode == BRW_OPCODE_MOV &&
1633 inst->dst.file == MRF) {
1634 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1635 if (prev_inst && inst->equals(prev_inst)) {
1642 /* Clear out the last-write records for MRFs that were overwritten. */
1643 if (inst->dst.file == MRF) {
1644 last_mrf_move[inst->dst.reg] = NULL;
1647 if (inst->mlen > 0) {
1648 /* Found a SEND instruction, which will include two or fewer
1649 * implied MRF writes. We could do better here.
1651 for (int i = 0; i < implied_mrf_writes(inst); i++) {
1652 last_mrf_move[inst->base_mrf + i] = NULL;
1656 /* Clear out any MRF move records whose sources got overwritten. */
1657 if (inst->dst.file == GRF) {
1658 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1659 if (last_mrf_move[i] &&
1660 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1661 last_mrf_move[i] = NULL;
1666 if (inst->opcode == BRW_OPCODE_MOV &&
1667 inst->dst.file == MRF &&
1668 inst->src[0].file == GRF &&
1669 !inst->predicated) {
1670 last_mrf_move[inst->dst.reg] = inst;
1678 fs_visitor::virtual_grf_interferes(int a, int b)
1680 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
1681 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
1683 /* We can't handle dead register writes here, without iterating
1684 * over the whole instruction stream to find every single dead
1685 * write to that register to compare to the live interval of the
1686 * other register. Just assert that dead_code_eliminate() has been
1689 assert((this->virtual_grf_use[a] != -1 ||
1690 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
1691 (this->virtual_grf_use[b] != -1 ||
1692 this->virtual_grf_def[b] == MAX_INSTRUCTION));
1694 /* If the register is used to store 16 values of less than float
1695 * size (only the case for pixel_[xy]), then we can't allocate
1696 * another dword-sized thing to that register that would be used in
1697 * the same instruction. This is because when the GPU decodes (for
1700 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
1701 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
1703 * it's actually processed as:
1704 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
1705 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
1707 * so our second half values in g6 got overwritten in the first
1710 if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
1711 this->pixel_x.reg == b ||
1712 this->pixel_y.reg == a ||
1713 this->pixel_y.reg == b)) {
1714 return start <= end;
1723 uint32_t prog_offset_16 = 0;
1724 uint32_t orig_nr_params = c->prog_data.nr_params;
1726 brw_wm_payload_setup(brw, c);
1728 if (c->dispatch_width == 16) {
1729 /* align to 64 byte boundary. */
1730 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
1734 /* Save off the start of this 16-wide program in case we succeed. */
1735 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
1737 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1743 calculate_urb_setup();
1745 emit_interpolation_setup_gen4();
1747 emit_interpolation_setup_gen6();
1749 /* Generate FS IR for main(). (the visitor only descends into
1750 * functions called "main").
1752 foreach_list(node, &*shader->ir) {
1753 ir_instruction *ir = (ir_instruction *)node;
1755 this->result = reg_undef;
1763 split_virtual_grfs();
1765 setup_paramvalues_refs();
1766 setup_pull_constants();
1772 progress = remove_duplicate_mrf_writes() || progress;
1774 progress = propagate_constants() || progress;
1775 progress = opt_algebraic() || progress;
1776 progress = register_coalesce() || progress;
1777 progress = compute_to_mrf() || progress;
1778 progress = dead_code_eliminate() || progress;
1781 remove_dead_constants();
1783 schedule_instructions();
1785 assign_curb_setup();
1789 /* Debug of register spilling: Go spill everything. */
1790 int virtual_grf_count = virtual_grf_next;
1791 for (int i = 0; i < virtual_grf_count; i++) {
1797 assign_regs_trivial();
1799 while (!assign_regs()) {
1805 assert(force_uncompressed_stack == 0);
1806 assert(force_sechalf_stack == 0);
1813 if (c->dispatch_width == 8) {
1814 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
1816 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
1817 c->prog_data.prog_offset_16 = prog_offset_16;
1819 /* Make sure we didn't try to sneak in an extra uniform */
1820 assert(orig_nr_params == c->prog_data.nr_params);
1821 (void) orig_nr_params;
1828 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
1829 struct gl_shader_program *prog)
1831 struct intel_context *intel = &brw->intel;
1836 struct brw_shader *shader =
1837 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
1841 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1842 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
1843 _mesa_print_ir(shader->ir, NULL);
1847 /* Now the main event: Visit the shader IR and generate our FS IR for it.
1849 c->dispatch_width = 8;
1851 fs_visitor v(c, prog, shader);
1853 prog->LinkStatus = false;
1854 ralloc_strcat(&prog->InfoLog, v.fail_msg);
1856 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
1862 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
1863 c->dispatch_width = 16;
1864 fs_visitor v2(c, prog, shader);
1865 v2.import_uniforms(&v);
1869 c->prog_data.dispatch_width = 8;
1875 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
1877 struct brw_context *brw = brw_context(ctx);
1878 struct brw_wm_prog_key key;
1880 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
1883 struct gl_fragment_program *fp = (struct gl_fragment_program *)
1884 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
1885 struct brw_fragment_program *bfp = brw_fragment_program(fp);
1887 memset(&key, 0, sizeof(key));
1890 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
1892 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
1893 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
1895 /* Just assume depth testing. */
1896 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
1897 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
1899 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
1900 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1901 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
1904 key.proj_attrib_mask |= 1 << i;
1906 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1909 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
1912 key.clamp_fragment_color = true;
1914 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
1915 if (fp->Base.ShadowSamplers & (1 << i))
1916 key.tex.compare_funcs[i] = GL_LESS;
1918 /* FINISHME: depth compares might use (0,0,0,W) for example */
1919 key.tex.swizzles[i] = SWIZZLE_XYZW;
1922 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
1923 key.drawable_height = ctx->DrawBuffer->Height;
1924 key.render_to_fbo = ctx->DrawBuffer->Name != 0;
1927 key.nr_color_regions = 1;
1929 key.program_string_id = bfp->id;
1931 uint32_t old_prog_offset = brw->wm.prog_offset;
1932 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
1934 bool success = do_wm_prog(brw, prog, bfp, &key);
1936 brw->wm.prog_offset = old_prog_offset;
1937 brw->wm.prog_data = old_prog_data;