2 * Copyright © 2010 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
33 #include <sys/types.h>
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
/* NOTE(review): This chunk is a line-sampled extraction of the original
 * file -- the leading integer on each line is the original source line
 * number, and intermediate lines (braces, init() calls, member lists)
 * were dropped.  The code is kept byte-identical; only comments added.
 */
/* Fragment of fs_inst's common init: zero the whole instruction, then set
 * defaults (NOP opcode, no conditional mod, undefined dst/srcs).
 */
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
/* Constructor fragments: each overload stores the opcode (and operands --
 * presumably via a shared init() on the elided lines; TODO confirm) and
 * asserts that any GRF operand has a non-negative reg_offset.
 */
73 fs_inst::fs_inst(enum opcode opcode)
76 this->opcode = opcode;
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 this->opcode = opcode;
86 assert(dst.reg_offset >= 0);
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 this->opcode = opcode;
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 this->opcode = opcode;
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
122 this->opcode = opcode;
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
/* Fragments of the ALU1/ALU2/ALU3 emitter-helper macros: each expands to a
 * fs_visitor method that ralloc-allocates a new fs_inst carrying the
 * corresponding BRW opcode.  Trailing backslashes are macro continuations;
 * the surrounding #define lines were lost in extraction.
 */
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
/* Emitter-helper fragments for IF (gen4 predicated / gen6+ with embedded
 * compare) and CMP.  CMP works around the gen4 dst-type conversion quirk
 * described in the original comment, and resolves UD negation on both
 * sources before building the instruction.
 */
176 /** Gen4 predicated IF. */
178 fs_visitor::IF(uint32_t predicate)
180 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
181 inst->predicate = predicate;
185 /** Gen6+ IF with embedded comparison. */
187 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
189 assert(intel->gen >= 6);
190 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
191 reg_null_d, src0, src1);
192 inst->conditional_mod = condition;
197 * CMP: Sets the low bit of the destination channels with the result
198 * of the comparison, while the upper bits are undefined, and updates
199 * the flag register with the packed 16 bits of the result.
202 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
206 /* Take the instruction:
208 * CMP null<d> src0<f> src1<f>
210 * Original gen4 does type conversion to the destination type before
211 * comparison, producing garbage results for floating point comparisons.
212 * gen5 does the comparison on the execution type (resolved source types),
213 * so dst type doesn't matter. gen6 does comparison and then uses the
214 * result as if it was the dst type with no conversion, which happens to
215 * mostly work out for float-interpreted-as-int since our comparisons are
218 if (intel->gen == 4) {
219 dst.type = src0.type;
220 if (dst.file == FIXED_HW_REG)
221 dst.fixed_hw_reg.type = dst.type;
224 resolve_ud_negate(&src0);
225 resolve_ud_negate(&src1);
227 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
228 inst->conditional_mod = condition;
/* Builds the instruction sequence for a pull-constant load at a variable
 * (per-channel) offset: ADD to form the vec4-aligned offset, the load
 * opcode itself (GEN7 variant on gen7+, judging from the visible branch
 * arms), then a MOV to extract the requested component.  Returns the
 * emitted list via `instructions` (return statement lost in extraction).
 */
234 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
235 fs_reg varying_offset,
236 uint32_t const_offset)
238 exec_list instructions;
241 /* We have our constant surface use a pitch of 4 bytes, so our index can
242 * be any component of a vector, and then we load 4 contiguous
243 * components starting from that.
245 * We break down the const_offset to a portion added to the variable
246 * offset and a portion done using reg_offset, which means that if you
247 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
248 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
249 * CSE can later notice that those loads are all the same and eliminate
250 * the redundant ones.
252 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
253 instructions.push_tail(ADD(vec4_offset,
254 varying_offset, const_offset & ~3));
/* NOTE(review): `scale` is set on elided lines (presumably 2 for the
 * gen4 SIMD16-message path below, else 1) -- TODO confirm upstream. */
257 if (intel->gen == 4 && dispatch_width == 8) {
258 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
259 * u, v, r) as parameters, or we can just use the SIMD16 message
260 * consisting of (header, u). We choose the second, at the cost of a
261 * longer return length.
268 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
270 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
271 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
272 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
273 inst->regs_written = 4 * scale;
274 instructions.push_tail(inst);
/* Pre-gen7 messages need a header and therefore a larger payload. */
276 if (intel->gen < 7) {
278 inst->header_present = true;
282 inst->mlen = 1 + dispatch_width / 8;
/* Select the sub-vec4 component requested by const_offset's low bits. */
285 vec4_result.reg_offset += (const_offset & 3) * scale;
286 instructions.push_tail(MOV(dst, vec4_result));
/* Emits a dummy MOV (GRF -> null) used purely to create a scheduling
 * dependency that works around broken hardware SEND dependency handling.
 */
292 * A helper for MOV generation for fixing up broken hardware SEND dependency
296 fs_visitor::DEP_RESOLVE_MOV(int grf)
298 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
301 inst->annotation = "send dependency resolve";
303 /* The caller always wants uncompressed to emit the minimal extra
304 * dependencies, and to avoid having to deal with aligning its regs to 2.
306 inst->force_uncompressed = true;
/* Field-by-field equality of two instructions (used e.g. by CSE).
 * Compares opcode, dst, all three sources, and the message/flag state.
 */
312 fs_inst::equals(fs_inst *inst)
314 return (opcode == inst->opcode &&
315 dst.equals(inst->dst) &&
316 src[0].equals(inst->src[0]) &&
317 src[1].equals(inst->src[1]) &&
318 src[2].equals(inst->src[2]) &&
319 saturate == inst->saturate &&
320 predicate == inst->predicate &&
321 conditional_mod == inst->conditional_mod &&
322 mlen == inst->mlen &&
323 base_mrf == inst->base_mrf &&
324 sampler == inst->sampler &&
325 target == inst->target &&
327 header_present == inst->header_present &&
328 shadow_compare == inst->shadow_compare &&
329 offset == inst->offset);
/* True when `reg` falls inside the register range this instruction's dst
 * writes (dst.reg_offset .. dst.reg_offset + regs_written).
 * NOTE(review): the `®)` below is an extraction artifact -- it is almost
 * certainly a mangled `&reg)` parameter; confirm against upstream.
 */
333 fs_inst::overwrites_reg(const fs_reg ®)
335 return (reg.file == dst.file &&
336 reg.reg == dst.reg &&
337 reg.reg_offset >= dst.reg_offset &&
338 reg.reg_offset < dst.reg_offset + regs_written);
/* Predicate fragments classifying instructions by opcode:
 * - texturing messages (TEX/TXB/TXD/TXF/TXF_MS/TXL/TXS/LOD),
 * - extended-math messages (RCP..POW),
 * - control flow, and SENDs whose payload comes from the GRF.
 */
344 return (opcode == SHADER_OPCODE_TEX ||
345 opcode == FS_OPCODE_TXB ||
346 opcode == SHADER_OPCODE_TXD ||
347 opcode == SHADER_OPCODE_TXF ||
348 opcode == SHADER_OPCODE_TXF_MS ||
349 opcode == SHADER_OPCODE_TXL ||
350 opcode == SHADER_OPCODE_TXS ||
351 opcode == SHADER_OPCODE_LOD);
352 
357 return (opcode == SHADER_OPCODE_RCP ||
358 opcode == SHADER_OPCODE_RSQ ||
359 opcode == SHADER_OPCODE_SQRT ||
360 opcode == SHADER_OPCODE_EXP2 ||
361 opcode == SHADER_OPCODE_LOG2 ||
362 opcode == SHADER_OPCODE_SIN ||
363 opcode == SHADER_OPCODE_COS ||
364 opcode == SHADER_OPCODE_INT_QUOTIENT ||
365 opcode == SHADER_OPCODE_INT_REMAINDER ||
366 opcode == SHADER_OPCODE_POW);
370 fs_inst::is_control_flow()
374 case BRW_OPCODE_WHILE:
376 case BRW_OPCODE_ELSE:
377 case BRW_OPCODE_ENDIF:
378 case BRW_OPCODE_BREAK:
379 case BRW_OPCODE_CONTINUE:
387 fs_inst::is_send_from_grf()
389 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
390 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
391 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
392 src[1].file == GRF));
/* Source modifiers are disallowed on gen6 math and on any send-from-GRF
 * message (payload is read raw from the GRF).
 */
396 fs_visitor::can_do_source_mods(fs_inst *inst)
398 if (intel->gen == 6 && inst->is_math())
401 if (inst->is_send_from_grf())
/* fs_reg fragments: common zero-init, the unset/immediate/fixed-HW-reg
 * constructors (each sets `file`/`type`; the imm-value stores are on
 * elided lines), structural equality, and immediate-value predicates.
 */
410 memset(this, 0, sizeof(*this));
414 /** Generic unset register constructor. */
418 this->file = BAD_FILE;
421 /** Immediate value constructor. */
422 fs_reg::fs_reg(float f)
426 this->type = BRW_REGISTER_TYPE_F;
430 /** Immediate value constructor. */
431 fs_reg::fs_reg(int32_t i)
435 this->type = BRW_REGISTER_TYPE_D;
439 /** Immediate value constructor. */
440 fs_reg::fs_reg(uint32_t u)
444 this->type = BRW_REGISTER_TYPE_UD;
448 /** Fixed brw_reg Immediate value constructor. */
449 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
452 this->file = FIXED_HW_REG;
453 this->fixed_hw_reg = fixed_hw_reg;
454 this->type = fixed_hw_reg.type;
/* Equality: same file/offset/modifiers, neither side relative-addressed,
 * and identical fixed_hw_reg bytes.
 */
458 fs_reg::equals(const fs_reg &r) const
460 return (file == r.file &&
462 reg_offset == r.reg_offset &&
464 negate == r.negate &&
466 !reladdr && !r.reladdr &&
467 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
468 sizeof(fixed_hw_reg)) == 0 &&
/* Immediate-value tests: interpret the payload as float or int per type. */
474 fs_reg::is_zero() const
479 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
483 fs_reg::is_one() const
488 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
/* Returns the number of scalar register slots a GLSL type occupies:
 * components for scalars/vectors/matrices, recursive sums for arrays and
 * structs, and (per the original comment) zero for samplers.
 */
492 fs_visitor::type_size(const struct glsl_type *type)
494 unsigned int size, i;
496 switch (type->base_type) {
499 case GLSL_TYPE_FLOAT:
501 return type->components();
502 case GLSL_TYPE_ARRAY:
503 return type_size(type->fields.array) * type->length;
504 case GLSL_TYPE_STRUCT:
506 for (i = 0; i < type->length; i++) {
507 size += type_size(type->fields.structure[i].type);
510 case GLSL_TYPE_SAMPLER:
511 /* Samplers take up no register space, since they're baked in at
516 case GLSL_TYPE_ERROR:
517 case GLSL_TYPE_INTERFACE:
518 assert(!"not reached");
/* Reads the gen7+ architecture timestamp register into a fresh UD virtual
 * GRF (writemask-all, uncompressed, so it works regardless of dispatch
 * mask) and returns it; used by the shader-time instrumentation below.
 */
526 fs_visitor::get_timestamp()
528 assert(intel->gen >= 7);
530 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
533 BRW_REGISTER_TYPE_UD));
535 fs_reg dst = fs_reg(this, glsl_type::uint_type);
537 fs_inst *mov = emit(MOV(dst, ts));
538 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
539 * even if it's not enabled in the dispatch.
541 mov->force_writemask_all = true;
542 mov->force_uncompressed = true;
544 /* The caller wants the low 32 bits of the timestamp. Since it's running
545 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
546 * which is plenty of time for our purposes. It is identical across the
547 * EUs, but since it's tracking GPU core speed it will increment at a
548 * varying rate as render P-states change.
550 * The caller could also check if render P-states have changed (or anything
551 * else that might disrupt timing) by setting smear to 2 and checking if
552 * that field is != 0.
/* Record the start-of-shader timestamp for emit_shader_time_end(). */
560 fs_visitor::emit_shader_time_begin()
562 current_annotation = "shader time start";
563 shader_start_time = get_timestamp();
/* Emits the end-of-shader timing code: picks the ST_FS8/ST_FS16 record
 * types by dispatch width, reads a second timestamp, and -- guarded by an
 * IF on the timestamp-reset bit -- accumulates either (diff, written) or
 * a reset marker via emit_shader_time_write().
 */
567 fs_visitor::emit_shader_time_end()
569 current_annotation = "shader time end";
571 enum shader_time_shader_type type, written_type, reset_type;
572 if (dispatch_width == 8) {
574 written_type = ST_FS8_WRITTEN;
575 reset_type = ST_FS8_RESET;
577 assert(dispatch_width == 16);
579 written_type = ST_FS16_WRITTEN;
580 reset_type = ST_FS16_RESET;
583 fs_reg shader_end_time = get_timestamp();
585 /* Check that there weren't any timestamp reset events (assuming these
586 * were the only two timestamp reads that happened).
588 fs_reg reset = shader_end_time;
/* NOTE(review): `reset` presumably gets a smear/offset on an elided line
 * so the AND below tests the reset field -- confirm upstream. */
590 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
591 test->conditional_mod = BRW_CONDITIONAL_Z;
592 emit(IF(BRW_PREDICATE_NORMAL));
594 push_force_uncompressed();
595 fs_reg start = shader_start_time;
/* NOTE(review): `start` is presumably negated on an elided line, making
 * the ADD below compute end - start -- confirm upstream. */
597 fs_reg diff = fs_reg(this, glsl_type::uint_type);
598 emit(ADD(diff, start, shader_end_time));
600 /* If there were no instructions between the two timestamp gets, the diff
601 * is 2 cycles. Remove that overhead, so I can forget about that when
602 * trying to determine the time taken for single instructions.
604 emit(ADD(diff, diff, fs_reg(-2u)));
606 emit_shader_time_write(type, diff);
607 emit_shader_time_write(written_type, fs_reg(1u));
608 emit(BRW_OPCODE_ELSE);
609 emit_shader_time_write(reset_type, fs_reg(1u));
610 emit(BRW_OPCODE_ENDIF);
612 pop_force_uncompressed();
/* Writes one shader-time record: looks up the buffer slot for (type),
 * builds a payload sized by dispatch width, and emits the
 * SHADER_TIME_ADD send with (payload, offset, value).
 */
616 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
619 int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
621 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
624 if (dispatch_width == 8)
625 payload = fs_reg(this, glsl_type::uvec2_type);
627 payload = fs_reg(this, glsl_type::uint_type);
629 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
630 fs_reg(), payload, offset, value));
/* Records a compile failure: formats the varargs message into mem_ctx,
 * stores it in fail_msg, and echoes it to stderr under INTEL_DEBUG=wm.
 */
634 fs_visitor::fail(const char *format, ...)
644 va_start(va, format);
645 msg = ralloc_vasprintf(mem_ctx, format, va);
647 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649 this->fail_msg = msg;
651 if (INTEL_DEBUG & DEBUG_WM) {
652 fprintf(stderr, "%s", msg);
/* Convenience emit() overloads: wrap an opcode plus 0-3 operands in an
 * fs_inst and forward to the fs_inst-taking emit().
 */
657 fs_visitor::emit(enum opcode opcode)
659 return emit(fs_inst(opcode));
663 fs_visitor::emit(enum opcode opcode, fs_reg dst)
665 return emit(fs_inst(opcode, dst));
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
671 return emit(fs_inst(opcode, dst, src0));
675 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
677 return emit(fs_inst(opcode, dst, src0, src1));
681 fs_visitor::emit(enum opcode opcode, fs_reg dst,
682 fs_reg src0, fs_reg src1, fs_reg src2)
684 return emit(fs_inst(opcode, dst, src0, src1, src2));
/* Counter-based push/pop pairs for the force_uncompressed and
 * force_sechalf emission modes; the pops assert against underflow.
 */
688 fs_visitor::push_force_uncompressed()
690 force_uncompressed_stack++;
694 fs_visitor::pop_force_uncompressed()
696 force_uncompressed_stack--;
697 assert(force_uncompressed_stack >= 0);
701 fs_visitor::push_force_sechalf()
703 force_sechalf_stack++;
707 fs_visitor::pop_force_sechalf()
709 force_sechalf_stack--;
710 assert(force_sechalf_stack >= 0);
/* Per-opcode MRF write counts (message payload sizes), scaled by
 * dispatch width for SIMD16.  Return values for the texture/FB-write/
 * pull-constant/spill cases fall on elided lines.
 */
714 * Returns how many MRFs an FS opcode will write over.
716 * Note that this is not the 0 or 1 implied writes in an actual gen
717 * instruction -- the FS opcodes often generate MOVs in addition.
720 fs_visitor::implied_mrf_writes(fs_inst *inst)
725 switch (inst->opcode) {
726 case SHADER_OPCODE_RCP:
727 case SHADER_OPCODE_RSQ:
728 case SHADER_OPCODE_SQRT:
729 case SHADER_OPCODE_EXP2:
730 case SHADER_OPCODE_LOG2:
731 case SHADER_OPCODE_SIN:
732 case SHADER_OPCODE_COS:
733 return 1 * dispatch_width / 8;
734 case SHADER_OPCODE_POW:
735 case SHADER_OPCODE_INT_QUOTIENT:
736 case SHADER_OPCODE_INT_REMAINDER:
737 return 2 * dispatch_width / 8;
738 case SHADER_OPCODE_TEX:
740 case SHADER_OPCODE_TXD:
741 case SHADER_OPCODE_TXF:
742 case SHADER_OPCODE_TXF_MS:
743 case SHADER_OPCODE_TXL:
744 case SHADER_OPCODE_TXS:
745 case SHADER_OPCODE_LOD:
747 case FS_OPCODE_FB_WRITE:
749 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
750 case FS_OPCODE_UNSPILL:
752 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
754 case FS_OPCODE_SPILL:
757 assert(!"not reached");
/* Allocates a new virtual GRF of `size` registers, doubling the
 * virtual_grf_sizes array (starting at 16) when full; returns its index.
 */
763 fs_visitor::virtual_grf_alloc(int size)
765 if (virtual_grf_array_size <= virtual_grf_count) {
766 if (virtual_grf_array_size == 0)
767 virtual_grf_array_size = 16;
769 virtual_grf_array_size *= 2;
770 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
771 virtual_grf_array_size);
773 virtual_grf_sizes[virtual_grf_count] = size;
774 return virtual_grf_count++;
/* fs_reg constructors: fixed HW registers (F-typed or explicitly typed)
 * and the "automatic" form that allocates a virtual GRF sized for a
 * GLSL type.
 */
777 /** Fixed HW reg constructor. */
778 fs_reg::fs_reg(enum register_file file, int reg)
783 this->type = BRW_REGISTER_TYPE_F;
786 /** Fixed HW reg constructor. */
787 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
795 /** Automatic reg constructor. */
796 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
801 this->reg = v->virtual_grf_alloc(v->type_size(type));
802 this->reg_offset = 0;
803 this->type = brw_type_for_base_type(type);
/* variable_storage: looks up the fs_reg previously assigned to an
 * ir_variable in the per-visitor hash table (NULL if none).
 */
807 fs_visitor::variable_storage(ir_variable *var)
809 return (fs_reg *)hash_table_find(this->variable_ht, var);
/* Hash-table foreach callback: copies only UNIFORM-file entries into the
 * destination table (`closure`).  Note args are (data, key) on insert.
 */
813 import_uniforms_callback(const void *key,
817 struct hash_table *dst_ht = (struct hash_table *)closure;
818 const fs_reg *reg = (const fs_reg *)data;
820 if (reg->file != UNIFORM)
823 hash_table_insert(dst_ht, data, key);
826 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
827 * This brings in those uniform definitions
830 fs_visitor::import_uniforms(fs_visitor *v)
832 hash_table_call_foreach(v->variable_ht,
833 import_uniforms_callback,
835 this->params_remap = v->params_remap;
/* Walks gl_uniform_storage entries whose names match `ir` (exactly, or as
 * a "name." / "name[" prefix) and appends a param pointer per component
 * slot to c->prog_data; the closing assert checks the slot count matches
 * the type's component_slots().
 */
838 /* Our support for uniforms is piggy-backed on the struct
839 * gl_fragment_program, because that's where the values actually
840 * get stored, rather than in some global gl_shader_program uniform
844 fs_visitor::setup_uniform_values(ir_variable *ir)
846 int namelen = strlen(ir->name);
848 /* The data for our (non-builtin) uniforms is stored in a series of
849 * gl_uniform_driver_storage structs for each subcomponent that
850 * glGetUniformLocation() could name. We know it's been set up in the same
851 * order we'd walk the type, so walk the list of storage and find anything
852 * with our name, or the prefix of a component that starts with our name.
854 unsigned params_before = c->prog_data.nr_params;
855 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
856 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
858 if (strncmp(ir->name, storage->name, namelen) != 0 ||
859 (storage->name[namelen] != 0 &&
860 storage->name[namelen] != '.' &&
861 storage->name[namelen] != '[')) {
865 unsigned slots = storage->type->component_slots();
866 if (storage->array_elements)
867 slots *= storage->array_elements;
869 for (unsigned i = 0; i < slots; i++) {
870 c->prog_data.param[c->prog_data.nr_params++] =
871 &storage->storage[i].f;
875 /* Make sure we actually initialized the right amount of stuff here. */
876 assert(params_before + ir->type->component_slots() ==
877 c->prog_data.nr_params);
/* For built-in (state-var) uniforms: re-registers each state slot with the
 * program parameter list, then appends one param pointer per unique swizzle
 * component of the slot.
 */
881 /* Our support for builtin uniforms is even scarier than non-builtin.
882 * It sits on top of the PROG_STATE_VAR parameters that are
883 * automatically updated from GL context state.
886 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
888 const ir_state_slot *const slots = ir->state_slots;
889 assert(ir->state_slots != NULL);
891 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
892 /* This state reference has already been setup by ir_to_mesa, but we'll
893 * get the same index back here.
895 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
896 (gl_state_index *)slots[i].tokens);
898 /* Add each of the unique swizzles of the element as a parameter.
899 * This'll end up matching the expected layout of the
900 * array/matrix/structure we're trying to fill in.
/* NOTE(review): `last_swiz` is declared/updated on elided lines; the
 * `swiz == last_swiz` test below dedupes repeated trailing swizzles. */
903 for (unsigned int j = 0; j < 4; j++) {
904 int swiz = GET_SWZ(slots[i].swizzle, j);
905 if (swiz == last_swiz)
909 c->prog_data.param[c->prog_data.nr_params++] =
910 &fp->Base.Parameters->ParameterValues[index][swiz].f;
/* Builds gl_FragCoord: x from pixel_x (+0.5 unless integer pixel center),
 * y possibly flipped for FBO rendering (negate + drawable_height offset),
 * z from the depth payload (gen6+) or a LINTERP of the position slot, and
 * w reused from the interpolation setup.  `wpos` advances through the
 * destination vec4 on elided lines.
 */
916 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
918 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
920 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
923 if (ir->pixel_center_integer) {
924 emit(MOV(wpos, this->pixel_x));
926 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
931 if (!flip && ir->pixel_center_integer) {
932 emit(MOV(wpos, this->pixel_y));
934 fs_reg pixel_y = this->pixel_y;
935 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
938 pixel_y.negate = true;
939 offset += c->key.drawable_height - 1.0;
942 emit(ADD(wpos, pixel_y, fs_reg(offset)));
947 if (intel->gen >= 6) {
948 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
950 emit(FS_OPCODE_LINTERP, wpos,
951 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
952 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
953 interp_reg(VARYING_SLOT_POS, 2));
957 /* gl_FragCoord.w: Already set up in emit_interpolation */
958 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
/* Emits a LINTERP of `interp` into `attr`, choosing the barycentric mode
 * from the interpolation qualifier (perspective vs. noperspective) and a
 * centroid flag (the centroid/pixel branch condition is on elided lines).
 */
964 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
965 glsl_interp_qualifier interpolation_mode,
968 brw_wm_barycentric_interp_mode barycoord_mode;
970 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
971 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
973 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
975 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
976 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
978 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
980 return emit(FS_OPCODE_LINTERP, attr,
981 this->delta_x[barycoord_mode],
982 this->delta_y[barycoord_mode], interp);
/* Emits interpolation for a general varying: iterates array elements and
 * matrix columns, skipping slots with no SF setup data, using CINTERP for
 * flat shading and LINTERP otherwise (with the unlit-centroid hardware
 * workaround re-emitting non-centroid data under an inverted predicate),
 * and dividing by w via pixel_w on pre-gen6.
 */
986 fs_visitor::emit_general_interpolation(ir_variable *ir)
988 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
989 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
992 unsigned int array_elements;
993 const glsl_type *type;
995 if (ir->type->is_array()) {
996 array_elements = ir->type->length;
997 if (array_elements == 0) {
998 fail("dereferenced array '%s' has length 0\n", ir->name);
1000 type = ir->type->fields.array;
1006 glsl_interp_qualifier interpolation_mode =
1007 ir->determine_interpolation_mode(c->key.flat_shade);
1009 int location = ir->location;
1010 for (unsigned int i = 0; i < array_elements; i++) {
1011 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1012 if (urb_setup[location] == -1) {
1013 /* If there's no incoming setup data for this slot, don't
1014 * emit interpolation for it.
1016 attr.reg_offset += type->vector_elements;
1021 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1022 /* Constant interpolation (flat shading) case. The SF has
1023 * handed us defined values in only the constant offset
1024 * field of the setup reg.
1026 for (unsigned int k = 0; k < type->vector_elements; k++) {
1027 struct brw_reg interp = interp_reg(location, k);
1028 interp = suboffset(interp, 3);
1029 interp.type = reg->type;
1030 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1034 /* Smooth/noperspective interpolation case. */
1035 for (unsigned int k = 0; k < type->vector_elements; k++) {
1036 /* FINISHME: At some point we probably want to push
1037 * this farther by giving similar treatment to the
1038 * other potentially constant components of the
1039 * attribute, as well as making brw_vs_constval.c
1040 * handle varyings other than gl_TexCoord.
1042 struct brw_reg interp = interp_reg(location, k);
1043 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1045 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1046 /* Get the pixel/sample mask into f0 so that we know
1047 * which pixels are lit. Then, for each channel that is
1048 * unlit, replace the centroid data with non-centroid
1051 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1052 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1053 interpolation_mode, false);
1054 inst->predicate = BRW_PREDICATE_NORMAL;
1055 inst->predicate_inverse = true;
1057 if (intel->gen < 6) {
1058 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
/* Computes gl_FrontFacing from the thread payload: on gen6+ by shifting/
 * masking a sign bit out of g0, on older gens by an unsigned compare
 * against bit 31 of r1.6 ("primitive is back face") then masking to 0/1.
 */
1072 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1074 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1076 /* The frontfacing comes in as a bit in the thread payload. */
1077 if (intel->gen >= 6) {
1078 emit(BRW_OPCODE_ASR, *reg,
1079 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1081 emit(BRW_OPCODE_NOT, *reg, *reg);
1082 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1084 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1085 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1088 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1089 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
/* Copies a math-message operand into a plain temp when the hardware can't
 * consume it directly (gen6 stride/modifier limits; gen7 still rejects
 * immediates); early-outs for already-acceptable operands fall on elided
 * lines.
 */
1096 fs_visitor::fix_math_operand(fs_reg src)
1098 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1099 * might be able to do better by doing execsize = 1 math and then
1100 * expanding that result out, but we would need to be careful with
1103 * The hardware ignores source modifiers (negate and abs) on math
1104 * instructions, so we also move to a temp to set those up.
1106 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1107 !src.abs && !src.negate)
1110 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1113 if (intel->gen >= 7 && src.file != IMM)
1116 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1117 expanded.type = src.type;
1118 emit(BRW_OPCODE_MOV, expanded, src);
/* Unary math emitter: validates the opcode, fixes the operand on gen6+,
 * and on pre-gen6 configures the message length for the math send.
 */
1123 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1126 case SHADER_OPCODE_RCP:
1127 case SHADER_OPCODE_RSQ:
1128 case SHADER_OPCODE_SQRT:
1129 case SHADER_OPCODE_EXP2:
1130 case SHADER_OPCODE_LOG2:
1131 case SHADER_OPCODE_SIN:
1132 case SHADER_OPCODE_COS:
1135 assert(!"not reached: bad math opcode");
1139 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1140 * might be able to do better by doing execsize = 1 math and then
1141 * expanding that result out, but we would need to be careful with
1144 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1145 * instructions, so we also move to a temp to set those up.
1147 if (intel->gen >= 6)
1148 src = fix_math_operand(src);
1150 fs_inst *inst = emit(opcode, dst, src);
1152 if (intel->gen < 6) {
1154 inst->mlen = dispatch_width / 8;
/* Binary math emitter (POW / INT_QUOTIENT / INT_REMAINDER): rejects
 * 16-wide INTDIV on gen7+, fixes operands on gen6+, and on pre-gen6 routes
 * the second operand through an MRF payload with the INT-DIV operand-swap
 * quirk from the Ironlake PRM.
 */
1161 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1167 case SHADER_OPCODE_INT_QUOTIENT:
1168 case SHADER_OPCODE_INT_REMAINDER:
1169 if (intel->gen >= 7 && dispatch_width == 16)
1170 fail("16-wide INTDIV unsupported\n");
1172 case SHADER_OPCODE_POW:
1175 assert(!"not reached: unsupported binary math opcode.");
1179 if (intel->gen >= 6) {
1180 src0 = fix_math_operand(src0);
1181 src1 = fix_math_operand(src1);
1183 inst = emit(opcode, dst, src0, src1);
1185 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1186 * "Message Payload":
1188 * "Operand0[7]. For the INT DIV functions, this operand is the
1191 * "Operand1[7]. For the INT DIV functions, this operand is the
1194 bool is_int_div = opcode != SHADER_OPCODE_POW;
1195 fs_reg &op0 = is_int_div ? src1 : src0;
1196 fs_reg &op1 = is_int_div ? src0 : src1;
1198 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1199 inst = emit(opcode, dst, op0, reg_null_f);
1201 inst->base_mrf = base_mrf;
1202 inst->mlen = 2 * dispatch_width / 8;
/* Computes the CURBE (push-constant) read length and first GRF, then
 * rewrites every UNIFORM-file source operand into the fixed payload
 * register it lands in.
 */
1208 fs_visitor::assign_curb_setup()
1210 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1211 if (dispatch_width == 8) {
1212 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1214 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1217 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1218 foreach_list(node, &this->instructions) {
1219 fs_inst *inst = (fs_inst *)node;
1221 for (unsigned int i = 0; i < 3; i++) {
1222 if (inst->src[i].file == UNIFORM) {
1223 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1224 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1228 inst->src[i].file = FIXED_HW_REG;
1229 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
/* Assigns a URB slot index to each varying: on gen6+ directly from
 * InputsRead; on older gens from the SF's input_slots_valid layout
 * (skipping point size, counting unread slots, and appending gl_PointCoord
 * which the SF interpolates itself).  Sets urb_read_length (2 half-regs
 * per 4-channel attribute).
 */
1236 fs_visitor::calculate_urb_setup()
1238 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1243 /* Figure out where each of the incoming setup attributes lands. */
1244 if (intel->gen >= 6) {
1245 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1246 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1247 urb_setup[i] = urb_next++;
1251 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1252 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1253 /* Point size is packed into the header, not as a general attribute */
1254 if (i == VARYING_SLOT_PSIZ)
1257 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1258 /* The back color slot is skipped when the front color is
1259 * also written to. In addition, some slots can be
1260 * written in the vertex shader and not read in the
1261 * fragment shader. So the register number must always be
1262 * incremented, mapped or not.
1264 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1265 urb_setup[i] = urb_next;
1271 * It's a FS only attribute, and we did interpolation for this attribute
1272 * in SF thread. So, count it here, too.
1274 * See compile_sf_prog() for more info.
1276 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1277 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1280 /* Each attribute is 4 setup channels, each of which is half a reg. */
1281 c->prog_data.urb_read_length = urb_next * 2;
/* Rebases the fixed-HW-reg operands of LINTERP/CINTERP instructions by the
 * actual start of the URB setup registers (after payload + push
 * constants), and records the first non-payload GRF.
 */
1285 fs_visitor::assign_urb_setup()
1287 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289 /* Offset all the urb_setup[] index by the actual position of the
1290 * setup regs, now that the location of the constants has been chosen.
1292 foreach_list(node, &this->instructions) {
1293 fs_inst *inst = (fs_inst *)node;
1295 if (inst->opcode == FS_OPCODE_LINTERP) {
1296 assert(inst->src[2].file == FIXED_HW_REG);
1297 inst->src[2].fixed_hw_reg.nr += urb_start;
1300 if (inst->opcode == FS_OPCODE_CINTERP) {
1301 assert(inst->src[0].file == FIXED_HW_REG);
1302 inst->src[0].fixed_hw_reg.nr += urb_start;
1306 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
/* Splits multi-register virtual GRFs into size-1 registers where legal:
 * marks candidates, un-marks those needed contiguous (PLN delta_xy,
 * multi-reg SEND destinations, send-from-GRF sources), allocates the new
 * contiguous single-reg names, then patches every dst/src reference and
 * invalidates live intervals.
 */
1310 * Split large virtual GRFs into separate components if we can.
1312 * This is mostly duplicated with what brw_fs_vector_splitting does,
1313 * but that's really conservative because it's afraid of doing
1314 * splitting that doesn't result in real progress after the rest of
1315 * the optimization phases, which would cause infinite looping in
1316 * optimization. We can do it once here, safely. This also has the
1317 * opportunity to split interpolated values, or maybe even uniforms,
1318 * which we don't have at the IR level.
1320 * We want to split, because virtual GRFs are what we register
1321 * allocate and spill (due to contiguousness requirements for some
1322 * instructions), and they're what we naturally generate in the
1323 * codegen process, but most virtual GRFs don't actually need to be
1324 * contiguous sets of GRFs. If we split, we'll end up with reduced
1325 * live intervals and better dead code elimination and coalescing.
1328 fs_visitor::split_virtual_grfs()
1330 int num_vars = this->virtual_grf_count;
1331 bool split_grf[num_vars];
1332 int new_virtual_grf[num_vars];
1334 /* Try to split anything > 0 sized. */
1335 for (int i = 0; i < num_vars; i++) {
1336 if (this->virtual_grf_sizes[i] != 1)
1337 split_grf[i] = true;
1339 split_grf[i] = false;
1343 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1344 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1345 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1346 * Gen6, that was the only supported interpolation mode, and since Gen6,
1347 * delta_x and delta_y are in fixed hardware registers.
1349 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1353 foreach_list(node, &this->instructions) {
1354 fs_inst *inst = (fs_inst *)node;
1356 /* If there's a SEND message that requires contiguous destination
1357 * registers, no splitting is allowed.
1359 if (inst->regs_written > 1) {
1360 split_grf[inst->dst.reg] = false;
1363 /* If we're sending from a GRF, don't split it, on the assumption that
1364 * the send is reading the whole thing.
1366 if (inst->is_send_from_grf()) {
1367 split_grf[inst->src[0].reg] = false;
1371 /* Allocate new space for split regs. Note that the virtual
1372 * numbers will be contiguous.
1374 for (int i = 0; i < num_vars; i++) {
1376 new_virtual_grf[i] = virtual_grf_alloc(1);
1377 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1378 int reg = virtual_grf_alloc(1);
1379 assert(reg == new_virtual_grf[i] + j - 1);
1382 this->virtual_grf_sizes[i] = 1;
/* Patch all register references: offset 0 keeps the original name; any
 * non-zero reg_offset is redirected to the matching new single reg.
 */
1386 foreach_list(node, &this->instructions) {
1387 fs_inst *inst = (fs_inst *)node;
1389 if (inst->dst.file == GRF &&
1390 split_grf[inst->dst.reg] &&
1391 inst->dst.reg_offset != 0) {
1392 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1393 inst->dst.reg_offset - 1);
1394 inst->dst.reg_offset = 0;
1396 for (int i = 0; i < 3; i++) {
1397 if (inst->src[i].file == GRF &&
1398 split_grf[inst->src[i].reg] &&
1399 inst->src[i].reg_offset != 0) {
1400 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1401 inst->src[i].reg_offset - 1);
1402 inst->src[i].reg_offset = 0;
1406 this->live_intervals_valid = false;
1410 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412 * During code generation, we create tons of temporary variables, many of
1413 * which get immediately killed and are never used again. Yet, in later
1414 * optimization and analysis passes, such as compute_live_intervals, we need
1415 * to loop over all the virtual GRFs. Compacting them can save a lot of
// Renumber virtual GRFs so that only live (referenced) ones remain,
// compacting virtual_grf_sizes/use/def, then patch all instruction
// operands and the fs_visitor's direct fs_reg members to the new numbers.
1419 fs_visitor::compact_virtual_grfs()
1421 /* Mark which virtual GRFs are used, and count how many. */
// -1 == unused; memset works because -1 is all-ones in every byte.
1422 int remap_table[this->virtual_grf_count];
1423 memset(remap_table, -1, sizeof(remap_table));
1425 foreach_list(node, &this->instructions) {
1426 const fs_inst *inst = (const fs_inst *) node;
1428 if (inst->dst.file == GRF)
1429 remap_table[inst->dst.reg] = 0;
1431 for (int i = 0; i < 3; i++) {
1432 if (inst->src[i].file == GRF)
1433 remap_table[inst->src[i].reg] = 0;
1437 /* In addition to registers used in instructions, fs_visitor keeps
1438 * direct references to certain special values which must be patched:
1440 fs_reg *special[] = {
1441 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1442 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1443 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1444 &delta_x[0], &delta_x[1], &delta_x[2],
1445 &delta_x[3], &delta_x[4], &delta_x[5],
1446 &delta_y[0], &delta_y[1], &delta_y[2],
1447 &delta_y[3], &delta_y[4], &delta_y[5],
// Guard the hard-coded array sizes above against enum changes.
1449 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1450 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452 /* Treat all special values as used, to be conservative */
1453 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1454 if (special[i]->file == GRF)
1455 remap_table[special[i]->reg] = 0;
1458 /* Compact the GRF arrays. */
// Assign new contiguous indices to used GRFs, preserving relative order.
1460 for (int i = 0; i < this->virtual_grf_count; i++) {
1461 if (remap_table[i] != -1) {
1462 remap_table[i] = new_index;
1463 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
// Keep liveness data coherent only if it is currently valid.
1464 if (live_intervals_valid) {
1465 virtual_grf_use[new_index] = virtual_grf_use[i];
1466 virtual_grf_def[new_index] = virtual_grf_def[i];
1472 this->virtual_grf_count = new_index;
1474 /* Patch all the instructions to use the newly renumbered registers */
1475 foreach_list(node, &this->instructions) {
1476 fs_inst *inst = (fs_inst *) node;
1478 if (inst->dst.file == GRF)
1479 inst->dst.reg = remap_table[inst->dst.reg];
1481 for (int i = 0; i < 3; i++) {
1482 if (inst->src[i].file == GRF)
1483 inst->src[i].reg = remap_table[inst->src[i].reg];
1487 /* Patch all the references to special values */
1488 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1489 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1490 special[i]->reg = remap_table[special[i]->reg];
// Drop push-constant (uniform) slots that no instruction reads.  The
// remap table is built once during the 8-wide compile and reused by the
// 16-wide compile (see the assert below), so both passes agree on the
// final uniform numbering.
1495 fs_visitor::remove_dead_constants()
1497 if (dispatch_width == 8) {
1498 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
// -1 marks "dead" until proven used.
1500 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1501 this->params_remap[i] = -1;
1503 /* Find which params are still in use. */
1504 foreach_list(node, &this->instructions) {
1505 fs_inst *inst = (fs_inst *)node;
1507 for (int i = 0; i < 3; i++) {
// NOTE(review): computed before the UNIFORM check below; presumably only
// consumed after the non-UNIFORM case continues — confirm in full source.
1508 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1510 if (inst->src[i].file != UNIFORM)
1513 assert(constant_nr < (int)c->prog_data.nr_params);
1515 /* For now, set this to non-negative. We'll give it the
1516 * actual new number in a moment, in order to keep the
1517 * register numbers nicely ordered.
1519 this->params_remap[constant_nr] = 0;
1523 /* Figure out what the new numbers for the params will be. At some
1524 * point when we're doing uniform array access, we're going to want
1525 * to keep the distinction between .reg and .reg_offset, but for
1526 * now we don't care.
1528 unsigned int new_nr_params = 0;
1529 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1530 if (this->params_remap[i] != -1) {
1531 this->params_remap[i] = new_nr_params++;
1535 /* Update the list of params to be uploaded to match our new numbering. */
// In-place left-compaction is safe because remapped <= i always holds.
1536 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1537 int remapped = this->params_remap[i];
1542 c->prog_data.param[remapped] = c->prog_data.param[i];
1545 c->prog_data.nr_params = new_nr_params;
1547 /* This should have been generated in the 8-wide pass already. */
1548 assert(this->params_remap);
1551 /* Now do the renumbering of the shader to remove unused params. */
1552 foreach_list(node, &this->instructions) {
1553 fs_inst *inst = (fs_inst *)node;
1555 for (int i = 0; i < 3; i++) {
1556 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1558 if (inst->src[i].file != UNIFORM)
// Every surviving uniform read must have been seen in the marking pass.
1561 assert(this->params_remap[constant_nr] != -1);
1562 inst->src[i].reg = this->params_remap[constant_nr];
// Offset is folded into the flat remapped index.
1563 inst->src[i].reg_offset = 0;
1571 * Implements array access of uniforms by inserting a
1572 * PULL_CONSTANT_LOAD instruction.
1574 * Unlike temporary GRF array access (where we don't support it due to
1575 * the difficulty of doing relative addressing on instruction
1576 * destinations), we could potentially do array access of uniforms
1577 * that were loaded in GRF space as push constants. In real-world
1578 * usage we've seen, though, the arrays being used are always larger
1579 * than we could load as push constants, so just always move all
1580 * uniform array access out to a pull constant buffer.
// Rewrite every reladdr (indirectly indexed) UNIFORM read into a
// VARYING_PULL_CONSTANT_LOAD from the pull-constant buffer, copying the
// whole accessed array into pull_param first.  See the rationale in the
// comment block above this function.
1583 fs_visitor::move_uniform_array_access_to_pull_constants()
// Maps uniform index -> location in the pull-constant buffer, -1 if not yet copied.
1585 int pull_constant_loc[c->prog_data.nr_params];
1587 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1588 pull_constant_loc[i] = -1;
1591 /* Walk through and find array access of uniforms. Put a copy of that
1592 * uniform in the pull constant buffer.
1594 * Note that we don't move constant-indexed accesses to arrays. No
1595 * testing has been done of the performance impact of this choice.
// foreach_list_safe: we insert new instructions while iterating.
1597 foreach_list_safe(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1600 for (int i = 0 ; i < 3; i++) {
1601 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1604 int uniform = inst->src[i].reg;
1606 /* If this array isn't already present in the pull constant buffer,
1609 if (pull_constant_loc[uniform] == -1) {
1610 const float **values = &c->prog_data.param[uniform];
1612 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
// param_size[uniform] is the element count of the accessed array.
1614 assert(param_size[uniform]);
1616 for (int j = 0; j < param_size[uniform]; j++) {
1617 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1622 /* Set up the annotation tracking for new generated instructions. */
1624 current_annotation = inst->annotation;
1626 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1627 fs_reg temp = fs_reg(this, glsl_type::float_type);
1628 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1630 *inst->src[i].reladdr,
1631 pull_constant_loc[uniform] +
1632 inst->src[i].reg_offset);
1633 inst->insert_before(&list);
// Redirect the operand at the loaded temporary; reladdr is now consumed.
1635 inst->src[i].file = temp.file;
1636 inst->src[i].reg = temp.reg;
1637 inst->src[i].reg_offset = temp.reg_offset;
1638 inst->src[i].reladdr = NULL;
1644 * Choose accesses from the UNIFORM file to demote to using the pull
1647 * We allow a fragment shader to have more than the specified minimum
1648 * maximum number of fragment shader uniform components (64). If
1649 * there are too many of these, they'd fill up all of register space.
1650 * So, this will push some of them out to the pull constant buffer and
1651 * update the program to load them.
// Demote uniforms beyond the 16-register push-constant budget to the
// pull-constant buffer, inserting a UNIFORM_PULL_CONSTANT_LOAD before
// each use of a demoted uniform.
1654 fs_visitor::setup_pull_constants()
1656 /* Only allow 16 registers (128 uniform components) as push constants. */
1657 unsigned int max_uniform_components = 16 * 8;
1658 if (c->prog_data.nr_params <= max_uniform_components)
1661 if (dispatch_width == 16) {
1662 fail("Pull constants not supported in 16-wide\n");
1666 /* Just demote the end of the list. We could probably do better
1667 * here, demoting things that are rarely used in the program first.
1669 unsigned int pull_uniform_base = max_uniform_components;
// -1 == stays a push constant; otherwise index into pull_param.
1671 int pull_constant_loc[c->prog_data.nr_params];
1672 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1673 if (i < pull_uniform_base) {
1674 pull_constant_loc[i] = -1;
1676 pull_constant_loc[i] = -1;
1677 /* If our constant is already being uploaded for reladdr purposes,
1680 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
// Pointer identity: the same backing storage means the same constant.
1681 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1682 pull_constant_loc[i] = j;
1686 if (pull_constant_loc[i] == -1) {
1687 int pull_index = c->prog_data.nr_pull_params++;
1688 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
// NOTE(review): stray double semicolon below — a harmless empty statement.
1689 pull_constant_loc[i] = pull_index;;
1693 c->prog_data.nr_params = pull_uniform_base;
1695 foreach_list(node, &this->instructions) {
1696 fs_inst *inst = (fs_inst *)node;
1698 for (int i = 0; i < 3; i++) {
1699 if (inst->src[i].file != UNIFORM)
1702 int pull_index = pull_constant_loc[inst->src[i].reg +
1703 inst->src[i].reg_offset];
1704 if (pull_index == -1)
// reladdr accesses were already moved out by
// move_uniform_array_access_to_pull_constants().
1707 assert(!inst->src[i].reladdr);
1709 fs_reg dst = fs_reg(this, glsl_type::float_type);
1710 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
// Byte offset rounded down to the containing vec4; the component within
// it is selected via .smear below.
1711 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1713 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1714 dst, index, offset);
// Preserve debug annotations from the instruction being patched.
1715 pull->ir = inst->ir;
1716 pull->annotation = inst->annotation;
1718 inst->insert_before(pull);
1720 inst->src[i].file = GRF;
1721 inst->src[i].reg = dst.reg;
1722 inst->src[i].reg_offset = 0;
1723 inst->src[i].smear = pull_index & 3;
// Algebraic simplification: rewrite trivial identities on instructions
// whose second source is an immediate (x*1 -> MOV x, x*0 -> MOV 0,
// x+0 -> MOV x).  Returns whether anything changed (per the `progress`
// flag; the return is elided in this listing).
1729 fs_visitor::opt_algebraic()
1731 bool progress = false;
1733 foreach_list(node, &this->instructions) {
1734 fs_inst *inst = (fs_inst *)node;
1736 switch (inst->opcode) {
1737 case BRW_OPCODE_MUL:
1738 if (inst->src[1].file != IMM)
// a * 1.0 = a: degrade to a plain MOV of src0.
1742 if (inst->src[1].is_one()) {
1743 inst->opcode = BRW_OPCODE_MOV;
1744 inst->src[1] = reg_undef;
// a * 0.0 = 0.0: MOV the zero immediate into the destination.
1750 if (inst->src[1].is_zero()) {
1751 inst->opcode = BRW_OPCODE_MOV;
1752 inst->src[0] = inst->src[1];
1753 inst->src[1] = reg_undef;
1759 case BRW_OPCODE_ADD:
1760 if (inst->src[1].file != IMM)
// a + 0.0 = a.
1764 if (inst->src[1].is_zero()) {
1765 inst->opcode = BRW_OPCODE_MOV;
1766 inst->src[1] = reg_undef;
1780 * Must be called after calculate_live_intervals() to remove unused
1781 * writes to registers -- register allocation will fail otherwise
1782 * because something defined but not used won't be considered to
1783 * interfere with other regs.
// Remove instructions whose GRF destination is never read afterwards,
// using the live-interval data (see the comment block above: required so
// register allocation doesn't see dead defs as interferences).
1786 fs_visitor::dead_code_eliminate()
1788 bool progress = false;
1791 calculate_live_intervals();
1793 foreach_list_safe(node, &this->instructions) {
1794 fs_inst *inst = (fs_inst *)node;
// Dead if the last use of the dst GRF is at or before this instruction
// (pc appears to be the running instruction counter — declaration elided).
1796 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
// Removal changed liveness, so invalidate the cached intervals.
1805 live_intervals_valid = false;
1811 * Implements a second type of register coalescing: This one checks if
1812 * the two regs involved in a raw move don't interfere, in which case
1813 * they can both be stored in the same place and the MOV removed.
// Second coalescing pass: if a raw MOV copies between two GRFs whose live
// ranges do not interfere, store both values in the same register and
// drop the MOV (see the comment block above this function).
1816 fs_visitor::register_coalesce_2()
1818 bool progress = false;
1820 calculate_live_intervals();
1822 foreach_list_safe(node, &this->instructions) {
1823 fs_inst *inst = (fs_inst *)node;
// Only handle a plain, unmodified GRF->GRF MOV of a size-1 source whose
// range doesn't interfere with the destination's.
1825 if (inst->opcode != BRW_OPCODE_MOV ||
1828 inst->src[0].file != GRF ||
1829 inst->src[0].negate ||
1831 inst->src[0].smear != -1 ||
1832 inst->dst.file != GRF ||
1833 inst->dst.type != inst->src[0].type ||
1834 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1835 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1839 int reg_from = inst->src[0].reg;
// Source is size 1, so its only offset is 0.
1840 assert(inst->src[0].reg_offset == 0);
1841 int reg_to = inst->dst.reg;
1842 int reg_to_offset = inst->dst.reg_offset;
// Redirect every def and use of reg_from onto reg_to at the MOV's offset.
1844 foreach_list(node, &this->instructions) {
1845 fs_inst *scan_inst = (fs_inst *)node;
1847 if (scan_inst->dst.file == GRF &&
1848 scan_inst->dst.reg == reg_from) {
1849 scan_inst->dst.reg = reg_to;
1850 scan_inst->dst.reg_offset = reg_to_offset;
1852 for (int i = 0; i < 3; i++) {
1853 if (scan_inst->src[i].file == GRF &&
1854 scan_inst->src[i].reg == reg_from) {
1855 scan_inst->src[i].reg = reg_to;
1856 scan_inst->src[i].reg_offset = reg_to_offset;
1863 /* We don't need to recalculate live intervals inside the loop despite
1864 * flagging live_intervals_valid because we only use live intervals for
1865 * the interferes test, and we must have had a situation where the
1876 * Some register R that might get coalesced with one of these two could
1877 * only be referencing "to", otherwise "from"'s range would have been
1878 * longer. R's range could also only start at the end of "to" or later,
1879 * otherwise it will conflict with "to" when we try to coalesce "to"
1882 live_intervals_valid = false;
// Primary coalescing pass: for a GRF/UNIFORM -> GRF MOV that dominates
// the rest of the program (not inside a loop or if-block), rewrite later
// reads of the MOV's destination to read the source directly, provided
// no intervening instruction overwrites either register.
1892 fs_visitor::register_coalesce()
1894 bool progress = false;
1898 foreach_list_safe(node, &this->instructions) {
1899 fs_inst *inst = (fs_inst *)node;
1901 /* Make sure that we dominate the instructions we're going to
1902 * scan for interfering with our coalescing, or we won't have
1903 * scanned enough to see if anything interferes with our
1904 * coalescing. We don't dominate the following instructions if
1905 * we're in a loop or an if block.
// Track loop/if nesting via the control-flow opcodes (depth bookkeeping
// lines are elided in this listing).
1907 switch (inst->opcode) {
1911 case BRW_OPCODE_WHILE:
1917 case BRW_OPCODE_ENDIF:
1923 if (loop_depth || if_depth)
// Candidate filter: raw typed MOV from GRF or UNIFORM into a GRF.
1926 if (inst->opcode != BRW_OPCODE_MOV ||
1929 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1930 inst->src[0].file != UNIFORM)||
1931 inst->dst.type != inst->src[0].type)
1934 bool has_source_modifiers = (inst->src[0].abs ||
1935 inst->src[0].negate ||
1936 inst->src[0].smear != -1 ||
1937 inst->src[0].file == UNIFORM);
1939 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1940 * them: check for no writes to either one until the exit of the
1943 bool interfered = false;
// Scan forward to the end of the instruction stream for blockers.
1945 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1946 !scan_inst->is_tail_sentinel();
1947 scan_inst = (fs_inst *)scan_inst->next) {
1948 if (scan_inst->dst.file == GRF) {
1949 if (scan_inst->overwrites_reg(inst->dst) ||
1950 scan_inst->overwrites_reg(inst->src[0])) {
1956 /* The gen6 MATH instruction can't handle source modifiers or
1957 * unusual register regions, so avoid coalescing those for
1958 * now. We should do something more specific.
1960 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1965 /* The accumulator result appears to get used for the
1966 * conditional modifier generation. When negating a UD
1967 * value, there is a 33rd bit generated for the sign in the
1968 * accumulator value, so now you can't check, for example,
1969 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1971 if (scan_inst->conditional_mod &&
1972 inst->src[0].negate &&
1973 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1982 /* Rewrite the later usage to point at the source of the move to
1985 for (fs_inst *scan_inst = inst;
1986 !scan_inst->is_tail_sentinel();
1987 scan_inst = (fs_inst *)scan_inst->next) {
1988 for (int i = 0; i < 3; i++) {
1989 if (scan_inst->src[i].file == GRF &&
1990 scan_inst->src[i].reg == inst->dst.reg &&
1991 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1992 fs_reg new_src = inst->src[0];
1993 if (scan_inst->src[i].abs) {
// Compose the modifiers of the MOV source and the consuming operand.
1997 new_src.negate ^= scan_inst->src[i].negate;
1998 scan_inst->src[i] = new_src;
2008 live_intervals_valid = false;
// Eliminate GRF->MRF MOVs by retargeting the instruction that produced
// the GRF value to write directly into the MRF, when it is safe to move
// the MRF write earlier in the stream.
2015 fs_visitor::compute_to_mrf()
2017 bool progress = false;
2020 calculate_live_intervals();
2022 foreach_list_safe(node, &this->instructions) {
2023 fs_inst *inst = (fs_inst *)node;
// Candidate: raw, unmodified, same-typed GRF -> MRF MOV.
2028 if (inst->opcode != BRW_OPCODE_MOV ||
2030 inst->dst.file != MRF || inst->src[0].file != GRF ||
2031 inst->dst.type != inst->src[0].type ||
2032 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2035 /* Work out which hardware MRF registers are written by this
// COMPR4 is an encoding flag in the MRF number, not part of the register
// index; mask it off to get the base hardware MRF.
2038 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2040 if (inst->dst.reg & BRW_MRF_COMPR4) {
2041 mrf_high = mrf_low + 4;
2042 } else if (dispatch_width == 16 &&
2043 (!inst->force_uncompressed && !inst->force_sechalf)) {
// A full-width 16-wide write covers two adjacent MRFs.
2044 mrf_high = mrf_low + 1;
2049 /* Can't compute-to-MRF this GRF if someone else was going to
// i.e. the GRF is still live past this MOV (ip is the running
// instruction counter — its declaration is elided in this listing).
2052 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2055 /* Found a move of a GRF to a MRF. Let's see if we can go
2056 * rewrite the thing that made this GRF to write into the MRF.
// Walk backwards from the MOV looking for the defining instruction.
2059 for (scan_inst = (fs_inst *)inst->prev;
2060 scan_inst->prev != NULL;
2061 scan_inst = (fs_inst *)scan_inst->prev) {
2062 if (scan_inst->dst.file == GRF &&
2063 scan_inst->dst.reg == inst->src[0].reg) {
2064 /* Found the last thing to write our reg we want to turn
2065 * into a compute-to-MRF.
2068 /* If it's predicated, it (probably) didn't populate all
2069 * the channels. We might be able to rewrite everything
2070 * that writes that reg, but it would require smarter
2071 * tracking to delay the rewriting until complete success.
2073 if (scan_inst->predicate)
2076 /* If it's half of register setup and not the same half as
2077 * our MOV we're trying to remove, bail for now.
2079 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2080 scan_inst->force_sechalf != inst->force_sechalf) {
2084 /* Things returning more than one register would need us to
2085 * understand coalescing out more than one MOV at a time.
2087 if (scan_inst->regs_written > 1)
2090 /* SEND instructions can't have MRF as a destination. */
2091 if (scan_inst->mlen)
2094 if (intel->gen == 6) {
2095 /* gen6 math instructions must have the destination be
2096 * GRF, so no compute-to-MRF for them.
2098 if (scan_inst->is_math()) {
2103 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2104 /* Found the creator of our MRF's source value. */
// Retarget the producer to the MRF and fold in the MOV's saturate.
2105 scan_inst->dst.file = MRF;
2106 scan_inst->dst.reg = inst->dst.reg;
2107 scan_inst->saturate |= inst->saturate;
2114 /* We don't handle control flow here. Most computation of
2115 * values that end up in MRFs is shortly before the MRF
2118 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2121 /* You can't read from an MRF, so if someone else reads our
2122 * MRF's source GRF that we wanted to rewrite, that stops us.
2124 bool interfered = false;
2125 for (int i = 0; i < 3; i++) {
2126 if (scan_inst->src[i].file == GRF &&
2127 scan_inst->src[i].reg == inst->src[0].reg &&
2128 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2135 if (scan_inst->dst.file == MRF) {
2136 /* If somebody else writes our MRF here, we can't
2137 * compute-to-MRF before that.
// Same COMPR4 / 16-wide span computation as for the candidate MOV above.
2139 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2142 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2143 scan_mrf_high = scan_mrf_low + 4;
2144 } else if (dispatch_width == 16 &&
2145 (!scan_inst->force_uncompressed &&
2146 !scan_inst->force_sechalf)) {
2147 scan_mrf_high = scan_mrf_low + 1;
2149 scan_mrf_high = scan_mrf_low;
2152 if (mrf_low == scan_mrf_low ||
2153 mrf_low == scan_mrf_high ||
2154 mrf_high == scan_mrf_low ||
2155 mrf_high == scan_mrf_high) {
2160 if (scan_inst->mlen > 0) {
2161 /* Found a SEND instruction, which means that there are
2162 * live values in MRFs from base_mrf to base_mrf +
2163 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2166 if (mrf_low >= scan_inst->base_mrf &&
2167 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2170 if (mrf_high >= scan_inst->base_mrf &&
2171 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2179 live_intervals_valid = false;
2185 * Walks through basic blocks, looking for repeated MRF writes and
2186 * removing the later ones.
// Within a basic block, track the last MOV into each MRF and delete a
// later MOV that is equal to the tracked one (a duplicate write).
2189 fs_visitor::remove_duplicate_mrf_writes()
// One slot per hardware MRF; NULL means "no known last write".
2191 fs_inst *last_mrf_move[16];
2192 bool progress = false;
2194 /* Need to update the MRF tracking for compressed instructions. */
// 16-wide handling elided in this listing — presumably bails or adjusts.
2195 if (dispatch_width == 16)
2198 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200 foreach_list_safe(node, &this->instructions) {
2201 fs_inst *inst = (fs_inst *)node;
// Basic-block boundary: forget everything we know.
2203 if (inst->is_control_flow()) {
2204 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2207 if (inst->opcode == BRW_OPCODE_MOV &&
2208 inst->dst.file == MRF) {
2209 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
// Identical to the previous write of this MRF -> redundant.
2210 if (prev_inst && inst->equals(prev_inst)) {
2217 /* Clear out the last-write records for MRFs that were overwritten. */
2218 if (inst->dst.file == MRF) {
2219 last_mrf_move[inst->dst.reg] = NULL;
2222 if (inst->mlen > 0) {
2223 /* Found a SEND instruction, which will include two or fewer
2224 * implied MRF writes. We could do better here.
2226 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2227 last_mrf_move[inst->base_mrf + i] = NULL;
2231 /* Clear out any MRF move records whose sources got overwritten. */
2232 if (inst->dst.file == GRF) {
2233 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2234 if (last_mrf_move[i] &&
2235 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2236 last_mrf_move[i] = NULL;
// Record this MOV as the current content of its MRF (conditions on the
// source, partially elided here, restrict which MOVs are trackable).
2241 if (inst->opcode == BRW_OPCODE_MOV &&
2242 inst->dst.file == MRF &&
2243 inst->src[0].file == GRF &&
2245 last_mrf_move[inst->dst.reg] = inst;
2250 live_intervals_valid = false;
// Helper for the gen4 SEND dependency workarounds below: clear the
// needs-dependency flags (deps[], indexed relative to first_grf) for any
// GRF that `inst` reads, since a read resolves the outstanding write.
2256 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2257 int first_grf, int grf_len)
2259 bool inst_16wide = (dispatch_width > 8 &&
2260 !inst->force_uncompressed &&
2261 !inst->force_sechalf);
2263 /* Clear the flag for registers that actually got read (as expected). */
2264 for (int i = 0; i < 3; i++) {
// A source can be a virtual GRF or a fixed hardware GRF.
2266 if (inst->src[i].file == GRF) {
2267 grf = inst->src[i].reg;
2268 } else if (inst->src[i].file == FIXED_HW_REG &&
2269 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2270 grf = inst->src[i].fixed_hw_reg.nr;
2275 if (grf >= first_grf &&
2276 grf < first_grf + grf_len) {
2277 deps[grf - first_grf] = false;
// Presumably guarded by inst_16wide (guard line elided): a 16-wide read
// also covers the adjacent register — confirm in full source.
2279 deps[grf - first_grf + 1] = false;
2285 * Implements this workaround for the original 965:
2287 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2288 * check for post destination dependencies on this instruction, software
2289 * must ensure that there is no destination hazard for the case of ‘write
2290 * followed by a posted write’ shown in the following example.
2293 * 2. send r3.xy <rest of send instruction>
2296 * Due to no post-destination dependency check on the ‘send’, the above
2297 * code sequence could have two instructions (1 and 2) in flight at the
2298 * same time that both consider ‘r3’ as the target of their final writes.
// Gen4 workaround (see the errata quoted above): before a SEND whose
// destination overlaps an earlier in-flight write, insert dependency-
// resolving MOVs so the two writes cannot race on the same GRFs.
2301 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2303 int reg_size = dispatch_width / 8;
2304 int write_len = inst->regs_written * reg_size;
2305 int first_write_grf = inst->dst.reg;
// needs_dep[i] == true means GRF (first_write_grf + i) still has a
// potentially outstanding write to resolve.
2306 bool needs_dep[BRW_MAX_MRF];
2307 assert(write_len < (int)sizeof(needs_dep) - 1);
2309 memset(needs_dep, false, sizeof(needs_dep));
2310 memset(needs_dep, true, write_len);
// The SEND's own source reads resolve dependencies on those GRFs.
2312 clear_deps_for_inst_src(inst, dispatch_width,
2313 needs_dep, first_write_grf, write_len);
2315 /* Walk backwards looking for writes to registers we're writing which
2316 * aren't read since being written. If we hit the start of the program,
2317 * we assume that there are no outstanding dependencies on entry to the
2320 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2322 scan_inst = (fs_inst *)scan_inst->prev) {
2324 /* If we hit control flow, assume that there *are* outstanding
2325 * dependencies, and force their cleanup before our instruction.
2327 if (scan_inst->is_control_flow()) {
2328 for (int i = 0; i < write_len; i++) {
2330 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2336 bool scan_inst_16wide = (dispatch_width > 8 &&
2337 !scan_inst->force_uncompressed &&
2338 !scan_inst->force_sechalf);
2340 /* We insert our reads as late as possible on the assumption that any
2341 * instruction but a MOV that might have left us an outstanding
2342 * dependency has more latency than a MOV.
2344 if (scan_inst->dst.file == GRF) {
2345 for (int i = 0; i < scan_inst->regs_written; i++) {
2346 int reg = scan_inst->dst.reg + i * reg_size;
2348 if (reg >= first_write_grf &&
2349 reg < first_write_grf + write_len &&
2350 needs_dep[reg - first_write_grf]) {
2351 inst->insert_before(DEP_RESOLVE_MOV(reg));
2352 needs_dep[reg - first_write_grf] = false;
// A 16-wide write also covers the adjacent register.
2353 if (scan_inst_16wide)
2354 needs_dep[reg - first_write_grf + 1] = false;
2359 /* Clear the flag for registers that actually got read (as expected). */
2360 clear_deps_for_inst_src(scan_inst, dispatch_width,
2361 needs_dep, first_write_grf, write_len);
2363 /* Continue the loop only if we haven't resolved all the dependencies */
2365 for (i = 0; i < write_len; i++) {
2375 * Implements this workaround for the original 965:
2377 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2378 * used as a destination register until after it has been sourced by an
2379 * instruction with a different destination register.
// Gen4 workaround (see the errata quoted above): after a SEND, its
// destination GRFs may not be re-used as a destination until another
// instruction has sourced them; insert resolving MOVs where needed.
2382 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2384 int write_len = inst->regs_written * dispatch_width / 8;
2385 int first_write_grf = inst->dst.reg;
// needs_dep[i] == true means GRF (first_write_grf + i) has not yet been
// sourced since the SEND wrote it.
2386 bool needs_dep[BRW_MAX_MRF];
2387 assert(write_len < (int)sizeof(needs_dep) - 1);
2389 memset(needs_dep, false, sizeof(needs_dep));
2390 memset(needs_dep, true, write_len);
2391 /* Walk forwards looking for writes to registers we're writing which aren't
2392 * read before being written.
2394 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2395 !scan_inst->is_tail_sentinel();
2396 scan_inst = (fs_inst *)scan_inst->next) {
2397 /* If we hit control flow, force resolve all remaining dependencies. */
2398 if (scan_inst->is_control_flow()) {
2399 for (int i = 0; i < write_len; i++) {
2401 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2406 /* Clear the flag for registers that actually got read (as expected). */
2407 clear_deps_for_inst_src(scan_inst, dispatch_width,
2408 needs_dep, first_write_grf, write_len);
2410 /* We insert our reads as late as possible since they're reading the
2411 * result of a SEND, which has massive latency.
2413 if (scan_inst->dst.file == GRF &&
2414 scan_inst->dst.reg >= first_write_grf &&
2415 scan_inst->dst.reg < first_write_grf + write_len &&
2416 needs_dep[scan_inst->dst.reg - first_write_grf]) {
// About to re-destine an un-sourced SEND result: source it first.
2417 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2418 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2421 /* Continue the loop only if we haven't resolved all the dependencies */
2423 for (i = 0; i < write_len; i++) {
2431 /* If we hit the end of the program, resolve all remaining dependencies out
// The last instruction must be the EOT send, so inserting before it is
// still "before the end".
2434 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2435 assert(last_inst->eot);
2436 for (int i = 0; i < write_len; i++) {
2438 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
// Driver for the two gen4 SEND dependency workarounds above: applies
// both to every SEND-like instruction (mlen != 0) with a GRF destination.
// Only original gen4 needs this; G4x and later do not.
2443 fs_visitor::insert_gen4_send_dependency_workarounds()
2445 if (intel->gen != 4 || intel->is_g4x)
2448 /* Note that we're done with register allocation, so GRF fs_regs always
2449 * have a .reg_offset of 0.
// foreach_list_safe: the workaround passes insert MOVs while we iterate.
2452 foreach_list_safe(node, &this->instructions) {
2453 fs_inst *inst = (fs_inst *)node;
2455 if (inst->mlen != 0 && inst->dst.file == GRF) {
2456 insert_gen4_pre_send_dependency_workarounds(inst);
2457 insert_gen4_post_send_dependency_workarounds(inst);
2463 * Turns the generic expression-style uniform pull constant load instruction
2464 * into a hardware-specific series of instructions for loading a pull
2467 * The expression style allows the CSE pass before this to optimize out
2468 * repeated loads from the same offset, and gives the pre-register-allocation
2469 * scheduling full flexibility, while the conversion to native instructions
2470 * allows the post-register-allocation scheduler the best information
2473 * Note that execution masking for setting up pull constant loads is special:
2474 * the channels that need to be written are unrelated to the current execution
2475 * mask, since a later instruction will use one of the result channels as a
2476 * source operand for all 8 or 16 of its channels.
// Lower the generic FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD into the
// hardware-specific form: on gen7 a two-instruction sequence with a
// SIMD4x2 offset payload; on earlier gens an MRF-based message (see the
// comment block above for why lowering happens this late).
2479 fs_visitor::lower_uniform_pull_constant_loads()
2481 foreach_list(node, &this->instructions) {
2482 fs_inst *inst = (fs_inst *)node;
2484 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2487 if (intel->gen >= 7) {
2488 /* The offset arg before was a vec4-aligned byte offset. We need to
2489 * turn it into a dword offset.
2491 fs_reg const_offset_reg = inst->src[1];
2492 assert(const_offset_reg.file == IMM &&
2493 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2494 const_offset_reg.imm.u /= 4;
2495 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2497 /* This is actually going to be a MOV, but since only the first dword
2498 * is accessed, we have a special opcode to do just that one. Note
2499 * that this needs to be an operation that will be considered a def
2500 * by live variable analysis, or register allocation will explode.
2502 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2503 payload, const_offset_reg);
// Write all channels regardless of the current execution mask (see the
// execution-masking note in the comment block above this function).
2504 setup->force_writemask_all = true;
2506 setup->ir = inst->ir;
2507 setup->annotation = inst->annotation;
2508 inst->insert_before(setup);
2510 /* Similarly, this will only populate the first 4 channels of the
2511 * result register (since we only use smear values from 0-3), but we
2512 * don't tell the optimizer.
2514 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2515 inst->src[1] = payload;
// We inserted a new def, so liveness data is stale.
2517 this->live_intervals_valid = false;
2519 /* Before register allocation, we didn't tell the scheduler about the
2520 * MRF we use. We know it's safe to use this MRF because nothing
2521 * else does except for register spill/unspill, which generates and
2522 * uses its MRF within a single IR instruction.
2524 inst->base_mrf = 14;
// Debug helper: print one fs_inst in human-readable form — predicate,
// opcode, conditional modifier, destination, then up to three sources
// with their modifiers/types.
2531 fs_visitor::dump_instruction(fs_inst *inst)
2533 if (inst->predicate) {
// e.g. "(+f0.0) " or "(-f0.0) " for inverted predicates.
2534 printf("(%cf0.%d) ",
2535 inst->predicate_inverse ? '-' : '+',
2539 printf("%s", brw_instruction_name(inst->opcode));
2542 if (inst->conditional_mod) {
// Print the flag register unless it's implied by a predicated SEL/IF/
// WHILE on gen5+ (where the conditional mod rides along implicitly).
2544 if (!inst->predicate &&
2545 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2546 inst->opcode != BRW_OPCODE_IF &&
2547 inst->opcode != BRW_OPCODE_WHILE))) {
2548 printf(".f0.%d\n", inst->flag_subreg);
// Destination: vgrfN[+offset], mN for MRF, ***uN*** flags an invalid
// uniform destination.
2554 switch (inst->dst.file) {
2556 printf("vgrf%d", inst->dst.reg);
2557 if (inst->dst.reg_offset)
2558 printf("+%d", inst->dst.reg_offset);
2561 printf("m%d", inst->dst.reg);
2567 printf("***u%d***", inst->dst.reg);
2575 for (int i = 0; i < 3; i++) {
2576 if (inst->src[i].negate)
2578 if (inst->src[i].abs)
2580 switch (inst->src[i].file) {
2582 printf("vgrf%d", inst->src[i].reg);
2583 if (inst->src[i].reg_offset)
2584 printf("+%d", inst->src[i].reg_offset);
// An MRF source is illegal; flag it loudly.
2587 printf("***m%d***", inst->src[i].reg);
2590 printf("u%d", inst->src[i].reg);
2591 if (inst->src[i].reg_offset)
2592 printf(".%d", inst->src[i].reg_offset);
// Immediates print with a type suffix: f / d / u.
2598 switch (inst->src[i].type) {
2599 case BRW_REGISTER_TYPE_F:
2600 printf("%ff", inst->src[i].imm.f);
2602 case BRW_REGISTER_TYPE_D:
2603 printf("%dd", inst->src[i].imm.i);
2605 case BRW_REGISTER_TYPE_UD:
2606 printf("%uu", inst->src[i].imm.u);
2617 if (inst->src[i].abs)
// Trailing markers for half-execution instructions (elided print bodies).
2626 if (inst->force_uncompressed)
2629 if (inst->force_sechalf)
// Debug helper: print the whole instruction stream, one numbered line
// per instruction, via dump_instruction().
2636 fs_visitor::dump_instructions()
2639 foreach_list(node, &this->instructions) {
2640 fs_inst *inst = (fs_inst *)node;
2641 printf("%d: ", ip++);
2642 dump_instruction(inst);
2647 * Possibly returns an instruction that set up @param reg.
2649 * Sometimes we want to take the result of some expression/variable
2650 * dereference tree and rewrite the instruction generating the result
2651 * of the tree. When processing the tree, we know that the
2652 * instructions generated are all writing temporaries that are dead
2653 * outside of this tree. So, if we have some instructions that write
2654 * a temporary, we're free to point that temp write somewhere else.
2656 * Note that this doesn't guarantee that the instruction generated
2657 * only reg -- it might be the size=4 destination of a texture instruction.
/* See the block comment above: walk back from the end of the range looking
 * for the instruction whose dst is exactly @reg.  Visible here is the
 * bail-out condition: give up (presumably returning NULL — the return is
 * missing from this listing) when the candidate writes only part of a
 * register pair (force_uncompressed/force_sechalf) or its dst is not @reg.
 */
2660 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2666 end->force_uncompressed ||
2667 end->force_sechalf ||
2669 !reg.equals(end->dst)) {
/* Lay out the gen6+ fragment shader thread payload: walk the fixed register
 * map (documented inline below) and record, in c, where each enabled input
 * lands, accumulating the total in c->nr_payload_regs.
 *
 * NOTE(review): decimated listing — several `if (...)` headers and closing
 * braces between the numbered lines are missing.
 */
2677 fs_visitor::setup_payload_gen6()
2679 struct intel_context *intel = &brw->intel;
/* uses_depth (declaration line missing): true when the shader reads
 * gl_FragCoord / the POS slot.
 */
2681 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2682 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
/* This layout is gen6+ only; gen4/5 use setup_payload_gen4(). */
2684 assert(intel->gen >= 6);
2686 /* R0-1: masks, pixel X/Y coordinates. */
2687 c->nr_payload_regs = 2;
2688 /* R2: only for 32-pixel dispatch.*/
2690 /* R3-26: barycentric interpolation coordinates. These appear in the
2691 * same order that they appear in the brw_wm_barycentric_interp_mode
2692 * enum. Each set of coordinates occupies 2 registers if dispatch width
2693 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2694 * appear if they were enabled using the "Barycentric Interpolation
2695 * Mode" bits in WM_STATE.
2697 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2698 if (barycentric_interp_modes & (1 << i)) {
/* Record where this barycentric mode's coordinates start. */
2699 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2700 c->nr_payload_regs += 2;
/* 16-wide needs a second pair of registers per coordinate set. */
2701 if (dispatch_width == 16) {
2702 c->nr_payload_regs += 2;
2707 /* R27: interpolated depth if uses source depth */
/* (guarding `if (uses_depth)` missing from this listing) */
2709 c->source_depth_reg = c->nr_payload_regs;
2710 c->nr_payload_regs++;
2711 if (dispatch_width == 16) {
2712 /* R28: interpolated depth if not 8-wide. */
2713 c->nr_payload_regs++;
2716 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2718 c->source_w_reg = c->nr_payload_regs;
2719 c->nr_payload_regs++;
2720 if (dispatch_width == 16) {
2721 /* R30: interpolated W if not 8-wide. */
2722 c->nr_payload_regs++;
2725 /* R31: MSAA position offsets. */
2726 /* R32-: bary for 32-pixel. */
2727 /* R58-59: interp W for 32-pixel. */
/* Writing gl_FragDepth means depth must be fed to the render target. */
2729 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2730 c->source_depth_to_render_target = true;
/* Body of the main compile driver (presumably fs_visitor::run() — the
 * function header line is missing from this decimated listing): payload
 * setup, IR visit, the optimization loop, register allocation, and final
 * scheduling.
 */
/* Snapshot the parameter count so we can verify below that compilation
 * appended no uniforms behind the uniform-storage association's back.
 */
2737 sanity_param_count = fp->Base.Parameters->NumParameters;
2738 uint32_t orig_nr_params = c->prog_data.nr_params;
/* Payload layout is generation-specific. */
2740 if (intel->gen >= 6)
2741 setup_payload_gen6();
2743 setup_payload_gen4();
/* Optional shader-time instrumentation must bracket everything emitted. */
2748 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2749 emit_shader_time_begin();
2751 calculate_urb_setup();
2753 emit_interpolation_setup_gen4();
2755 emit_interpolation_setup_gen6();
2757 /* We handle discards by keeping track of the still-live pixels in f0.1.
2758 * Initialize it with the dispatched pixels.
2761 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2762 discard_init->flag_subreg = 1;
2765 /* Generate FS IR for main(). (the visitor only descends into
2766 * functions called "main").
2769 foreach_list(node, &*shader->ir) {
2770 ir_instruction *ir = (ir_instruction *)node;
/* Each top-level IR instruction starts with no pending result. */
2772 this->result = reg_undef;
/* Fixed-function/ARB-program path (the guarding else is missing here). */
2776 emit_fragment_program_code();
/* Placeholder resolved later once the final discard behavior is known. */
2782 emit(FS_OPCODE_PLACEHOLDER_HALT);
2786 split_virtual_grfs();
2788 move_uniform_array_access_to_pull_constants();
2789 setup_pull_constants();
/* Optimization loop: iterate the passes to a fixed point; each pass ORs
 * into `progress` (the surrounding do/while is missing from this listing).
 */
2795 compact_virtual_grfs();
2797 progress = remove_duplicate_mrf_writes() || progress;
2799 progress = opt_algebraic() || progress;
2800 progress = opt_cse() || progress;
2801 progress = opt_copy_propagate() || progress;
2802 progress = dead_code_eliminate() || progress;
2803 progress = register_coalesce() || progress;
2804 progress = register_coalesce_2() || progress;
2805 progress = compute_to_mrf() || progress;
2808 remove_dead_constants();
/* Pre-register-allocation scheduling (false = not post-RA). */
2810 schedule_instructions(false);
2812 lower_uniform_pull_constant_loads();
2814 assign_curb_setup();
2818 /* Debug of register spilling: Go spill everything. */
2819 for (int i = 0; i < virtual_grf_count; i++) {
/* Trivial allocation for debug; otherwise retry real allocation, which
 * presumably spills on failure until it succeeds.
 */
2825 assign_regs_trivial();
2827 while (!assign_regs()) {
/* Every force_uncompressed/sechalf push must have been popped. */
2833 assert(force_uncompressed_stack == 0);
2834 assert(force_sechalf_stack == 0);
2836 /* This must come after all optimization and register allocation, since
2837 * it inserts dead code that happens to have side effects, and it does
2838 * so based on the actual physical registers in use.
2840 insert_gen4_send_dependency_workarounds();
/* Post-register-allocation scheduling pass. */
2845 schedule_instructions(true);
/* Report register usage per dispatch width for state-upload. */
2847 if (dispatch_width == 8) {
2848 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2850 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2852 /* Make sure we didn't try to sneak in an extra uniform */
2853 assert(orig_nr_params == c->prog_data.nr_params);
2854 (void) orig_nr_params;
2857 /* If any state parameters were appended, then ParameterValues could have
2858 * been realloced, in which case the driver uniform storage set up by
2859 * _mesa_associate_uniform_storage() would point to freed memory. Make
2860 * sure that didn't happen.
2862 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
/* Top-level FS compile entry point: run the fs_visitor at 8-wide (and
 * opportunistically 16-wide), then generate native code.  Returns the
 * assembly (return statements are missing from this decimated listing)
 * and its size through final_assembly_size.
 */
2868 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2869 struct gl_fragment_program *fp,
2870 struct gl_shader_program *prog,
2871 unsigned *final_assembly_size)
2873 struct intel_context *intel = &brw->intel;
2874 bool start_busy = false;
2875 float start_time = 0;
/* For perf_debug: note whether the GPU was busy when we started, so we can
 * report compiles that stalled it.
 */
2877 if (unlikely(intel->perf_debug)) {
2878 start_busy = (intel->batch.last_bo &&
2879 drm_intel_bo_busy(intel->batch.last_bo));
2880 start_time = get_time();
/* NULL when this is an ARB/fixed-function program rather than GLSL. */
2883 struct brw_shader *shader = NULL;
2885 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
/* INTEL_DEBUG=wm: dump the incoming IR, GLSL or ARB as appropriate. */
2887 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2889 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2890 _mesa_print_ir(shader->ir, NULL);
2893 printf("ARB_fragment_program %d ir for native fragment shader\n",
2895 _mesa_print_program(&fp->Base);
2899 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2901 fs_visitor v(brw, c, prog, fp, 8);
/* 8-wide failure path: a GLSL compile failure is the app's problem
 * (LinkStatus + InfoLog); otherwise it is a driver bug.
 */
2903 prog->LinkStatus = false;
2904 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2906 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
/* Optionally also compile 16-wide; only attempted on gen5+, with no pull
 * parameters, and unless disabled via INTEL_DEBUG=no16.  A 16-wide failure
 * is non-fatal — we just fall back to the 8-wide program.
 */
2912 exec_list *simd16_instructions = NULL;
2913 fs_visitor v2(brw, c, prog, fp, 16);
2914 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2915 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
/* Reuse the 8-wide compile's uniform layout so both share prog_data. */
2916 v2.import_uniforms(&v);
2918 perf_debug("16-wide shader failed to compile, falling back to "
2919 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2921 simd16_instructions = &v2.instructions;
2925 c->prog_data.dispatch_width = 8;
/* Generate native code from the FS IR (both widths, when available). */
2927 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2928 const unsigned *generated = g.generate_assembly(&v.instructions,
2929 simd16_instructions,
2930 final_assembly_size);
/* Recompile diagnostics and stall reporting for perf_debug builds. */
2932 if (unlikely(intel->perf_debug) && shader) {
2933 if (shader->compiled_once)
2934 brw_wm_debug_recompile(brw, prog, &c->key);
2935 shader->compiled_once = true;
2937 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2938 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2939 (get_time() - start_time) * 1000);
/* Precompile the fragment shader at link time by synthesizing a plausible
 * brw_wm_prog_key (guessing the state a draw will use), so the first real
 * draw does not stall on compilation.  NOTE(review): the function continues
 * past the end of this decimated listing (its return and closing brace are
 * not visible here).
 */
2947 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2949 struct brw_context *brw = brw_context(ctx);
2950 struct intel_context *intel = &brw->intel;
2951 struct brw_wm_prog_key key;
/* Nothing to do if the program has no fragment stage. */
2953 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2956 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2957 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2958 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2959 bool program_uses_dfdy = fp->UsesDFdy;
2961 memset(&key, 0, sizeof(key));
/* Gen4/5 bake depth/stencil ("iz") state into the program key. */
2963 if (intel->gen < 6) {
2965 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2967 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2968 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2970 /* Just assume depth testing. */
2971 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2972 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
/* Name != 0 means a GLSL program: assume every attribute needs projection
 * (all mask bits set); fixed-function (Name == 0) sets bits per-input
 * below instead.
 */
2975 if (prog->Name != 0)
2976 key.proj_attrib_mask = ~(GLbitfield64) 0;
2978 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2979 * avoid unnecessary recompiles, always set it to 1.
2981 key.proj_attrib_mask |= VARYING_BIT_POS;
2985 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
/* Per-input guesses, for each slot the shader actually reads. */
2987 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2988 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2991 if (prog->Name == 0)
2992 key.proj_attrib_mask |= BITFIELD64_BIT(i);
2994 if (intel->gen < 6) {
2995 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2996 key.input_slots_valid |= BITFIELD64_BIT(i);
/* (guarding condition missing from this listing — presumably the
 * ARB_color_buffer_float clamp check) */
3000 key.clamp_fragment_color = true;
/* Swizzle guesses: shadow samplers assume the default DEPTH_TEXTURE_MODE
 * (X, X, X, 1); color samplers assume identity.
 */
3002 for (int i = 0; i < MAX_SAMPLERS; i++) {
3003 if (fp->Base.ShadowSamplers & (1 << i)) {
3004 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3005 key.tex.swizzles[i] =
3006 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3008 /* Color sampler: assume no swizzling. */
3009 key.tex.swizzles[i] = SWIZZLE_XYZW;
/* gl_FragCoord and dFdy depend on window orientation/height. */
3013 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3014 key.drawable_height = ctx->DrawBuffer->Height;
3017 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3018 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3021 key.nr_color_regions = 1;
3023 key.program_string_id = bfp->id;
/* do_wm_prog() clobbers the current program binding; save and restore it
 * so the precompile is side-effect free.
 */
3025 uint32_t old_prog_offset = brw->wm.prog_offset;
3026 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3028 bool success = do_wm_prog(brw, prog, bfp, &key);
3030 brw->wm.prog_offset = old_prog_offset;
3031 brw->wm.prog_data = old_prog_data;