/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */
#include <sys/types.h>

#include "util/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"
#include "program/sampler.h"
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset(this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (unsigned i = 0; i < sources; ++i) {
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }

         if (this->exec_size == 0 && dst.file != BAD_FILE)
            this->exec_size = dst.width;
      }
      assert(this->exec_size != 0);
   }

   for (unsigned i = 0; i < sources; ++i) {
      switch (this->src[i].file) {
      case BAD_FILE:
         this->src[i].effective_width = 8;
         break;
      case GRF:
      case HW_REG:
      case ATTR:
         assert(this->src[i].width > 0);
         if (this->src[i].width == 1) {
            this->src[i].effective_width = this->exec_size;
         } else {
            this->src[i].effective_width = this->src[i].width;
         }
         break;
      case IMM:
      case UNIFORM:
         this->src[i].effective_width = this->exec_size;
         break;
      default:
         unreachable("Invalid source register file");
      }
   }
   this->dst.effective_width = this->exec_size;

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      this->regs_written =
         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}
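
/* Worked example (informal, not from the original source): constructing an
 * instruction with exec_size == 0, a null HW_REG destination and a width-16
 * GRF source infers exec_size == 16 from that source, since GRF/ATTR source
 * widths are consulted before falling back to the destination width.
 */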
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   init(opcode, 0, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, 0, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, 0, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, 0, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, 0, dst, src, sources);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}
void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(enum brw_predicate predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
   inst->predicate = predicate;
   return inst;
}

/** Gen6 IF with embedded comparison. */
fs_inst *
fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
               enum brw_conditional_mod condition)
{
   assert(brw->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                enum brw_conditional_mod condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;
   if (dst.file == HW_REG)
      dst.fixed_hw_reg.type = dst.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
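
/* A usage sketch (informal, not from the original source): lowering the GLSL
 * expression "r = (a < b) ? 1.0 : 0.0" could look roughly like this, where
 * a, b and result are hypothetical fs_regs:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_L));
 *    fs_inst *sel = emit(SEL(result, fs_reg(1.0f), fs_reg(0.0f)));
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * The CMP packs the per-channel results into the flag register, which the
 * predicated SEL then uses to choose between its sources.
 */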
fs_inst *
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
{
   uint8_t exec_size = dst.width;
   for (int i = 0; i < sources; ++i) {
      assert(src[i].width % dst.width == 0);
      if (src[i].width > exec_size)
         exec_size = src[i].width;
   }

   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
                                        dst, src, sources);
   inst->regs_written = 0;
   for (int i = 0; i < sources; ++i) {
      /* The LOAD_PAYLOAD instruction only really makes sense if we are
       * dealing with whole registers.  If this ever changes, we can deal
       * with it later.
       */
      int size = inst->src[i].effective_width * type_sz(src[i].type);
      assert(size % 32 == 0);
      inst->regs_written += (size + 31) / 32;
   }

   return inst;
}
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, fs_reg(const_offset & ~3)));

   int scale = 1;
   if (brw->gen == 4 && dst.width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   assert(dst.width % 8 == 0);
   int regs_written = 4 * (dst.width / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                               dst.type, dst.width);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
   instructions.push_tail(MOV(dst, result));

   return instructions;
}
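
/* Worked example (informal, not from the original source): for
 * "uniform vec4 a[20];" accessed at a varying index i with const_offset == 5,
 * (5 & ~3) == 4 is folded into vec4_offset, and (5 & 3) == 1 selects the
 * second of the four contiguously loaded components via the final offset()
 * of vec4_result.
 */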
/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->exec_size = 8;

   return inst;
}
bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           exec_size == inst->exec_size &&
           offset == inst->offset);
}
bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}
bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
}
bool
fs_inst::can_do_source_mods(struct brw_context *brw)
{
   if (brw->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
   this->width = 1;
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
   this->width = 1 << fixed_hw_reg.width;
}
bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
           width == r.width &&
           stride == r.stride);
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_DOUBLE:
      unreachable("not reached");
   }

   return 0;
}
fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   mov->force_writemask_all = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   return dst;
}
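
/* A quick check of the rollover claim above (informal): the low 32 bits of a
 * ~1.2 GHz counter wrap after 2^32 / 1.2e9 ≈ 3.6 seconds, hence "~3 seconds".
 */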
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   switch (stage) {
   case MESA_SHADER_VERTEX:
      type = ST_VS;
      written_type = ST_VS_WRITTEN;
      reset_type = ST_VS_RESET;
      break;
   case MESA_SHADER_GEOMETRY:
      type = ST_GS;
      written_type = ST_GS_WRITTEN;
      reset_type = ST_GS_RESET;
      break;
   case MESA_SHADER_FRAGMENT:
      if (dispatch_width == 8) {
         type = ST_FS8;
         written_type = ST_FS8_WRITTEN;
         reset_type = ST_FS8_RESET;
      } else {
         assert(dispatch_width == 16);
         type = ST_FS16;
         written_type = ST_FS16_WRITTEN;
         reset_type = ST_FS16_RESET;
      }
      break;
   default:
      unreachable("fs_visitor::emit_shader_time_end missing code");
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}
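
/* The emitted sequence is roughly (informal pseudo-asm, not from the
 * original source):
 *
 *    and.z.f0  null  end.2  1       ; did a timestamp reset occur?
 *    (+f0) if
 *       add  diff  -start  end
 *       add  diff  diff    -2       ; drop the 2-cycle read overhead
 *       ... accumulate diff and the "written" flag ...
 *    else
 *       ... accumulate the "reset" flag ...
 *    endif
 */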
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                             fs_reg(), payload, offset, value));
}
void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}
/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}
/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->dst.width * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
}
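
/* Worked example (informal, not from the original source): a 4-wide float
 * destination covers 4 * type_sz(F) == 16 bytes, less than a full 32-byte
 * GRF, so the write is partial and cannot screen off an earlier definition
 * of the register.
 */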
int
fs_inst::regs_read(int arg) const
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      return mlen;
   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
      return mlen;
   }

   switch (src[arg].file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case HW_REG:
      if (src[arg].stride == 0) {
         return 1;
      } else {
         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
         return (size + 31) / 32;
      }
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
   }
}
bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return 0;
   default:
      unreachable("not reached");
   }
}
fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
                 brw_type_for_base_type(type), dispatch_width);
}

fs_reg
fs_visitor::vgrf(int num_components)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
                 BRW_REGISTER_TYPE_F, dispatch_width);
}
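
/* Worked example (informal, not from the original source): at
 * dispatch_width == 16, reg_width == 2, so vgrf(glsl_type::vec4_type)
 * allocates type_size(vec4) * 2 == 8 GRFs -- one two-register SIMD16 value
 * per component.
 */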
/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
               uint8_t width)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->width = width;
}
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->get_state_slots();
   assert(slots != NULL);

   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}
fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                         bool origin_upper_left)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
   fs_reg wpos = *reg;
   bool flip = !origin_upper_left ^ key->render_to_fbo;

   /* gl_FragCoord.x */
   if (pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.y */
   if (!flip && pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += key->drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (brw->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  fs_inst *inst;
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (brw->has_pln)
                     inst->no_dd_clear = true;

                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (brw->has_pln)
                     inst->no_dd_check = true;
               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr = offset(attr, 1);
            }
         }
         location++;
      }
   }
}
fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (brw->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      emit(ASR(*reg, g0, fs_reg(15)));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      emit(ASR(*reg, g1_6, fs_reg(31)));
   }

   return reg;
}
void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      emit(MOV(dst, int_sample_pos));
      /* Scale to the range [0, 1] */
      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
   } else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
       */
      emit(MOV(dst, fs_reg(0.5f)));
   }
}
fs_reg *
fs_visitor::emit_samplepos_setup()
{
   assert(brw->gen >= 6);

   this->current_annotation = "compute sample position";
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   fs_reg pos = *reg;
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
   fs_reg int_sample_y = vgrf(glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in the thread payload. So,
    * read the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   if (dispatch_width == 8) {
      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
   } else {
      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
         ->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos = offset(pos, 1);
   if (dispatch_width == 8) {
      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
   } else {
      emit(MOV(half(int_sample_y, 0),
               fs_reg(suboffset(sample_pos_reg, 1))));
      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
         ->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);

   return reg;
}
fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(brw->gen >= 6);

   this->current_annotation = "compute sample id";
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));

   if (key->compute_sample_id) {
      fs_reg t1 = vgrf(glsl_type::int_type);
      fs_reg t2 = vgrf(glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */
      fs_inst *inst;
      inst = emit(BRW_OPCODE_AND, t1,
                  fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                  fs_reg(0xc0));
      inst->force_writemask_all = true;
      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
      inst->force_writemask_all = true;
      /* This works for both SIMD8 and SIMD16 */
      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
      inst->force_writemask_all = true;
      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   } else {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
   }

   return reg;
}
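
/* Worked example (informal, not from the original source): with 8x MSAA, if
 * R0.0 bits 7:6 read 0b10, then t1 = (0x80 & 0xc0) >> 5 == 4, and adding the
 * replicated t2 sequence (0, 0, 0, 0, 1, 1, 1, 1) yields SIMD8 sample IDs
 * (4, 4, 4, 4, 5, 5, 5, 5) across the two subspans.
 */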
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = vgrf(glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      unreachable("not reached: bad math opcode");
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 || brw->gen == 7)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   if (brw->gen >= 8) {
      inst = emit(opcode, dst, src0, src1);
   } else if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      prog_data->dispatch_grf_start_reg = payload.num_regs;
   } else {
      assert(stage == MESA_SHADER_FRAGMENT);
      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
      prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
   }

   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
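
/* Worked example (informal, not from the original source): with
 * payload.num_regs == 2 and push_constant_loc[u] == 10, uniform u maps to
 * g3.2, since each CURB register holds eight dword constants:
 * 2 + 10 / 8 == 3, and 10 % 8 == 2.
 */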
void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(prog->InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             key->input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}
void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + prog_data->num_varying_inputs * 2;
}
void
fs_visitor::assign_vs_urb_setup()
{
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
   int grf, count, slot, channel, attr;

   assert(stage == MESA_SHADER_VERTEX);
   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
      count++;

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf =
      payload.num_regs + prog_data->curb_read_length + count * 4;

   unsigned vue_entries =
      MAX2(count, vs_prog_data->base.vue_map.num_slots);

   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
   vs_prog_data->base.urb_read_length = (count + 1) / 2;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {

            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
               slot = count - 1;
            } else {
               /* Attributes come in in a contiguous block, ordered by their
                * gl_vert_attrib value.  That means we can compute the slot
                * number for an attribute by masking out the enabled
                * attributes before it and counting the bits.
                */
               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
                                        BITFIELD64_MASK(attr));
            }

            channel = inst->src[i].reg_offset & 3;

            grf = payload.num_regs +
               prog_data->curb_read_length +
               slot * 4 + channel;

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg =
               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
         }
      }
   }
}
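
/* Worked example (informal, not from the original source): if inputs_read
 * has only bits 0 and 3 set, the attribute for gl_vert_attrib 3 occupies
 * slot _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(3)) == 1, i.e. the
 * second 4-GRF block after the push constants.
 */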
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool split_points[reg_count];
   memset(split_points, 0, sizeof(split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
      split_points[vgrf_to_reg[vgrf] + 1] = false;
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         for (int j = 1; j < inst->regs_written; j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            for (int j = 1; j < inst->regs_read(i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int new_virtual_grf[reg_count];
   int new_reg_offset[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* Gather all of the split points */
      new_reg_offset[reg] = 0;
      reg++;

      int offset = 1;
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         inst->dst.reg = new_virtual_grf[reg];
         inst->dst.reg_offset = new_reg_offset[reg];
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            inst->src[i].reg = new_virtual_grf[reg];
            inst->src[i].reg_offset = new_reg_offset[reg];
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_live_intervals();
}
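
/* Worked example (informal, not from the original source): a 4-reg VGRF
 * written by two 2-reg SIMD16 MOVs at reg_offset 0 and 2 has split points 1
 * and 3 cleared by those writes, leaving slot 2 splittable, so the VGRF is
 * split into two 2-reg VGRFs.
 */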
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int remap_table[this->alloc.count];
   memset(remap_table, -1, sizeof(remap_table));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to delta_x/delta_y, since they're used in
    * register allocation.  If they're unused, switch them to BAD_FILE so
    * we don't think some random VGRF is delta_x/delta_y.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
      if (delta_x[i].file == GRF) {
         if (remap_table[delta_x[i].reg] != -1) {
            delta_x[i].reg = remap_table[delta_x[i].reg];
         } else {
            delta_x[i].file = BAD_FILE;
         }
      }
   }
   for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
      if (delta_y[i].file == GRF) {
         if (remap_table[delta_y[i].reg] != -1) {
            delta_y[i].reg = remap_table[delta_y[i].reg];
         } else {
            delta_y[i].file = BAD_FILE;
         }
      }
   }

   return progress;
}
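
/* Worked example (informal, not from the original source): with
 * alloc.count == 4 and only VGRF 1 unused, remap_table becomes
 * {0, -1, 1, 2}; VGRFs 2 and 3 slide down and alloc.count drops to 3.
 */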
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   if (dispatch_width != 8)
      return;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      for (int i = 0 ; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const gl_constant_value **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}
2157 * Assign UNIFORM file registers to either push constants or pull constants.
2159 * We allow a fragment shader to have more than the specified minimum
2160 * maximum number of fragment shader uniform components (64). If
2161 * there are too many of these, they'd fill up all of register space.
2162 * So, this will push some of them out to the pull constant buffer and
2163 * update the program to load them.
2166 fs_visitor::assign_constant_locations()
2168 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2169 if (dispatch_width != 8)
2172 /* Find which UNIFORM registers are still in use. */
2173 bool is_live[uniforms];
2174 for (unsigned int i = 0; i < uniforms; i++) {
2178 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2179 for (int i = 0; i < inst->sources; i++) {
2180 if (inst->src[i].file != UNIFORM)
2183 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2185 is_live[constant_nr] = true;
2189 /* Only allow 16 registers (128 uniform components) as push constants.
2191 * Just demote the end of the list. We could probably do better
2192 * here, demoting things that are rarely used in the program first.
2194 * If changing this value, note the limitation about total_regs in
2197 unsigned int max_push_components = 16 * 8;
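   /* One GRF holds 32 bytes, i.e. 8 floats, so the 16 registers above
    * provide the 128 scalar uniform components mentioned in the comment.
    */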
2198 unsigned int num_push_constants = 0;
2200 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2202 for (unsigned int i = 0; i < uniforms; i++) {
2203 if (!is_live[i] || pull_constant_loc[i] != -1) {
2204 /* This UNIFORM register is either dead, or has already been demoted
2205 * to a pull const. Mark it as no longer living in the param[] array.
2207 push_constant_loc[i] = -1;
2211 if (num_push_constants < max_push_components) {
2212 /* Retain as a push constant. Record the location in the params[]
2215 push_constant_loc[i] = num_push_constants++;
2217 /* Demote to a pull constant. */
2218 push_constant_loc[i] = -1;
2220 int pull_index = stage_prog_data->nr_pull_params++;
2221 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2222 pull_constant_loc[i] = pull_index;
2226 stage_prog_data->nr_params = num_push_constants;
2228 /* Up until now, the param[] array has been indexed by reg + reg_offset
2229 * of UNIFORM registers. Condense it to only contain the uniforms we
2230 * chose to upload as push constants.
2232 for (unsigned int i = 0; i < uniforms; i++) {
2233 int remapped = push_constant_loc[i];
      if (remapped == -1)
         continue;

      assert(remapped <= (int)i);
2239 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2244 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2245 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
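 *
 * A rough before/after sketch (IR spelling approximate, registers
 * invented for illustration):
 *
 *    add vgrf7, u12, vgrf3
 *
 * becomes, once u12 has been demoted to pull constant slot N:
 *
 *    uniform_pull_constant_load vgrf9, surf_index, (N * 4) & ~15
 *    add vgrf7, vgrf9.<smear N % 4>, vgrf3
 *
 * where the smear picks the one dword of interest out of the loaded vec4.
 */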
2248 fs_visitor::demote_pull_constants()
2250 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2251 for (int i = 0; i < inst->sources; i++) {
2252 if (inst->src[i].file != UNIFORM)
2255 int pull_index = pull_constant_loc[inst->src[i].reg +
2256 inst->src[i].reg_offset];
2257 if (pull_index == -1)
         /* Set up the annotation tracking for newly generated instructions. */
2262 current_annotation = inst->annotation;
2264 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2265 fs_reg dst = vgrf(glsl_type::float_type);
2267 /* Generate a pull load into dst. */
2268 if (inst->src[i].reladdr) {
2269 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
2273 inst->insert_before(block, &list);
2274 inst->src[i].reladdr = NULL;
2276 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2279 dst, surf_index, offset);
2280 inst->insert_before(block, pull);
2281 inst->src[i].set_smear(pull_index & 3);
2284 /* Rewrite the instruction to use the temporary VGRF. */
2285 inst->src[i].file = GRF;
2286 inst->src[i].reg = dst.reg;
2287 inst->src[i].reg_offset = 0;
2288 inst->src[i].width = dispatch_width;
2291 invalidate_live_intervals();
2295 fs_visitor::opt_algebraic()
2297 bool progress = false;
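   /* A quick map of the rewrites applied below (a reading aid, not an
    * exhaustive list): a saturating MOV of an immediate folds the
    * saturate into the immediate; x * 1 => x; x * -1 => -x; x * 0 => 0;
    * x + 0 => x; immediate * immediate and immediate + immediate fold;
    * x | x => x; LRP or SEL with equal sources => MOV; CMP.GE of -|x|
    * against 0 => CMP.Z of x; MAD with a zero multiplicand => MOV, with
    * a one multiplicand => ADD; RCP of a preceding SQRT's result => RSQ.
    */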
2299 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2300 switch (inst->opcode) {
2301 case BRW_OPCODE_MOV:
2302 if (inst->src[0].file != IMM)
2305 if (inst->saturate) {
2306 if (inst->dst.type != inst->src[0].type)
2307 assert(!"unimplemented: saturate mixed types");
2309 if (brw_saturate_immediate(inst->dst.type,
2310 &inst->src[0].fixed_hw_reg)) {
2311 inst->saturate = false;
2317 case BRW_OPCODE_MUL:
2318 if (inst->src[1].file != IMM)
2322 if (inst->src[1].is_one()) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2330 if (inst->src[1].is_negative_one()) {
2331 inst->opcode = BRW_OPCODE_MOV;
2332 inst->src[0].negate = !inst->src[0].negate;
2333 inst->src[1] = reg_undef;
2339 if (inst->src[1].is_zero()) {
2340 inst->opcode = BRW_OPCODE_MOV;
2341 inst->src[0] = inst->src[1];
2342 inst->src[1] = reg_undef;
2347 if (inst->src[0].file == IMM) {
2348 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2351 inst->src[1] = reg_undef;
2356 case BRW_OPCODE_ADD:
2357 if (inst->src[1].file != IMM)
2361 if (inst->src[1].is_zero()) {
2362 inst->opcode = BRW_OPCODE_MOV;
2363 inst->src[1] = reg_undef;
2368 if (inst->src[0].file == IMM) {
2369 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2372 inst->src[1] = reg_undef;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2385 case BRW_OPCODE_LRP:
2386 if (inst->src[1].equals(inst->src[2])) {
2387 inst->opcode = BRW_OPCODE_MOV;
2388 inst->src[0] = inst->src[1];
2389 inst->src[1] = reg_undef;
2390 inst->src[2] = reg_undef;
2395 case BRW_OPCODE_CMP:
2396 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2398 inst->src[0].negate &&
2399 inst->src[1].is_zero()) {
2400 inst->src[0].abs = false;
2401 inst->src[0].negate = false;
2402 inst->conditional_mod = BRW_CONDITIONAL_Z;
2407 case BRW_OPCODE_SEL:
2408 if (inst->src[0].equals(inst->src[1])) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->predicate = BRW_PREDICATE_NONE;
2412 inst->predicate_inverse = false;
2414 } else if (inst->saturate && inst->src[1].file == IMM) {
2415 switch (inst->conditional_mod) {
2416 case BRW_CONDITIONAL_LE:
2417 case BRW_CONDITIONAL_L:
2418 switch (inst->src[1].type) {
2419 case BRW_REGISTER_TYPE_F:
2420 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2431 case BRW_CONDITIONAL_GE:
2432 case BRW_CONDITIONAL_G:
2433 switch (inst->src[1].type) {
2434 case BRW_REGISTER_TYPE_F:
2435 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[1] = reg_undef;
2438 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2450 case BRW_OPCODE_MAD:
2451 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[1] = reg_undef;
2454 inst->src[2] = reg_undef;
2456 } else if (inst->src[0].is_zero()) {
2457 inst->opcode = BRW_OPCODE_MUL;
2458 inst->src[0] = inst->src[2];
2459 inst->src[2] = reg_undef;
2460 } else if (inst->src[1].is_one()) {
2461 inst->opcode = BRW_OPCODE_ADD;
2462 inst->src[1] = inst->src[2];
2463 inst->src[2] = reg_undef;
2465 } else if (inst->src[2].is_one()) {
2466 inst->opcode = BRW_OPCODE_ADD;
2467 inst->src[2] = reg_undef;
2469 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2470 inst->opcode = BRW_OPCODE_ADD;
2471 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2472 inst->src[2] = reg_undef;
2476 case SHADER_OPCODE_RCP: {
2477 fs_inst *prev = (fs_inst *)inst->prev;
2478 if (prev->opcode == SHADER_OPCODE_SQRT) {
2479 if (inst->src[0].equals(prev->dst)) {
2480 inst->opcode = SHADER_OPCODE_RSQ;
2481 inst->src[0] = prev->src[0];
2496 fs_visitor::opt_register_renaming()
2498 bool progress = false;
2501 int remap[alloc.count];
2502 memset(remap, -1, sizeof(int) * alloc.count);
2504 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2505 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2507 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2508 inst->opcode == BRW_OPCODE_WHILE) {
2512 /* Rewrite instruction sources. */
2513 for (int i = 0; i < inst->sources; i++) {
2514 if (inst->src[i].file == GRF &&
2515 remap[inst->src[i].reg] != -1 &&
2516 remap[inst->src[i].reg] != inst->src[i].reg) {
2517 inst->src[i].reg = remap[inst->src[i].reg];
2522 const int dst = inst->dst.reg;
2525 inst->dst.file == GRF &&
2526 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2527 !inst->is_partial_write()) {
2528 if (remap[dst] == -1) {
            remap[dst] = dst;
         } else {
            remap[dst] = alloc.allocate(inst->dst.width / 8);
2532 inst->dst.reg = remap[dst];
2535 } else if (inst->dst.file == GRF &&
                 remap[dst] != -1 &&
                 remap[dst] != dst) {
2538 inst->dst.reg = remap[dst];
2544 invalidate_live_intervals();
2546 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2547 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2548 delta_x[i].reg = remap[delta_x[i].reg];
2551 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2552 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2553 delta_y[i].reg = remap[delta_y[i].reg];
2562 fs_visitor::compute_to_mrf()
2564 bool progress = false;
2567 /* No MRFs on Gen >= 7. */
2571 calculate_live_intervals();
2573 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2577 if (inst->opcode != BRW_OPCODE_MOV ||
2578 inst->is_partial_write() ||
2579 inst->dst.file != MRF || inst->src[0].file != GRF ||
2580 inst->dst.type != inst->src[0].type ||
2581 inst->src[0].abs || inst->src[0].negate ||
2582 !inst->src[0].is_contiguous() ||
2583 inst->src[0].subreg_offset)
2586 /* Work out which hardware MRF registers are written by this
2589 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
2592 mrf_high = mrf_low + 4;
2593 } else if (inst->exec_size == 16) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }
2599 /* Can't compute-to-MRF this GRF if someone else was going to
2602 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2605 /* Found a move of a GRF to a MRF. Let's see if we can go
2606 * rewrite the thing that made this GRF to write into the MRF.
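       *
       * Sketch of the rewrite (IR spelling approximate):
       *
       *    add vgrf5, vgrf1, vgrf2
       *    mov m4, vgrf5
       *
       * becomes simply:
       *
       *    add m4, vgrf1, vgrf2
       */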
2608 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2609 if (scan_inst->dst.file == GRF &&
2610 scan_inst->dst.reg == inst->src[0].reg) {
2611 /* Found the last thing to write our reg we want to turn
2612 * into a compute-to-MRF.
2615 /* If this one instruction didn't populate all the
2616 * channels, bail. We might be able to rewrite everything
2617 * that writes that reg, but it would require smarter
2618 * tracking to delay the rewriting until complete success.
2620 if (scan_inst->is_partial_write())
               /* Instructions writing more than one register would require us
                * to understand coalescing out more than one MOV at a time.
                */
2626 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2629 /* SEND instructions can't have MRF as a destination. */
2630 if (scan_inst->mlen)
2633 if (brw->gen == 6) {
2634 /* gen6 math instructions must have the destination be
2635 * GRF, so no compute-to-MRF for them.
2637 if (scan_inst->is_math()) {
2642 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2643 /* Found the creator of our MRF's source value. */
2644 scan_inst->dst.file = MRF;
2645 scan_inst->dst.reg = inst->dst.reg;
2646 scan_inst->saturate |= inst->saturate;
2647 inst->remove(block);
            /* We don't handle control flow here.  Most computation of values
             * that end up in MRFs is shortly before the MRF write anyway.
             */
            if (block->start() == scan_inst)
2660 /* You can't read from an MRF, so if someone else reads our
2661 * MRF's source GRF that we wanted to rewrite, that stops us.
2663 bool interfered = false;
2664 for (int i = 0; i < scan_inst->sources; i++) {
2665 if (scan_inst->src[i].file == GRF &&
2666 scan_inst->src[i].reg == inst->src[0].reg &&
2667 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2674 if (scan_inst->dst.file == MRF) {
2675 /* If somebody else writes our MRF here, we can't
2676 * compute-to-MRF before that.
2678 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
               int scan_mrf_high;
               if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2682 scan_mrf_high = scan_mrf_low + 4;
2683 } else if (scan_inst->exec_size == 16) {
2684 scan_mrf_high = scan_mrf_low + 1;
2686 scan_mrf_high = scan_mrf_low;
2689 if (mrf_low == scan_mrf_low ||
2690 mrf_low == scan_mrf_high ||
2691 mrf_high == scan_mrf_low ||
2692 mrf_high == scan_mrf_high) {
2697 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2698 /* Found a SEND instruction, which means that there are
2699 * live values in MRFs from base_mrf to base_mrf +
2700 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2703 if (mrf_low >= scan_inst->base_mrf &&
2704 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2707 if (mrf_high >= scan_inst->base_mrf &&
2708 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2716 invalidate_live_intervals();
2722 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2723 * instructions to FS_OPCODE_REP_FB_WRITE.
2726 fs_visitor::emit_repclear_shader()
2728 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   int base_mrf = 1;
   int color_mrf = base_mrf + 2;
2732 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2733 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2734 mov->force_writemask_all = true;
   fs_inst *write;
   if (key->nr_color_regions == 1) {
2738 write = emit(FS_OPCODE_REP_FB_WRITE);
2739 write->saturate = key->clamp_fragment_color;
2740 write->base_mrf = color_mrf;
2742 write->header_present = false;
   } else {
      assume(key->nr_color_regions > 0);
2746 for (int i = 0; i < key->nr_color_regions; ++i) {
2747 write = emit(FS_OPCODE_REP_FB_WRITE);
2748 write->saturate = key->clamp_fragment_color;
2749 write->base_mrf = base_mrf;
2751 write->header_present = true;
2759 assign_constant_locations();
2760 assign_curb_setup();
2762 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2763 assert(mov->src[0].file == HW_REG);
2764 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2768 * Walks through basic blocks, looking for repeated MRF writes and
2769 * removing the later ones.
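 *
 * For instance (a contrived sketch), two identical "mov m3, vgrf2"
 * instructions with no intervening write to m3 or vgrf2 leave the
 * second MOV redundant, and it is removed.
 */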
2772 fs_visitor::remove_duplicate_mrf_writes()
2774 fs_inst *last_mrf_move[16];
2775 bool progress = false;
2777 /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;
2781 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2783 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2784 if (inst->is_control_flow()) {
2785 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2788 if (inst->opcode == BRW_OPCODE_MOV &&
2789 inst->dst.file == MRF) {
2790 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2791 if (prev_inst && inst->equals(prev_inst)) {
2792 inst->remove(block);
2798 /* Clear out the last-write records for MRFs that were overwritten. */
2799 if (inst->dst.file == MRF) {
2800 last_mrf_move[inst->dst.reg] = NULL;
2803 if (inst->mlen > 0 && inst->base_mrf != -1) {
2804 /* Found a SEND instruction, which will include two or fewer
2805 * implied MRF writes. We could do better here.
2807 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2808 last_mrf_move[inst->base_mrf + i] = NULL;
2812 /* Clear out any MRF move records whose sources got overwritten. */
2813 if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2815 if (last_mrf_move[i] &&
2816 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2817 last_mrf_move[i] = NULL;
2822 if (inst->opcode == BRW_OPCODE_MOV &&
2823 inst->dst.file == MRF &&
2824 inst->src[0].file == GRF &&
2825 !inst->is_partial_write()) {
2826 last_mrf_move[inst->dst.reg] = inst;
2831 invalidate_live_intervals();
2837 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2838 int first_grf, int grf_len)
2840 /* Clear the flag for registers that actually got read (as expected). */
2841 for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
2844 grf = inst->src[i].reg;
2845 } else if (inst->src[i].file == HW_REG &&
2846 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2847 grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
2853 grf < first_grf + grf_len) {
2854 deps[grf - first_grf] = false;
2855 if (inst->exec_size == 16)
2856 deps[grf - first_grf + 1] = false;
2862 * Implements this workaround for the original 965:
2864 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2865 * check for post destination dependencies on this instruction, software
2866 * must ensure that there is no destination hazard for the case of ‘write
2867 * followed by a posted write’ shown in the following example.
 *
 * 1. mov r3 0
 * 2. send r3.xy <rest of send instruction>
 * 3. mov r2 r3
 *
2873 * Due to no post-destination dependency check on the ‘send’, the above
2874 * code sequence could have two instructions (1 and 2) in flight at the
2875 * same time that both consider ‘r3’ as the target of their final writes.
2878 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2881 int write_len = inst->regs_written;
2882 int first_write_grf = inst->dst.reg;
2883 bool needs_dep[BRW_MAX_MRF];
2884 assert(write_len < (int)sizeof(needs_dep) - 1);
2886 memset(needs_dep, false, sizeof(needs_dep));
2887 memset(needs_dep, true, write_len);
2889 clear_deps_for_inst_src(inst, dispatch_width,
2890 needs_dep, first_write_grf, write_len);
2892 /* Walk backwards looking for writes to registers we're writing which
2893 * aren't read since being written. If we hit the start of the program,
2894 * we assume that there are no outstanding dependencies on entry to the
2897 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2898 /* If we hit control flow, assume that there *are* outstanding
2899 * dependencies, and force their cleanup before our instruction.
2901 if (block->start() == scan_inst) {
2902 for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }
2910 /* We insert our reads as late as possible on the assumption that any
2911 * instruction but a MOV that might have left us an outstanding
2912 * dependency has more latency than a MOV.
2914 if (scan_inst->dst.file == GRF) {
2915 for (int i = 0; i < scan_inst->regs_written; i++) {
2916 int reg = scan_inst->dst.reg + i;
2918 if (reg >= first_write_grf &&
2919 reg < first_write_grf + write_len &&
2920 needs_dep[reg - first_write_grf]) {
2921 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2922 needs_dep[reg - first_write_grf] = false;
2923 if (scan_inst->exec_size == 16)
2924 needs_dep[reg - first_write_grf + 1] = false;
2929 /* Clear the flag for registers that actually got read (as expected). */
2930 clear_deps_for_inst_src(scan_inst, dispatch_width,
2931 needs_dep, first_write_grf, write_len);
2933 /* Continue the loop only if we haven't resolved all the dependencies */
2935 for (i = 0; i < write_len; i++) {
2945 * Implements this workaround for the original 965:
2947 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2948 * used as a destination register until after it has been sourced by an
2949 * instruction with a different destination register.
2952 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2954 int write_len = inst->regs_written;
2955 int first_write_grf = inst->dst.reg;
2956 bool needs_dep[BRW_MAX_MRF];
2957 assert(write_len < (int)sizeof(needs_dep) - 1);
2959 memset(needs_dep, false, sizeof(needs_dep));
2960 memset(needs_dep, true, write_len);
2961 /* Walk forwards looking for writes to registers we're writing which aren't
2962 * read before being written.
2964 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2965 /* If we hit control flow, force resolve all remaining dependencies. */
2966 if (block->end() == scan_inst) {
2967 for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(block,
                                        DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }
2975 /* Clear the flag for registers that actually got read (as expected). */
2976 clear_deps_for_inst_src(scan_inst, dispatch_width,
2977 needs_dep, first_write_grf, write_len);
2979 /* We insert our reads as late as possible since they're reading the
2980 * result of a SEND, which has massive latency.
2982 if (scan_inst->dst.file == GRF &&
2983 scan_inst->dst.reg >= first_write_grf &&
2984 scan_inst->dst.reg < first_write_grf + write_len &&
2985 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2986 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2987 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2990 /* Continue the loop only if we haven't resolved all the dependencies */
2992 for (i = 0; i < write_len; i++) {
   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
3003 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3004 assert(last_inst->eot);
3005 for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3012 fs_visitor::insert_gen4_send_dependency_workarounds()
   if (brw->gen != 4 || brw->is_g4x)
      return;
3017 bool progress = false;
3019 /* Note that we're done with register allocation, so GRF fs_regs always
3020 * have a .reg_offset of 0.
3023 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3024 if (inst->mlen != 0 && inst->dst.file == GRF) {
3025 insert_gen4_pre_send_dependency_workarounds(block, inst);
3026 insert_gen4_post_send_dependency_workarounds(block, inst);
3032 invalidate_live_intervals();
3036 * Turns the generic expression-style uniform pull constant load instruction
3037 * into a hardware-specific series of instructions for loading a pull
3040 * The expression style allows the CSE pass before this to optimize out
3041 * repeated loads from the same offset, and gives the pre-register-allocation
3042 * scheduling full flexibility, while the conversion to native instructions
3043 * allows the post-register-allocation scheduler the best information
3046 * Note that execution masking for setting up pull constant loads is special:
3047 * the channels that need to be written are unrelated to the current execution
3048 * mask, since a later instruction will use one of the result channels as a
3049 * source operand for all 8 or 16 of its channels.
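 *
 * Roughly (IR spelling approximate), on Gen7+ the generic
 *
 *    uniform_pull_constant_load dst, surf_index, byte_offset
 *
 * is rewritten below as
 *
 *    set_simd4x2_offset payload, byte_offset / 4
 *    uniform_pull_constant_load_gen7 dst, surf_index, payload
 */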
3052 fs_visitor::lower_uniform_pull_constant_loads()
3054 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3055 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3058 if (brw->gen >= 7) {
3059 /* The offset arg before was a vec4-aligned byte offset. We need to
3060 * turn it into a dword offset.
3062 fs_reg const_offset_reg = inst->src[1];
3063 assert(const_offset_reg.file == IMM &&
3064 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3065 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3066 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3068 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3069 * Reserve space for the register.
3071 if (brw->gen >= 9) {
3072 payload.reg_offset++;
3073 alloc.sizes[payload.reg] = 2;
3076 /* This is actually going to be a MOV, but since only the first dword
3077 * is accessed, we have a special opcode to do just that one. Note
3078 * that this needs to be an operation that will be considered a def
3079 * by live variable analysis, or register allocation will explode.
3081 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3082 8, payload, const_offset_reg);
3083 setup->force_writemask_all = true;
3085 setup->ir = inst->ir;
3086 setup->annotation = inst->annotation;
3087 inst->insert_before(block, setup);
3089 /* Similarly, this will only populate the first 4 channels of the
3090 * result register (since we only use smear values from 0-3), but we
3091 * don't tell the optimizer.
3093 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3094 inst->src[1] = payload;
3096 invalidate_live_intervals();
3098 /* Before register allocation, we didn't tell the scheduler about the
3099 * MRF we use. We know it's safe to use this MRF because nothing
3100 * else does except for register spill/unspill, which generates and
3101 * uses its MRF within a single IR instruction.
3103 inst->base_mrf = 14;
3110 fs_visitor::lower_load_payload()
3112 bool progress = false;
3114 int vgrf_to_reg[alloc.count];
3115 int reg_count = 16; /* Leave room for MRF */
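   /* Build a map from VGRF numbers to a flat register index space in
    * which every HW-register-sized slot is distinct, with indices 0-15
    * reserved for the MRFs.  E.g. (sizes assumed for illustration),
    * vgrf0 of size 2 maps to index 16 and vgrf1 then starts at 18.
    */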
3116 for (unsigned i = 0; i < alloc.count; ++i) {
3117 vgrf_to_reg[i] = reg_count;
3118 reg_count += alloc.sizes[i];
   }

   struct {
      bool written:1; /* Whether this register has ever been written */
3123 bool force_writemask_all:1;
3124 bool force_sechalf:1;
3125 } metadata[reg_count];
3126 memset(metadata, 0, sizeof(metadata));
3128 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3130 if (inst->dst.file == GRF) {
3131 dst_reg = vgrf_to_reg[inst->dst.reg];
3134 dst_reg = inst->dst.reg;
3137 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3138 bool force_sechalf = inst->force_sechalf;
3139 bool toggle_sechalf = inst->dst.width == 16 &&
3140 type_sz(inst->dst.type) == 4;
3141 for (int i = 0; i < inst->regs_written; ++i) {
3142 metadata[dst_reg + i].written = true;
3143 metadata[dst_reg + i].force_sechalf = force_sechalf;
3144 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3145 force_sechalf = (toggle_sechalf != force_sechalf);
3149 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3150 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3151 fs_reg dst = inst->dst;
3153 for (int i = 0; i < inst->sources; i++) {
3154 dst.width = inst->src[i].effective_width;
3155 dst.type = inst->src[i].type;
3157 if (inst->src[i].file == BAD_FILE) {
3158 /* Do nothing but otherwise increment as normal */
            } else if (dst.file == MRF &&
                       dst.width == 8 &&
                       brw->has_compr4 &&
3162 i + 4 < inst->sources &&
3163 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3164 fs_reg compr4_dst = dst;
3165 compr4_dst.reg += BRW_MRF_COMPR4;
3166 compr4_dst.width = 16;
3167 fs_reg compr4_src = inst->src[i];
3168 compr4_src.width = 16;
3169 fs_inst *mov = MOV(compr4_dst, compr4_src);
3170 mov->force_writemask_all = true;
3171 inst->insert_before(block, mov);
3172 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3173 inst->src[i + 4].file = BAD_FILE;
3175 fs_inst *mov = MOV(dst, inst->src[i]);
3176 if (inst->src[i].file == GRF) {
3177 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3178 inst->src[i].reg_offset;
3179 mov->force_sechalf = metadata[src_reg].force_sechalf;
3180 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3181 metadata[dst_reg] = metadata[src_reg];
3182 if (dst.width * type_sz(dst.type) > 32) {
3183 assert((!metadata[src_reg].written ||
3184 !metadata[src_reg].force_sechalf) &&
3185 (!metadata[src_reg + 1].written ||
3186 metadata[src_reg + 1].force_sechalf));
3187 metadata[dst_reg + 1] = metadata[src_reg + 1];
3190 metadata[dst_reg].force_writemask_all = false;
3191 metadata[dst_reg].force_sechalf = false;
3192 if (dst.width == 16) {
3193 metadata[dst_reg + 1].force_writemask_all = false;
3194 metadata[dst_reg + 1].force_sechalf = true;
3197 inst->insert_before(block, mov);
3200 dst = offset(dst, 1);
3203 inst->remove(block);
3209 invalidate_live_intervals();
3215 fs_visitor::dump_instructions()
3217 dump_instructions(NULL);
3221 fs_visitor::dump_instructions(const char *name)
3223 FILE *file = stderr;
3224 if (name && geteuid() != 0) {
      file = fopen(name, "w");
   }

   if (cfg) {
3231 calculate_register_pressure();
3232 int ip = 0, max_pressure = 0;
3233 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3234 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3235 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3236 dump_instruction(inst, file);
3239 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3242 foreach_in_list(backend_instruction, inst, &instructions) {
3243 fprintf(file, "%4d: ", ip++);
3244 dump_instruction(inst, file);
   if (file != stderr) {
      fclose(file);
   }
3254 fs_visitor::dump_instruction(backend_instruction *be_inst)
3256 dump_instruction(be_inst, stderr);
3260 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3262 fs_inst *inst = (fs_inst *)be_inst;
3264 if (inst->predicate) {
3265 fprintf(file, "(%cf0.%d) ",
3266 inst->predicate_inverse ? '-' : '+',
3270 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3272 fprintf(file, ".sat");
3273 if (inst->conditional_mod) {
3274 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3275 if (!inst->predicate &&
3276 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3277 inst->opcode != BRW_OPCODE_IF &&
3278 inst->opcode != BRW_OPCODE_WHILE))) {
3279 fprintf(file, ".f0.%d", inst->flag_subreg);
3282 fprintf(file, "(%d) ", inst->exec_size);
3285 switch (inst->dst.file) {
3287 fprintf(file, "vgrf%d", inst->dst.reg);
3288 if (inst->dst.width != dispatch_width)
3289 fprintf(file, "@%d", inst->dst.width);
3290 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3291 inst->dst.subreg_offset)
3292 fprintf(file, "+%d.%d",
3293 inst->dst.reg_offset, inst->dst.subreg_offset);
3296 fprintf(file, "m%d", inst->dst.reg);
3299 fprintf(file, "(null)");
3302 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3305 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3308 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3309 switch (inst->dst.fixed_hw_reg.nr) {
3311 fprintf(file, "null");
3313 case BRW_ARF_ADDRESS:
3314 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3316 case BRW_ARF_ACCUMULATOR:
3317 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3320 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3321 inst->dst.fixed_hw_reg.subnr);
3324 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3325 inst->dst.fixed_hw_reg.subnr);
3329 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3331 if (inst->dst.fixed_hw_reg.subnr)
3332 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3335 fprintf(file, "???");
3338 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3340 for (int i = 0; i < inst->sources; i++) {
3341 if (inst->src[i].negate)
3343 if (inst->src[i].abs)
3345 switch (inst->src[i].file) {
3347 fprintf(file, "vgrf%d", inst->src[i].reg);
3348 if (inst->src[i].width != dispatch_width)
3349 fprintf(file, "@%d", inst->src[i].width);
3350 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3351 inst->src[i].subreg_offset)
3352 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3353 inst->src[i].subreg_offset);
3356 fprintf(file, "***m%d***", inst->src[i].reg);
3359 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3362 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3363 if (inst->src[i].reladdr) {
3364 fprintf(file, "+reladdr");
3365 } else if (inst->src[i].subreg_offset) {
3366 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3367 inst->src[i].subreg_offset);
3371 fprintf(file, "(null)");
3374 switch (inst->src[i].type) {
3375 case BRW_REGISTER_TYPE_F:
3376 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3378 case BRW_REGISTER_TYPE_W:
3379 case BRW_REGISTER_TYPE_D:
3380 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3382 case BRW_REGISTER_TYPE_UW:
3383 case BRW_REGISTER_TYPE_UD:
3384 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3386 case BRW_REGISTER_TYPE_VF:
3387 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3388 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3389 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3390 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3391 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3394 fprintf(file, "???");
3399 if (inst->src[i].fixed_hw_reg.negate)
3401 if (inst->src[i].fixed_hw_reg.abs)
3403 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3404 switch (inst->src[i].fixed_hw_reg.nr) {
3406 fprintf(file, "null");
3408 case BRW_ARF_ADDRESS:
3409 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3411 case BRW_ARF_ACCUMULATOR:
3412 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3415 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3416 inst->src[i].fixed_hw_reg.subnr);
3419 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3420 inst->src[i].fixed_hw_reg.subnr);
3424 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3426 if (inst->src[i].fixed_hw_reg.subnr)
3427 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3428 if (inst->src[i].fixed_hw_reg.abs)
3432 fprintf(file, "???");
3435 if (inst->src[i].abs)
3438 if (inst->src[i].file != IMM) {
3439 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3442 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3443 fprintf(file, ", ");
3448 if (dispatch_width == 16 && inst->exec_size == 8) {
3449 if (inst->force_sechalf)
3450 fprintf(file, "2ndhalf ");
3452 fprintf(file, "1sthalf ");
3455 fprintf(file, "\n");
3459 * Possibly returns an instruction that set up @param reg.
3461 * Sometimes we want to take the result of some expression/variable
3462 * dereference tree and rewrite the instruction generating the result
3463 * of the tree. When processing the tree, we know that the
3464 * instructions generated are all writing temporaries that are dead
3465 * outside of this tree. So, if we have some instructions that write
3466 * a temporary, we're free to point that temp write somewhere else.
 * Note that this doesn't guarantee that the instruction wrote only reg --
 * it might be the size=4 destination of a texture instruction.
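 *
 * A typical use (a sketch, not a description of every caller): after
 * emitting the instructions for an expression tree into a temporary, ask
 * for the instruction that produced the temporary and, if one is
 * returned, redirect its destination, saving a MOV.
 */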
3472 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3477 end->is_partial_write() ||
3479 !reg.equals(end->dst)) {
3487 fs_visitor::setup_payload_gen6()
3490 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3491 unsigned barycentric_interp_modes =
3492 (stage == MESA_SHADER_FRAGMENT) ?
3493 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3495 assert(brw->gen >= 6);
3497 /* R0-1: masks, pixel X/Y coordinates. */
3498 payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch. */
3501 /* R3-26: barycentric interpolation coordinates. These appear in the
3502 * same order that they appear in the brw_wm_barycentric_interp_mode
3503 * enum. Each set of coordinates occupies 2 registers if dispatch width
3504 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3505 * appear if they were enabled using the "Barycentric Interpolation
3506 * Mode" bits in WM_STATE.
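    *
    * A concrete example (assuming the layout just described): with two
    * barycentric modes enabled, SIMD8 consumes 2 + 2 = 4 payload
    * registers for them, while SIMD16 consumes 4 + 4 = 8.
    */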
3508 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3509 if (barycentric_interp_modes & (1 << i)) {
3510 payload.barycentric_coord_reg[i] = payload.num_regs;
3511 payload.num_regs += 2;
3512 if (dispatch_width == 16) {
3513 payload.num_regs += 2;
3518 /* R27: interpolated depth if uses source depth */
3520 payload.source_depth_reg = payload.num_regs;
3522 if (dispatch_width == 16) {
3523 /* R28: interpolated depth if not SIMD8. */
3527 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3529 payload.source_w_reg = payload.num_regs;
3531 if (dispatch_width == 16) {
3532 /* R30: interpolated W if not SIMD8. */
3537 if (stage == MESA_SHADER_FRAGMENT) {
3538 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3539 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3540 prog_data->uses_pos_offset = key->compute_pos_offset;
3541 /* R31: MSAA position offsets. */
3542 if (prog_data->uses_pos_offset) {
3543 payload.sample_pos_reg = payload.num_regs;
3548 /* R32: MSAA input coverage mask */
3549 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3550 assert(brw->gen >= 7);
3551 payload.sample_mask_in_reg = payload.num_regs;
3553 if (dispatch_width == 16) {
3554 /* R33: input coverage mask if not SIMD8. */
3559 /* R34-: bary for 32-pixel. */
3560 /* R58-59: interp W for 32-pixel. */
3562 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3563 source_depth_to_render_target = true;
3568 fs_visitor::setup_vs_payload()
3570 /* R0: thread header, R1: urb handles */
3571 payload.num_regs = 2;
3575 fs_visitor::assign_binding_table_offsets()
3577 assert(stage == MESA_SHADER_FRAGMENT);
3578 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3579 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3580 uint32_t next_binding_table_offset = 0;
3582 /* If there are no color regions, we still perform an FB write to a null
3583 * renderbuffer, which we place at surface index 0.
3585 prog_data->binding_table.render_target_start = next_binding_table_offset;
3586 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3588 assign_common_binding_table_offsets(next_binding_table_offset);
3592 fs_visitor::calculate_register_pressure()
3594 invalidate_live_intervals();
3595 calculate_live_intervals();
3597 unsigned num_instructions = 0;
3598 foreach_block(block, cfg)
3599 num_instructions += block->instructions.length();
3601 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3603 for (unsigned reg = 0; reg < alloc.count; reg++) {
3604 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3605 regs_live_at_ip[ip] += alloc.sizes[reg];
3610 fs_visitor::optimize()
3612 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3614 split_virtual_grfs();
3616 move_uniform_array_access_to_pull_constants();
3617 assign_constant_locations();
3618 demote_pull_constants();
3620 #define OPT(pass, args...) ({ \
3622 bool this_progress = pass(args); \
3624 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3625 char filename[64]; \
3626 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3627 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3629 backend_visitor::dump_instructions(filename); \
3632 progress = progress || this_progress; \
3636 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3638 snprintf(filename, 64, "%s%d-%04d-00-start",
3639 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3641 backend_visitor::dump_instructions(filename);
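   /* The OPT() calls below each run one pass and accumulate whether any
    * progress was made; the iteration and pass_num counters referenced by
    * the macro above number the per-pass IR dumps produced under
    * INTEL_DEBUG=optimizer.
    */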
3652 OPT(remove_duplicate_mrf_writes);
3656 OPT(opt_copy_propagate);
3657 OPT(opt_peephole_predicated_break);
3658 OPT(opt_cmod_propagation);
3659 OPT(dead_code_eliminate);
3660 OPT(opt_peephole_sel);
3661 OPT(dead_control_flow_eliminate, this);
3662 OPT(opt_register_renaming);
3663 OPT(opt_saturate_propagation);
3664 OPT(register_coalesce);
3665 OPT(compute_to_mrf);
3667 OPT(compact_virtual_grfs);
3672 if (OPT(lower_load_payload)) {
3673 split_virtual_grfs();
3674 OPT(register_coalesce);
3675 OPT(compute_to_mrf);
3676 OPT(dead_code_eliminate);
3679 OPT(opt_combine_constants);
3681 lower_uniform_pull_constant_loads();
 * Three-source instructions must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3689 fs_visitor::fixup_3src_null_dest()
3691 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3692 if (inst->is_3src() && inst->dst.is_null()) {
3693 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3700 fs_visitor::allocate_registers()
3702 bool allocated_without_spills;
3704 static const enum instruction_scheduler_mode pre_modes[] = {
3706 SCHEDULE_PRE_NON_LIFO,
3710 /* Try each scheduling heuristic to see if it can successfully register
3711 * allocate without spilling. They should be ordered by decreasing
3712 * performance but increasing likelihood of allocating.
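    * Roughly speaking (a characterization, not a guarantee), SCHEDULE_PRE
    * optimizes for latency hiding at the cost of register pressure, while
    * the LIFO variant keeps live ranges short so that allocation is more
    * likely to succeed.
    */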
3714 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3715 schedule_instructions(pre_modes[i]);
      if (0) {
         assign_regs_trivial();
         allocated_without_spills = true;
      } else {
         allocated_without_spills = assign_regs(false);
      }
3723 if (allocated_without_spills)
3727 if (!allocated_without_spills) {
3728 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3729 "Vertex" : "Fragment";
3731 /* We assume that any spilling is worse than just dropping back to
3732 * SIMD8. There's probably actually some intermediate point where
3733 * SIMD16 with a couple of spills is still better.
3735 if (dispatch_width == 16) {
3736 fail("Failure to register allocate. Reduce number of "
3737 "live scalar values to avoid this.");
3739 perf_debug("%s shader triggered register spilling. "
3740 "Try reducing the number of live scalar values to "
3741 "improve performance.\n", stage_name);
3744 /* Since we're out of heuristics, just go spill registers until we
3745 * get an allocation.
      while (!assign_regs(true)) {
         if (failed)
            break;
      }
3753 /* This must come after all optimization and register allocation, since
3754 * it inserts dead code that happens to have side effects, and it does
3755 * so based on the actual physical registers in use.
3757 insert_gen4_send_dependency_workarounds();
3762 if (!allocated_without_spills)
3763 schedule_instructions(SCHEDULE_POST);
3765 if (last_scratch > 0)
3766 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3770 fs_visitor::run_vs()
3772 assert(stage == MESA_SHADER_VERTEX);
3774 assign_common_binding_table_offsets(0);
3777 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3778 emit_shader_time_begin();
3780 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3782 this->result = reg_undef;
3795 assign_curb_setup();
3796 assign_vs_urb_setup();
3798 fixup_3src_null_dest();
3799 allocate_registers();
3805 fs_visitor::run_fs()
3807 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3808 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3810 assert(stage == MESA_SHADER_FRAGMENT);
3812 sanity_param_count = prog->Parameters->NumParameters;
3814 assign_binding_table_offsets();
   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();
3823 } else if (brw->use_rep_send && dispatch_width == 16) {
3824 emit_repclear_shader();
3826 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3827 emit_shader_time_begin();
3829 calculate_urb_setup();
3830 if (prog->InputsRead > 0) {
         if (brw->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
3837 /* We handle discards by keeping track of the still-live pixels in f0.1.
3838 * Initialize it with the dispatched pixels.
3840 if (wm_prog_data->uses_kill) {
3841 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3842 discard_init->flag_subreg = 1;
3845 /* Generate FS IR for main(). (the visitor only descends into
3846 * functions called "main").
3849 if (getenv("INTEL_USE_NIR") != NULL) {
      emit_nir_code();
   } else {
      foreach_in_list(ir_instruction, ir, shader->base.ir) {
3854 this->result = reg_undef;
3859 emit_fragment_program_code();
3865 emit(FS_OPCODE_PLACEHOLDER_HALT);
3867 if (wm_key->alpha_test_func)
3876 assign_curb_setup();
3879 fixup_3src_null_dest();
3880 allocate_registers();
3886 if (dispatch_width == 8)
3887 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
   else
      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3891 /* If any state parameters were appended, then ParameterValues could have
3892 * been realloced, in which case the driver uniform storage set up by
3893 * _mesa_associate_uniform_storage() would point to freed memory. Make
3894 * sure that didn't happen.
3896 assert(sanity_param_count == prog->Parameters->NumParameters);
3902 brw_wm_fs_emit(struct brw_context *brw,
3904 const struct brw_wm_prog_key *key,
3905 struct brw_wm_prog_data *prog_data,
3906 struct gl_fragment_program *fp,
3907 struct gl_shader_program *prog,
3908 unsigned *final_assembly_size)
3910 bool start_busy = false;
3911 double start_time = 0;
3913 if (unlikely(brw->perf_debug)) {
3914 start_busy = (brw->batch.last_bo &&
3915 drm_intel_bo_busy(brw->batch.last_bo));
3916 start_time = get_time();
3919 struct brw_shader *shader = NULL;
3921 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3923 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3924 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3926 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3928 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
   if (!v.run()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }
3941 cfg_t *simd16_cfg = NULL;
3942 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3943 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3944 brw->use_rep_send)) {
3945 if (!v.simd16_unsupported) {
3946 /* Try a SIMD16 compile */
         v2.import_uniforms(&v);
         if (!v2.run()) {
            perf_debug("SIMD16 shader failed to compile, falling back to "
                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_cfg = v2.cfg;
         }
      } else {
3955 perf_debug("SIMD16 shader unsupported, falling back to "
3956 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
   cfg_t *simd8_cfg;
   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3962 if (no_simd8 && simd16_cfg) {
      simd8_cfg = NULL;
      prog_data->no_8 = true;
   } else {
      simd8_cfg = v.cfg;
      prog_data->no_8 = false;
   }
3970 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3971 &fp->Base, v.runtime_check_aads_emit, "FS");
3973 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3976 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3977 prog->Label ? prog->Label : "unnamed",
3980 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3982 g.enable_debug(name);
   if (simd8_cfg)
      g.generate_code(simd8_cfg, 8);
   if (simd16_cfg)
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3990 if (unlikely(brw->perf_debug) && shader) {
3991 if (shader->compiled_once)
3992 brw_wm_debug_recompile(brw, prog, key);
3993 shader->compiled_once = true;
3995 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3996 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3997 (get_time() - start_time) * 1000);
4001 return g.get_assembly(final_assembly_size);
4005 brw_fs_precompile(struct gl_context *ctx,
4006 struct gl_shader_program *shader_prog,
4007 struct gl_program *prog)
4009 struct brw_context *brw = brw_context(ctx);
4010 struct brw_wm_prog_key key;
4012 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4013 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4014 bool program_uses_dfdy = fp->UsesDFdy;
4016 memset(&key, 0, sizeof(key));
4020 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4022 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4023 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4025 /* Just assume depth testing. */
4026 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4027 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4030 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4031 BRW_FS_VARYING_INPUT_MASK) > 16)
4032 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4034 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4035 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4036 for (unsigned i = 0; i < sampler_count; i++) {
4037 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4038 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4039 key.tex.swizzles[i] =
4040 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4042 /* Color sampler: assume no swizzling. */
4043 key.tex.swizzles[i] = SWIZZLE_XYZW;
4047 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4048 key.drawable_height = ctx->DrawBuffer->Height;
4051 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4052 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4053 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4055 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4056 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4057 key.nr_color_regions > 1;
4060 key.program_string_id = bfp->id;
4062 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4063 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4065 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4067 brw->wm.base.prog_offset = old_prog_offset;
4068 brw->wm.prog_data = old_prog_data;