#include "brw_eu.h"
#include "brw_wm.h"
}
-#include "brw_shader.h"
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"
assert(src[2].reg_offset >= 0);
}
+/* ALU1/ALU2: generate trivial fs_visitor factory methods (MOV, ADD, ...)
+ * that allocate an fs_inst with the matching BRW_OPCODE_* out of mem_ctx.
+ * These only construct the instruction; callers still pass the result to
+ * emit() (see e.g. emit(MOV(...)) call sites below).
+ */
+#define ALU1(op) \
+ fs_inst * \
+ fs_visitor::op(fs_reg dst, fs_reg src0) \
+ { \
+ return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
+ }
+
+#define ALU2(op) \
+ fs_inst * \
+ fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
+ { \
+ return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
+ }
+
+/* Single-source ALU instructions. */
+ALU1(NOT)
+ALU1(MOV)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
+/* Two-source ALU instructions. */
+ALU2(ADD)
+ALU2(MUL)
+ALU2(MACH)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHL)
+ALU2(SHR)
+ALU2(ASR)
+
+/**
+ * Gen4 predicated IF: branches based on the given predicate value
+ * (the flag register is assumed to have been set beforehand, e.g. by
+ * a prior CMP -- TODO confirm against callers).
+ */
+fs_inst *
+fs_visitor::IF(uint32_t predicate)
+{
+ fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
+ inst->predicate = predicate;
+ return inst;
+}
+
+/**
+ * Gen6+ IF with embedded comparison: compares src0 against src1 with
+ * the given condition and branches on the result, so no separate CMP
+ * instruction is needed.  The destination is the null register since
+ * only the flag result matters.
+ */
+fs_inst *
+fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
+{
+ assert(intel->gen >= 6);
+ fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
+ reg_null_d, src0, src1);
+ inst->conditional_mod = condition;
+ return inst;
+}
+
+/**
+ * CMP: Sets the low bit of the destination channels with the result
+ * of the comparison, while the upper bits are undefined, and updates
+ * the flag register with the packed 16 bits of the result.
+ */
+fs_inst *
+fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
+{
+ fs_inst *inst;
+
+ /* Take the instruction:
+ *
+ * CMP null<d> src0<f> src1<f>
+ *
+ * Original gen4 does type conversion to the destination type before
+ * comparison, producing garbage results for floating point comparisons.
+ * gen5 does the comparison on the execution type (resolved source types),
+ * so dst type doesn't matter. gen6 does comparison and then uses the
+ * result as if it was the dst type with no conversion, which happens to
+ * mostly work out for float-interpreted-as-int since our comparisons are
+ * for >0, =0, <0.
+ */
+ if (intel->gen == 4) {
+ dst.type = src0.type;
+ if (dst.file == FIXED_HW_REG)
+ dst.fixed_hw_reg.type = dst.type;
+ }
+
+ /* NOTE(review): presumably lowers a negate modifier on UD-typed
+ * sources before the compare -- semantics assumed from the helper's
+ * name; resolve_ud_negate() is defined elsewhere in this file.
+ */
+ resolve_ud_negate(&src0);
+ resolve_ud_negate(&src1);
+
+ inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
+ inst->conditional_mod = condition;
+
+ return inst;
+}
+
bool
fs_inst::equals(fs_inst *inst)
{
src[1].equals(inst->src[1]) &&
src[2].equals(inst->src[2]) &&
saturate == inst->saturate &&
- predicated == inst->predicated &&
+ predicate == inst->predicate &&
conditional_mod == inst->conditional_mod &&
mlen == inst->mlen &&
base_mrf == inst->base_mrf &&
imm.u == r.imm.u);
}
+/** Return true if this register is an immediate with value zero,
+ * comparing as float or integer according to the register type.
+ */
+bool
+fs_reg::is_zero() const
+{
+ if (file != IMM)
+ return false;
+
+ /* Exact float compare is intentional: only a literal 0.0 matches. */
+ return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
+}
+
+/** Return true if this register is an immediate with value one,
+ * comparing as float or integer according to the register type.
+ */
+bool
+fs_reg::is_one() const
+{
+ if (file != IMM)
+ return false;
+
+ /* Exact float compare is intentional: only a literal 1.0 matches. */
+ return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
+}
+
int
fs_visitor::type_size(const struct glsl_type *type)
{
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
- return 1 * c->dispatch_width / 8;
+ return 1 * dispatch_width / 8;
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
- return 2 * c->dispatch_width / 8;
+ return 2 * dispatch_width / 8;
case SHADER_OPCODE_TEX:
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXD:
return 1;
case FS_OPCODE_FB_WRITE:
return 2;
- case FS_OPCODE_PULL_CONSTANT_LOAD:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNSPILL:
return 1;
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
+ return inst->header_present;
case FS_OPCODE_SPILL:
return 2;
default:
for (unsigned int i = 0; i < type->vector_elements; i++) {
unsigned int param = c->prog_data.nr_params++;
- assert(param < ARRAY_SIZE(c->prog_data.param));
-
this->param_index[param] = loc;
this->param_offset[param] = i;
}
/* gl_FragCoord.x */
if (ir->pixel_center_integer) {
- emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
+ emit(MOV(wpos, this->pixel_x));
} else {
- emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
+ emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
}
wpos.reg_offset++;
/* gl_FragCoord.y */
if (!flip && ir->pixel_center_integer) {
- emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
+ emit(MOV(wpos, this->pixel_y));
} else {
fs_reg pixel_y = this->pixel_y;
float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
offset += c->key.drawable_height - 1.0;
}
- emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
+ emit(ADD(wpos, pixel_y, fs_reg(offset)));
}
wpos.reg_offset++;
/* gl_FragCoord.z */
if (intel->gen >= 6) {
- emit(BRW_OPCODE_MOV, wpos,
- fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
+ emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
} else {
emit(FS_OPCODE_LINTERP, wpos,
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
fs_inst *inst = emit_linterp(attr, fs_reg(interp),
interpolation_mode, false);
- inst->predicated = true;
+ inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = true;
}
if (intel->gen < 6) {
/* bit 31 is "primitive is back face", so checking < (1 << 31) gives
* us front face
*/
- fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
- fs_reg(r1_6ud),
- fs_reg(1u << 31));
- inst->conditional_mod = BRW_CONDITIONAL_L;
+ emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
}
if (intel->gen < 6) {
inst->base_mrf = 2;
- inst->mlen = c->dispatch_width / 8;
+ inst->mlen = dispatch_width / 8;
}
return inst;
inst = emit(opcode, dst, op0, reg_null_f);
inst->base_mrf = base_mrf;
- inst->mlen = 2 * c->dispatch_width / 8;
+ inst->mlen = 2 * dispatch_width / 8;
}
return inst;
}
void
fs_visitor::setup_paramvalues_refs()
{
- if (c->dispatch_width != 8)
+ if (dispatch_width != 8)
return;
/* Set up the pointers to ParamValues now that that array is finalized. */
fs_visitor::assign_curb_setup()
{
c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
- if (c->dispatch_width == 8) {
+ if (dispatch_width == 8) {
c->prog_data.first_curbe_grf = c->nr_payload_regs;
} else {
c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
*
* See compile_sf_prog() for more info.
*/
- if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
+ if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
}
this->live_intervals_valid = false;
}
+/**
+ * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
+ *
+ * During code generation, we create tons of temporary variables, many of
+ * which get immediately killed and are never used again. Yet, in later
+ * optimization and analysis passes, such as compute_live_intervals, we need
+ * to loop over all the virtual GRFs. Compacting them can save a lot of
+ * overhead.
+ */
+void
+fs_visitor::compact_virtual_grfs()
+{
+ /* Mark which virtual GRFs are used, and count how many. */
+ /* NOTE: a VLA in C++ is a GCC extension. -1 marks "unused"; the
+ * byte-wise memset works because -1 is 0xff in every byte.
+ */
+ int remap_table[this->virtual_grf_count];
+ memset(remap_table, -1, sizeof(remap_table));
+
+ foreach_list(node, &this->instructions) {
+ const fs_inst *inst = (const fs_inst *) node;
+
+ if (inst->dst.file == GRF)
+ remap_table[inst->dst.reg] = 0;
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == GRF)
+ remap_table[inst->src[i].reg] = 0;
+ }
+ }
+
+ /* In addition to registers used in instructions, fs_visitor keeps
+ * direct references to certain special values which must be patched:
+ */
+ fs_reg *special[] = {
+ &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
+ &outputs[0], &outputs[1], &outputs[2], &outputs[3],
+ &outputs[4], &outputs[5], &outputs[6], &outputs[7],
+ &delta_x[0], &delta_x[1], &delta_x[2],
+ &delta_x[3], &delta_x[4], &delta_x[5],
+ &delta_y[0], &delta_y[1], &delta_y[2],
+ &delta_y[3], &delta_y[4], &delta_y[5],
+ };
+ /* The list above hard-codes 6 barycentric slots and 8 draw buffers;
+ * keep these asserts in sync with it.
+ */
+ STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
+ STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
+
+ /* Treat all special values as used, to be conservative */
+ for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
+ if (special[i]->file == GRF)
+ remap_table[special[i]->reg] = 0;
+ }
+
+ /* Compact the GRF arrays. */
+ int new_index = 0;
+ for (int i = 0; i < this->virtual_grf_count; i++) {
+ if (remap_table[i] != -1) {
+ remap_table[i] = new_index;
+ virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
+ if (live_intervals_valid) {
+ virtual_grf_use[new_index] = virtual_grf_use[i];
+ virtual_grf_def[new_index] = virtual_grf_def[i];
+ }
+ ++new_index;
+ }
+ }
+
+ this->virtual_grf_count = new_index;
+
+ /* Patch all the instructions to use the newly renumbered registers */
+ foreach_list(node, &this->instructions) {
+ fs_inst *inst = (fs_inst *) node;
+
+ if (inst->dst.file == GRF)
+ inst->dst.reg = remap_table[inst->dst.reg];
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == GRF)
+ inst->src[i].reg = remap_table[inst->src[i].reg];
+ }
+ }
+
+ /* Patch all the references to special values */
+ for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
+ if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
+ special[i]->reg = remap_table[special[i]->reg];
+ }
+}
+
bool
fs_visitor::remove_dead_constants()
{
- if (c->dispatch_width == 8) {
+ if (dispatch_width == 8) {
this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
if (c->prog_data.nr_params <= max_uniform_components)
return;
- if (c->dispatch_width == 16) {
+ if (dispatch_width == 16) {
fail("Pull constants not supported in 16-wide\n");
return;
}
fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
fs_reg offset = fs_reg((unsigned)(((uniform_nr -
pull_uniform_base) * 4) & ~15));
- fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
- dst, index, offset);
+ fs_inst *pull =
+ new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ dst, index, offset);
pull->ir = inst->ir;
pull->annotation = inst->annotation;
pull->base_mrf = 14;
c->prog_data.nr_pull_params = pull_uniform_count;
}
-/**
- * Attempts to move immediate constants into the immediate
- * constant slot of following instructions.
- *
- * Immediate constants are a bit tricky -- they have to be in the last
- * operand slot, you can't do abs/negate on them,
- */
-
-bool
-fs_visitor::propagate_constants()
-{
- bool progress = false;
-
- calculate_live_intervals();
-
- foreach_list(node, &this->instructions) {
- fs_inst *inst = (fs_inst *)node;
-
- if (inst->opcode != BRW_OPCODE_MOV ||
- inst->predicated ||
- inst->dst.file != GRF || inst->src[0].file != IMM ||
- inst->dst.type != inst->src[0].type ||
- (c->dispatch_width == 16 &&
- (inst->force_uncompressed || inst->force_sechalf)))
- continue;
-
- /* Don't bother with cases where we should have had the
- * operation on the constant folded in GLSL already.
- */
- if (inst->saturate)
- continue;
-
- /* Found a move of a constant to a GRF. Find anything else using the GRF
- * before it's written, and replace it with the constant if we can.
- */
- for (fs_inst *scan_inst = (fs_inst *)inst->next;
- !scan_inst->is_tail_sentinel();
- scan_inst = (fs_inst *)scan_inst->next) {
- if (scan_inst->opcode == BRW_OPCODE_DO ||
- scan_inst->opcode == BRW_OPCODE_WHILE ||
- scan_inst->opcode == BRW_OPCODE_ELSE ||
- scan_inst->opcode == BRW_OPCODE_ENDIF) {
- break;
- }
-
- for (int i = 2; i >= 0; i--) {
- if (scan_inst->src[i].file != GRF ||
- scan_inst->src[i].reg != inst->dst.reg ||
- scan_inst->src[i].reg_offset != inst->dst.reg_offset)
- continue;
-
- /* Don't bother with cases where we should have had the
- * operation on the constant folded in GLSL already.
- */
- if (scan_inst->src[i].negate || scan_inst->src[i].abs)
- continue;
-
- switch (scan_inst->opcode) {
- case BRW_OPCODE_MOV:
- scan_inst->src[i] = inst->src[0];
- progress = true;
- break;
-
- case BRW_OPCODE_MUL:
- case BRW_OPCODE_ADD:
- if (i == 1) {
- scan_inst->src[i] = inst->src[0];
- progress = true;
- } else if (i == 0 && scan_inst->src[1].file != IMM) {
- /* Fit this constant in by commuting the operands.
- * Exception: we can't do this for 32-bit integer MUL
- * because it's asymmetric.
- */
- if (scan_inst->opcode == BRW_OPCODE_MUL &&
- (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
- scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
- break;
- scan_inst->src[0] = scan_inst->src[1];
- scan_inst->src[1] = inst->src[0];
- progress = true;
- }
- break;
-
- case BRW_OPCODE_CMP:
- case BRW_OPCODE_IF:
- if (i == 1) {
- scan_inst->src[i] = inst->src[0];
- progress = true;
- } else if (i == 0 && scan_inst->src[1].file != IMM) {
- uint32_t new_cmod;
-
- new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
- if (new_cmod != ~0u) {
- /* Fit this constant in by swapping the operands and
- * flipping the test
- */
- scan_inst->src[0] = scan_inst->src[1];
- scan_inst->src[1] = inst->src[0];
- scan_inst->conditional_mod = new_cmod;
- progress = true;
- }
- }
- break;
-
- case BRW_OPCODE_SEL:
- if (i == 1) {
- scan_inst->src[i] = inst->src[0];
- progress = true;
- } else if (i == 0 && scan_inst->src[1].file != IMM) {
- scan_inst->src[0] = scan_inst->src[1];
- scan_inst->src[1] = inst->src[0];
-
- /* If this was predicated, flipping operands means
- * we also need to flip the predicate.
- */
- if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
- scan_inst->predicate_inverse =
- !scan_inst->predicate_inverse;
- }
- progress = true;
- }
- break;
-
- case SHADER_OPCODE_RCP:
- /* The hardware doesn't do math on immediate values
- * (because why are you doing that, seriously?), but
- * the correct answer is to just constant fold it
- * anyway.
- */
- assert(i == 0);
- if (inst->src[0].imm.f != 0.0f) {
- scan_inst->opcode = BRW_OPCODE_MOV;
- scan_inst->src[0] = inst->src[0];
- scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
- progress = true;
- }
- break;
-
- case FS_OPCODE_PULL_CONSTANT_LOAD:
- scan_inst->src[i] = inst->src[0];
- progress = true;
- break;
-
- default:
- break;
- }
- }
-
- if (scan_inst->dst.file == GRF &&
- scan_inst->overwrites_reg(inst->dst)) {
- break;
- }
- }
- }
-
- if (progress)
- this->live_intervals_valid = false;
-
- return progress;
-}
-
-
-/**
- * Attempts to move immediate constants into the immediate
- * constant slot of following instructions.
- *
- * Immediate constants are a bit tricky -- they have to be in the last
- * operand slot, you can't do abs/negate on them,
- */
-
bool
fs_visitor::opt_algebraic()
{
bool progress = false;
- calculate_live_intervals();
-
foreach_list(node, &this->instructions) {
fs_inst *inst = (fs_inst *)node;
continue;
/* a * 1.0 = a */
- if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
- inst->src[1].imm.f == 1.0) {
+ if (inst->src[1].is_one()) {
inst->opcode = BRW_OPCODE_MOV;
inst->src[1] = reg_undef;
progress = true;
break;
}
+ /* a * 0.0 = 0.0 */
+ if (inst->src[1].is_zero()) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[0] = inst->src[1];
+ inst->src[1] = reg_undef;
+ progress = true;
+ break;
+ }
+
break;
+ case BRW_OPCODE_ADD:
+ if (inst->src[1].file != IMM)
+ continue;
+
+ /* a + 0.0 = a */
+ if (inst->src[1].is_zero()) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[1] = reg_undef;
+ progress = true;
+ break;
+ }
+ break;
default:
break;
}
fs_inst *inst = (fs_inst *)node;
if (inst->opcode != BRW_OPCODE_MOV ||
- inst->predicated ||
+ inst->predicate ||
inst->saturate ||
inst->src[0].file != GRF ||
inst->src[0].negate ||
continue;
if (inst->opcode != BRW_OPCODE_MOV ||
- inst->predicated ||
+ inst->predicate ||
inst->saturate ||
inst->dst.file != GRF || (inst->src[0].file != GRF &&
inst->src[0].file != UNIFORM)||
* unusual register regions, so avoid coalescing those for
* now. We should do something more specific.
*/
- if (intel->gen >= 6 &&
+ if (intel->gen == 6 &&
scan_inst->is_math() &&
(has_source_modifiers || inst->src[0].file == UNIFORM)) {
interfered = true;
next_ip++;
if (inst->opcode != BRW_OPCODE_MOV ||
- inst->predicated ||
+ inst->predicate ||
inst->dst.file != MRF || inst->src[0].file != GRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
int mrf_high;
if (inst->dst.reg & BRW_MRF_COMPR4) {
mrf_high = mrf_low + 4;
- } else if (c->dispatch_width == 16 &&
+ } else if (dispatch_width == 16 &&
(!inst->force_uncompressed && !inst->force_sechalf)) {
mrf_high = mrf_low + 1;
} else {
* that writes that reg, but it would require smarter
* tracking to delay the rewriting until complete success.
*/
- if (scan_inst->predicated)
+ if (scan_inst->predicate)
break;
/* If it's half of register setup and not the same half as
if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
scan_mrf_high = scan_mrf_low + 4;
- } else if (c->dispatch_width == 16 &&
+ } else if (dispatch_width == 16 &&
(!scan_inst->force_uncompressed &&
!scan_inst->force_sechalf)) {
scan_mrf_high = scan_mrf_low + 1;
bool progress = false;
/* Need to update the MRF tracking for compressed instructions. */
- if (c->dispatch_width == 16)
+ if (dispatch_width == 16)
return false;
memset(last_mrf_move, 0, sizeof(last_mrf_move));
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF &&
inst->src[0].file == GRF &&
- !inst->predicated) {
+ !inst->predicate) {
last_mrf_move[inst->dst.reg] = inst;
}
}
return progress;
}
+/**
+ * Print a single fs_inst to stdout in a compact human-readable form:
+ * opcode (with ".sat" if saturating), destination, up to three sources,
+ * and any force_uncompressed/force_sechalf flags.
+ */
+void
+fs_visitor::dump_instruction(fs_inst *inst)
+{
+ if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
+ opcode_descs[inst->opcode].name) {
+ printf("%s", opcode_descs[inst->opcode].name);
+ } else {
+ /* Opcode without a name-table entry: fall back to its number. */
+ printf("op%d", inst->opcode);
+ }
+ if (inst->saturate)
+ printf(".sat");
+ printf(" ");
+
+ switch (inst->dst.file) {
+ case GRF:
+ printf("vgrf%d", inst->dst.reg);
+ if (inst->dst.reg_offset)
+ printf("+%d", inst->dst.reg_offset);
+ break;
+ case MRF:
+ printf("m%d", inst->dst.reg);
+ break;
+ case BAD_FILE:
+ printf("(null)");
+ break;
+ case UNIFORM:
+ /* ***-marked: presumably not a valid dst file -- highlight it. */
+ printf("***u%d***", inst->dst.reg);
+ break;
+ default:
+ printf("???");
+ break;
+ }
+ printf(", ");
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].negate)
+ printf("-");
+ if (inst->src[i].abs)
+ printf("|");
+ switch (inst->src[i].file) {
+ case GRF:
+ printf("vgrf%d", inst->src[i].reg);
+ if (inst->src[i].reg_offset)
+ printf("+%d", inst->src[i].reg_offset);
+ break;
+ case MRF:
+ /* ***-marked: presumably not a valid src file -- highlight it. */
+ printf("***m%d***", inst->src[i].reg);
+ break;
+ case UNIFORM:
+ printf("u%d", inst->src[i].reg);
+ if (inst->src[i].reg_offset)
+ printf(".%d", inst->src[i].reg_offset);
+ break;
+ case BAD_FILE:
+ printf("(null)");
+ break;
+ default:
+ printf("???");
+ break;
+ }
+ if (inst->src[i].abs)
+ printf("|");
+
+ /* Fix: the condition was "i < 3", which is always true inside this
+ * loop and printed a stray ", " after the last source.
+ */
+ if (i < 2)
+ printf(", ");
+ }
+
+ printf(" ");
+
+ if (inst->force_uncompressed)
+ printf("1sthalf ");
+
+ if (inst->force_sechalf)
+ printf("2ndhalf ");
+
+ printf("\n");
+}
+
+/** Print the whole instruction stream, one "ip: instruction" line per
+ * fs_inst, via dump_instruction().
+ */
+void
+fs_visitor::dump_instructions()
+{
+ int ip = 0;
+ foreach_list(node, &this->instructions) {
+ fs_inst *inst = (fs_inst *)node;
+ printf("%d: ", ip++);
+ dump_instruction(inst);
+ }
+}
+
/**
* Possibly returns an instruction that set up @param reg.
*
fs_reg reg)
{
if (end == start ||
- end->predicated ||
+ end->predicate ||
end->force_uncompressed ||
end->force_sechalf ||
!reg.equals(end->dst)) {
}
}
-bool
-fs_visitor::run()
+/**
+ * Lay out the gen6+ fragment thread payload: fixed R0-1, then one set of
+ * barycentric coordinates per enabled interpolation mode, then source
+ * depth/W, accumulating the total into c->nr_payload_regs.
+ */
+void
+fs_visitor::setup_payload_gen6()
{
- uint32_t prog_offset_16 = 0;
- uint32_t orig_nr_params = c->prog_data.nr_params;
-
- brw_wm_payload_setup(brw, c);
-
- if (c->dispatch_width == 16) {
- /* align to 64 byte boundary. */
- while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
- brw_NOP(p);
+ struct intel_context *intel = &brw->intel;
+ /* Reading gl_FragCoord (WPOS) makes the payload carry both the
+ * interpolated depth and W registers below.
+ */
+ bool uses_depth =
+ (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
+ unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
+
+ assert(intel->gen >= 6);
+
+ /* R0-1: masks, pixel X/Y coordinates. */
+ c->nr_payload_regs = 2;
+ /* R2: only for 32-pixel dispatch. */
+
+ /* R3-26: barycentric interpolation coordinates. These appear in the
+ * same order that they appear in the brw_wm_barycentric_interp_mode
+ * enum. Each set of coordinates occupies 2 registers if dispatch width
+ * == 8 and 4 registers if dispatch width == 16. Coordinates only
+ * appear if they were enabled using the "Barycentric Interpolation
+ * Mode" bits in WM_STATE.
+ */
+ for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
+ if (barycentric_interp_modes & (1 << i)) {
+ c->barycentric_coord_reg[i] = c->nr_payload_regs;
+ c->nr_payload_regs += 2;
+ if (dispatch_width == 16) {
+ c->nr_payload_regs += 2;
+ }
}
+ }
- /* Save off the start of this 16-wide program in case we succeed. */
- prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
+ /* R27: interpolated depth if uses source depth */
+ if (uses_depth) {
+ c->source_depth_reg = c->nr_payload_regs;
+ c->nr_payload_regs++;
+ if (dispatch_width == 16) {
+ /* R28: interpolated depth if not 8-wide. */
+ c->nr_payload_regs++;
+ }
+ }
+ /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
+ if (uses_depth) {
+ c->source_w_reg = c->nr_payload_regs;
+ c->nr_payload_regs++;
+ if (dispatch_width == 16) {
+ /* R30: interpolated W if not 8-wide. */
+ c->nr_payload_regs++;
+ }
+ }
+ /* R31: MSAA position offsets. */
+ /* R32-: bary for 32-pixel. */
+ /* R58-59: interp W for 32-pixel. */
- brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+ c->source_depth_to_render_target = true;
}
+}
+
+bool
+fs_visitor::run()
+{
+ uint32_t orig_nr_params = c->prog_data.nr_params;
+
+ if (intel->gen >= 6)
+ setup_payload_gen6();
+ else
+ setup_payload_gen4();
if (0) {
emit_dummy_fs();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
- foreach_list(node, &*shader->ir) {
- ir_instruction *ir = (ir_instruction *)node;
- base_ir = ir;
- this->result = reg_undef;
- ir->accept(this);
+ if (shader) {
+ foreach_list(node, &*shader->ir) {
+ ir_instruction *ir = (ir_instruction *)node;
+ base_ir = ir;
+ this->result = reg_undef;
+ ir->accept(this);
+ }
+ } else {
+ emit_fragment_program_code();
}
+ base_ir = NULL;
if (failed)
return false;
do {
progress = false;
+ compact_virtual_grfs();
+
progress = remove_duplicate_mrf_writes() || progress;
- progress = propagate_constants() || progress;
progress = opt_algebraic() || progress;
progress = opt_cse() || progress;
progress = opt_copy_propagate() || progress;
+ progress = dead_code_eliminate() || progress;
progress = register_coalesce() || progress;
progress = register_coalesce_2() || progress;
progress = compute_to_mrf() || progress;
- progress = dead_code_eliminate() || progress;
} while (progress);
remove_dead_constants();
if (failed)
return false;
- generate_code();
-
- if (c->dispatch_width == 8) {
+ if (dispatch_width == 8) {
c->prog_data.reg_blocks = brw_register_blocks(grf_used);
} else {
c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
- c->prog_data.prog_offset_16 = prog_offset_16;
/* Make sure we didn't try to sneak in an extra uniform */
assert(orig_nr_params == c->prog_data.nr_params);
return !failed;
}
-bool
+const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
- struct gl_shader_program *prog)
+ struct gl_fragment_program *fp,
+ struct gl_shader_program *prog,
+ unsigned *final_assembly_size)
{
struct intel_context *intel = &brw->intel;
+ bool start_busy = false;
+ float start_time = 0;
- if (!prog)
- return false;
+ if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+ start_busy = (intel->batch.last_bo &&
+ drm_intel_bo_busy(intel->batch.last_bo));
+ start_time = get_time();
+ }
- struct brw_shader *shader =
- (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
- if (!shader)
- return false;
+ struct brw_shader *shader = NULL;
+ if (prog)
+ shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
- printf("GLSL IR for native fragment shader %d:\n", prog->Name);
- _mesa_print_ir(shader->ir, NULL);
- printf("\n\n");
+ if (shader) {
+ printf("GLSL IR for native fragment shader %d:\n", prog->Name);
+ _mesa_print_ir(shader->ir, NULL);
+ printf("\n\n");
+ } else {
+ printf("ARB_fragment_program %d ir for native fragment shader\n",
+ fp->Base.Id);
+ _mesa_print_program(&fp->Base);
+ }
}
/* Now the main event: Visit the shader IR and generate our FS IR for it.
*/
- c->dispatch_width = 8;
-
- fs_visitor v(c, prog, shader);
+ fs_visitor v(brw, c, prog, fp, 8);
if (!v.run()) {
prog->LinkStatus = false;
ralloc_strcat(&prog->InfoLog, v.fail_msg);
_mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
v.fail_msg);
- return false;
+ return NULL;
}
+ exec_list *simd16_instructions = NULL;
+ fs_visitor v2(brw, c, prog, fp, 16);
if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
- c->dispatch_width = 16;
- fs_visitor v2(c, prog, shader);
v2.import_uniforms(&v);
if (!v2.run()) {
perf_debug("16-wide shader failed to compile, falling back to "
"8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
+ } else {
+ simd16_instructions = &v2.instructions;
}
}
c->prog_data.dispatch_width = 8;
- if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+ fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
+ const unsigned *generated = g.generate_assembly(&v.instructions,
+ simd16_instructions,
+ final_assembly_size);
+
+ if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
if (shader->compiled_once)
brw_wm_debug_recompile(brw, prog, &c->key);
shader->compiled_once = true;
+
+ if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
+ perf_debug("FS compile took %.03f ms and stalled the GPU\n",
+ (get_time() - start_time) * 1000);
+ }
}
- return true;
+ return generated;
}
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
struct brw_context *brw = brw_context(ctx);
+ struct intel_context *intel = &brw->intel;
struct brw_wm_prog_key key;
if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
memset(&key, 0, sizeof(key));
- if (fp->UsesKill)
- key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+ if (intel->gen < 6) {
+ if (fp->UsesKill)
+ key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+ if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+ key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+
+ /* Just assume depth testing. */
+ key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
+ key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
+ }
- if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
- key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+ if (prog->Name != 0)
+ key.proj_attrib_mask = 0xffffffff;
- /* Just assume depth testing. */
- key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
- key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
+ if (intel->gen < 6)
+ key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
- key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
continue;
- key.proj_attrib_mask |= 1 << i;
+ if (prog->Name == 0)
+ key.proj_attrib_mask |= 1 << i;
- int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
+ if (intel->gen < 6) {
+ int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
- if (vp_index >= 0)
- key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
+ if (vp_index >= 0)
+ key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
+ }
}
key.clamp_fragment_color = true;
- for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
- /* FINISHME: depth compares might use (0,0,0,W) for example */
- key.tex.swizzles[i] = SWIZZLE_XYZW;
+ for (int i = 0; i < MAX_SAMPLERS; i++) {
+ if (fp->Base.ShadowSamplers & (1 << i)) {
+ /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
+ key.tex.swizzles[i] =
+ MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
+ } else {
+ /* Color sampler: assume no swizzling. */
+ key.tex.swizzles[i] = SWIZZLE_XYZW;
+ }
}
if (fp->Base.InputsRead & FRAG_BIT_WPOS) {