src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 extern "C" {
  31
  32 #include <sys/types.h>
  33
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "program/prog_parameter.h"
  37 #include "program/prog_print.h"
  38 #include "program/prog_optimize.h"
  39 #include "program/register_allocate.h"
  40 #include "program/sampler.h"
  41 #include "program/hash_table.h"
  42 #include "brw_context.h"
  43 #include "brw_eu.h"
  44 #include "brw_wm.h"
  45 }
  46 #include "brw_fs.h"
  47 #include "main/uniforms.h"
  48 #include "glsl/glsl_types.h"
  49 #include "glsl/ir_optimization.h"
  50
  51 void
  52 fs_visitor::visit(ir_variable *ir)
  53 {
  54    fs_reg *reg = NULL;
  55
  56    if (variable_storage(ir))
  57       return;
  58
  59    if (ir->mode == ir_var_shader_in) {
  60       if (!strcmp(ir->name, "gl_FragCoord")) {
  61          reg = emit_fragcoord_interpolation(ir);
  62       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
  63          reg = emit_frontfacing_interpolation(ir);
  64       } else {
  65          reg = emit_general_interpolation(ir);
  66       }
  67       assert(reg);
  68       hash_table_insert(this->variable_ht, reg, ir);
  69       return;
  70    } else if (ir->mode == ir_var_shader_out) {
  71       reg = new(this->mem_ctx) fs_reg(this, ir->type);
  72
  73       if (ir->index > 0) {
  74          assert(ir->location == FRAG_RESULT_DATA0);
  75          assert(ir->index == 1);
  76          this->dual_src_output = *reg;
  77       } else if (ir->location == FRAG_RESULT_COLOR) {
  78          /* Writing gl_FragColor outputs to all color regions. */
  79          for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
  80             this->outputs[i] = *reg;
  81             this->output_components[i] = 4;
  82          }
  83       } else if (ir->location == FRAG_RESULT_DEPTH) {
  84          this->frag_depth = *reg;
  85       } else if (ir->location == FRAG_RESULT_SAMPLE_MASK) {
  86          this->sample_mask = *reg;
  87       } else {
  88          /* gl_FragData or a user-defined FS output */
  89          assert(ir->location >= FRAG_RESULT_DATA0 &&
  90                 ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
  91
  92          int vector_elements =
  93             ir->type->is_array() ? ir->type->fields.array->vector_elements
  94                                  : ir->type->vector_elements;
  95
  96          /* General color output. */
  97          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
  98             int output = ir->location - FRAG_RESULT_DATA0 + i;
  99             this->outputs[output] = *reg;
 100             this->outputs[output].reg_offset += vector_elements * i;
 101             this->output_components[output] = vector_elements;
 102          }
 103       }
 104    } else if (ir->mode == ir_var_uniform) {
 105       int param_index = c->prog_data.nr_params;
 106
 107       /* Thanks to the lower_ubo_reference pass, we will see only
 108        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 109        * variables, so no need for them to be in variable_ht.
 110        */
 111       if (ir->is_in_uniform_block())
 112          return;
 113
 114       if (dispatch_width == 16) {
 115          if (!variable_storage(ir)) {
 116             fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
 117          }
 118          return;
 119       }
 120
 121       param_size[param_index] = type_size(ir->type);
 122       if (!strncmp(ir->name, "gl_", 3)) {
 123          setup_builtin_uniform_values(ir);
 124       } else {
 125          setup_uniform_values(ir);
 126       }
 127
 128       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 129       reg->type = brw_type_for_base_type(ir->type);
 130
 131    } else if (ir->mode == ir_var_system_value) {
 132       if (ir->location == SYSTEM_VALUE_SAMPLE_POS) {
 133          reg = emit_samplepos_setup(ir);
 134       } else if (ir->location == SYSTEM_VALUE_SAMPLE_ID) {
 135          reg = emit_sampleid_setup(ir);
 136       }
 137    }
 138
 139    if (!reg)
 140       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 141
 142    hash_table_insert(this->variable_ht, reg, ir);
 143 }
 144
 145 void
 146 fs_visitor::visit(ir_dereference_variable *ir)
 147 {
 148    fs_reg *reg = variable_storage(ir->var);
 149    this->result = *reg;
 150 }
 151
 152 void
 153 fs_visitor::visit(ir_dereference_record *ir)
 154 {
 155    const glsl_type *struct_type = ir->record->type;
 156
 157    ir->record->accept(this);
 158
 159    unsigned int offset = 0;
 160    for (unsigned int i = 0; i < struct_type->length; i++) {
 161       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 162          break;
 163       offset += type_size(struct_type->fields.structure[i].type);
 164    }
 165    this->result.reg_offset += offset;
 166    this->result.type = brw_type_for_base_type(ir->type);
 167 }
 168
 169 void
 170 fs_visitor::visit(ir_dereference_array *ir)
 171 {
 172    ir_constant *constant_index;
 173    fs_reg src;
 174    int element_size = type_size(ir->type);
 175
 176    constant_index = ir->array_index->as_constant();
 177
 178    ir->array->accept(this);
 179    src = this->result;
 180    src.type = brw_type_for_base_type(ir->type);
 181
 182    if (constant_index) {
 183       assert(src.file == UNIFORM || src.file == GRF);
 184       src.reg_offset += constant_index->value.i[0] * element_size;
 185    } else {
 186       /* Variable index array dereference.  We attach the variable index
 187        * component to the reg as a pointer to a register containing the
 188        * offset.  Currently only uniform arrays are supported in this patch,
 189        * and that reladdr pointer is resolved by
 190        * move_uniform_array_access_to_pull_constants().  All other array types
 191        * are lowered by lower_variable_index_to_cond_assign().
 192        */
 193       ir->array_index->accept(this);
 194
 195       fs_reg index_reg;
 196       index_reg = fs_reg(this, glsl_type::int_type);
 197       emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
 198
 199       if (src.reladdr) {
 200          emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
 201       }
 202
 203       src.reladdr = ralloc(mem_ctx, fs_reg);
 204       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
 205    }
 206    this->result = src;
 207 }
 208
 209 void
 210 fs_visitor::emit_lrp(fs_reg dst, fs_reg x, fs_reg y, fs_reg a)
 211 {
 212    if (brw->gen < 6 ||
 213        !x.is_valid_3src() ||
 214        !y.is_valid_3src() ||
 215        !a.is_valid_3src()) {
 216       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 217       fs_reg y_times_a           = fs_reg(this, glsl_type::float_type);
 218       fs_reg one_minus_a         = fs_reg(this, glsl_type::float_type);
 219       fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);
 220
 221       emit(MUL(y_times_a, y, a));
 222
 223       a.negate = !a.negate;
 224       emit(ADD(one_minus_a, a, fs_reg(1.0f)));
 225       emit(MUL(x_times_one_minus_a, x, one_minus_a));
 226
 227       emit(ADD(dst, x_times_one_minus_a, y_times_a));
 228    } else {
 229       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 230        * we need to reorder the operands.
 231        */
 232       emit(LRP(dst, a, y, x));
 233    }
 234 }
 235
 236 void
 237 fs_visitor::emit_minmax(uint32_t conditionalmod, fs_reg dst,
 238                         fs_reg src0, fs_reg src1)
 239 {
 240    fs_inst *inst;
 241
 242    if (brw->gen >= 6) {
 243       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 244       inst->conditional_mod = conditionalmod;
 245    } else {
 246       emit(CMP(reg_null_d, src0, src1, conditionalmod));
 247
 248       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 249       inst->predicate = BRW_PREDICATE_NORMAL;
 250    }
 251 }
 252
 253 /* Instruction selection: Produce a MOV.sat instead of
 254  * MIN(MAX(val, 0), 1) when possible.
 255  */
 256 bool
 257 fs_visitor::try_emit_saturate(ir_expression *ir)
 258 {
 259    ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
 260
 261    if (!sat_val)
 262       return false;
 263
 264    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 265
 266    sat_val->accept(this);
 267    fs_reg src = this->result;
 268
 269    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 270
 271    /* If the last instruction from our accept() didn't generate our
 272     * src, generate a saturated MOV
 273     */
 274    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 275    if (!modify || modify->regs_written != 1) {
 276       this->result = fs_reg(this, ir->type);
 277       fs_inst *inst = emit(MOV(this->result, src));
 278       inst->saturate = true;
 279    } else {
 280       modify->saturate = true;
 281       this->result = src;
 282    }
 283
 284
 285    return true;
 286 }
 287
 288 bool
 289 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
 290 {
 291    /* 3-src instructions were introduced in gen6. */
 292    if (brw->gen < 6)
 293       return false;
 294
 295    /* MAD can only handle floating-point data. */
 296    if (ir->type != glsl_type::float_type)
 297       return false;
 298
 299    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
 300    ir_expression *mul = ir->operands[mul_arg]->as_expression();
 301
 302    if (!mul || mul->operation != ir_binop_mul)
 303       return false;
 304
 305    if (nonmul->as_constant() ||
 306        mul->operands[0]->as_constant() ||
 307        mul->operands[1]->as_constant())
 308       return false;
 309
 310    nonmul->accept(this);
 311    fs_reg src0 = this->result;
 312
 313    mul->operands[0]->accept(this);
 314    fs_reg src1 = this->result;
 315
 316    mul->operands[1]->accept(this);
 317    fs_reg src2 = this->result;
 318
 319    this->result = fs_reg(this, ir->type);
 320    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 321
 322    return true;
 323 }
 324
 325 void
 326 fs_visitor::visit(ir_expression *ir)
 327 {
 328    unsigned int operand;
 329    fs_reg op[3], temp;
 330    fs_inst *inst;
 331
 332    assert(ir->get_num_operands() <= 3);
 333
 334    if (try_emit_saturate(ir))
 335       return;
 336    if (ir->operation == ir_binop_add) {
 337       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
 338          return;
 339    }
 340
 341    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 342       ir->operands[operand]->accept(this);
 343       if (this->result.file == BAD_FILE) {
 344          fail("Failed to get tree for expression operand:\n");
 345          ir->operands[operand]->print();
 346          printf("\n");
 347       }
 348       assert(this->result.is_valid_3src());
 349       op[operand] = this->result;
 350
 351       /* Matrix expression operands should have been broken down to vector
 352        * operations already.
 353        */
 354       assert(!ir->operands[operand]->type->is_matrix());
 355       /* And then those vector operands should have been broken down to scalar.
 356        */
 357       assert(!ir->operands[operand]->type->is_vector());
 358    }
 359
 360    /* Storage for our result.  If our result goes into an assignment, it will
 361     * just get copy-propagated out, so no worries.
 362     */
 363    this->result = fs_reg(this, ir->type);
 364
 365    switch (ir->operation) {
 366    case ir_unop_logic_not:
 367       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
 368        * ones complement of the whole register, not just bit 0.
 369        */
 370       emit(XOR(this->result, op[0], fs_reg(1)));
 371       break;
 372    case ir_unop_neg:
 373       op[0].negate = !op[0].negate;
 374       emit(MOV(this->result, op[0]));
 375       break;
 376    case ir_unop_abs:
 377       op[0].abs = true;
 378       op[0].negate = false;
 379       emit(MOV(this->result, op[0]));
 380       break;
 381    case ir_unop_sign:
 382       temp = fs_reg(this, ir->type);
 383
 384       emit(MOV(this->result, fs_reg(0.0f)));
 385
 386       emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_G));
 387       inst = emit(MOV(this->result, fs_reg(1.0f)));
 388       inst->predicate = BRW_PREDICATE_NORMAL;
 389
 390       emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_L));
 391       inst = emit(MOV(this->result, fs_reg(-1.0f)));
 392       inst->predicate = BRW_PREDICATE_NORMAL;
 393
 394       break;
 395    case ir_unop_rcp:
 396       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 397       break;
 398
 399    case ir_unop_exp2:
 400       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 401       break;
 402    case ir_unop_log2:
 403       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 404       break;
 405    case ir_unop_exp:
 406    case ir_unop_log:
 407       assert(!"not reached: should be handled by ir_explog_to_explog2");
 408       break;
 409    case ir_unop_sin:
 410    case ir_unop_sin_reduced:
 411       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 412       break;
 413    case ir_unop_cos:
 414    case ir_unop_cos_reduced:
 415       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 416       break;
 417
 418    case ir_unop_dFdx:
 419       emit(FS_OPCODE_DDX, this->result, op[0]);
 420       break;
 421    case ir_unop_dFdy:
 422       emit(FS_OPCODE_DDY, this->result, op[0]);
 423       break;
 424
 425    case ir_binop_add:
 426       emit(ADD(this->result, op[0], op[1]));
 427       break;
 428    case ir_binop_sub:
 429       assert(!"not reached: should be handled by ir_sub_to_add_neg");
 430       break;
 431
 432    case ir_binop_mul:
 433       if (ir->type->is_integer()) {
 434          /* For integer multiplication, the MUL uses the low 16 bits
 435           * of one of the operands (src0 on gen6, src1 on gen7).  The
 436           * MACH accumulates in the contribution of the upper 16 bits
 437           * of that operand.
 438           *
 439           * FINISHME: Emit just the MUL if we know an operand is small
 440           * enough.
 441           */
 442          if (brw->gen >= 7 && dispatch_width == 16)
 443             fail("16-wide explicit accumulator operands unsupported\n");
 444
 445          struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
 446
 447          emit(MUL(acc, op[0], op[1]));
 448          emit(MACH(reg_null_d, op[0], op[1]));
 449          emit(MOV(this->result, fs_reg(acc)));
 450       } else {
 451          emit(MUL(this->result, op[0], op[1]));
 452       }
 453       break;
 454    case ir_binop_imul_high: {
 455       if (brw->gen >= 7 && dispatch_width == 16)
 456          fail("16-wide explicit accumulator operands unsupported\n");
 457
 458       struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
 459
 460       emit(MUL(acc, op[0], op[1]));
 461       emit(MACH(this->result, op[0], op[1]));
 462       break;
 463    }
 464    case ir_binop_div:
 465       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 466       assert(ir->type->is_integer());
 467       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 468       break;
 469    case ir_binop_carry: {
 470       if (brw->gen >= 7 && dispatch_width == 16)
 471          fail("16-wide explicit accumulator operands unsupported\n");
 472
 473       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
 474
 475       emit(ADDC(reg_null_ud, op[0], op[1]));
 476       emit(MOV(this->result, fs_reg(acc)));
 477       break;
 478    }
 479    case ir_binop_borrow: {
 480       if (brw->gen >= 7 && dispatch_width == 16)
 481          fail("16-wide explicit accumulator operands unsupported\n");
 482
 483       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
 484
 485       emit(SUBB(reg_null_ud, op[0], op[1]));
 486       emit(MOV(this->result, fs_reg(acc)));
 487       break;
 488    }
 489    case ir_binop_mod:
 490       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
 491       assert(ir->type->is_integer());
 492       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 493       break;
 494
 495    case ir_binop_less:
 496    case ir_binop_greater:
 497    case ir_binop_lequal:
 498    case ir_binop_gequal:
 499    case ir_binop_equal:
 500    case ir_binop_all_equal:
 501    case ir_binop_nequal:
 502    case ir_binop_any_nequal:
 503       resolve_bool_comparison(ir->operands[0], &op[0]);
 504       resolve_bool_comparison(ir->operands[1], &op[1]);
 505
 506       emit(CMP(this->result, op[0], op[1],
 507                brw_conditional_for_comparison(ir->operation)));
 508       break;
 509
 510    case ir_binop_logic_xor:
 511       emit(XOR(this->result, op[0], op[1]));
 512       break;
 513
 514    case ir_binop_logic_or:
 515       emit(OR(this->result, op[0], op[1]));
 516       break;
 517
 518    case ir_binop_logic_and:
 519       emit(AND(this->result, op[0], op[1]));
 520       break;
 521
 522    case ir_binop_dot:
 523    case ir_unop_any:
 524       assert(!"not reached: should be handled by brw_fs_channel_expressions");
 525       break;
 526
 527    case ir_unop_noise:
 528       assert(!"not reached: should be handled by lower_noise");
 529       break;
 530
 531    case ir_quadop_vector:
 532       assert(!"not reached: should be handled by lower_quadop_vector");
 533       break;
 534
 535    case ir_binop_vector_extract:
 536       assert(!"not reached: should be handled by lower_vec_index_to_cond_assign()");
 537       break;
 538
 539    case ir_triop_vector_insert:
 540       assert(!"not reached: should be handled by lower_vector_insert()");
 541       break;
 542
 543    case ir_binop_ldexp:
 544       assert(!"not reached: should be handled by ldexp_to_arith()");
 545       break;
 546
 547    case ir_unop_sqrt:
 548       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
 549       break;
 550
 551    case ir_unop_rsq:
 552       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
 553       break;
 554
 555    case ir_unop_bitcast_i2f:
 556    case ir_unop_bitcast_u2f:
 557       op[0].type = BRW_REGISTER_TYPE_F;
 558       this->result = op[0];
 559       break;
 560    case ir_unop_i2u:
 561    case ir_unop_bitcast_f2u:
 562       op[0].type = BRW_REGISTER_TYPE_UD;
 563       this->result = op[0];
 564       break;
 565    case ir_unop_u2i:
 566    case ir_unop_bitcast_f2i:
 567       op[0].type = BRW_REGISTER_TYPE_D;
 568       this->result = op[0];
 569       break;
 570    case ir_unop_i2f:
 571    case ir_unop_u2f:
 572    case ir_unop_f2i:
 573    case ir_unop_f2u:
 574       emit(MOV(this->result, op[0]));
 575       break;
 576
 577    case ir_unop_b2i:
 578       emit(AND(this->result, op[0], fs_reg(1)));
 579       break;
 580    case ir_unop_b2f:
 581       temp = fs_reg(this, glsl_type::int_type);
 582       emit(AND(temp, op[0], fs_reg(1)));
 583       emit(MOV(this->result, temp));
 584       break;
 585
 586    case ir_unop_f2b:
 587       emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 588       break;
 589    case ir_unop_i2b:
 590       emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 591       break;
 592
 593    case ir_unop_trunc:
 594       emit(RNDZ(this->result, op[0]));
 595       break;
 596    case ir_unop_ceil:
 597       op[0].negate = !op[0].negate;
 598       emit(RNDD(this->result, op[0]));
 599       this->result.negate = true;
 600       break;
 601    case ir_unop_floor:
 602       emit(RNDD(this->result, op[0]));
 603       break;
 604    case ir_unop_fract:
 605       emit(FRC(this->result, op[0]));
 606       break;
 607    case ir_unop_round_even:
 608       emit(RNDE(this->result, op[0]));
 609       break;
 610
 611    case ir_binop_min:
 612    case ir_binop_max:
 613       resolve_ud_negate(&op[0]);
 614       resolve_ud_negate(&op[1]);
 615       emit_minmax(ir->operation == ir_binop_min ?
 616                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
 617                   this->result, op[0], op[1]);
 618       break;
 619    case ir_unop_pack_snorm_2x16:
 620    case ir_unop_pack_snorm_4x8:
 621    case ir_unop_pack_unorm_2x16:
 622    case ir_unop_pack_unorm_4x8:
 623    case ir_unop_unpack_snorm_2x16:
 624    case ir_unop_unpack_snorm_4x8:
 625    case ir_unop_unpack_unorm_2x16:
 626    case ir_unop_unpack_unorm_4x8:
 627    case ir_unop_unpack_half_2x16:
 628    case ir_unop_pack_half_2x16:
 629       assert(!"not reached: should be handled by lower_packing_builtins");
 630       break;
 631    case ir_unop_unpack_half_2x16_split_x:
 632       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
 633       break;
 634    case ir_unop_unpack_half_2x16_split_y:
 635       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
 636       break;
 637    case ir_binop_pow:
 638       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
 639       break;
 640
 641    case ir_unop_bitfield_reverse:
 642       emit(BFREV(this->result, op[0]));
 643       break;
 644    case ir_unop_bit_count:
 645       emit(CBIT(this->result, op[0]));
 646       break;
 647    case ir_unop_find_msb:
 648       temp = fs_reg(this, glsl_type::uint_type);
 649       emit(FBH(temp, op[0]));
 650
 651       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
 652        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
 653        * subtract the result from 31 to convert the MSB count into an LSB count.
 654        */
 655
 656       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
 657       emit(MOV(this->result, temp));
 658       emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
 659
 660       temp.negate = true;
 661       inst = emit(ADD(this->result, temp, fs_reg(31)));
 662       inst->predicate = BRW_PREDICATE_NORMAL;
 663       break;
 664    case ir_unop_find_lsb:
 665       emit(FBL(this->result, op[0]));
 666       break;
 667    case ir_triop_bitfield_extract:
 668       /* Note that the instruction's argument order is reversed from GLSL
 669        * and the IR.
 670        */
 671       emit(BFE(this->result, op[2], op[1], op[0]));
 672       break;
 673    case ir_binop_bfm:
 674       emit(BFI1(this->result, op[0], op[1]));
 675       break;
 676    case ir_triop_bfi:
 677       emit(BFI2(this->result, op[0], op[1], op[2]));
 678       break;
 679    case ir_quadop_bitfield_insert:
 680       assert(!"not reached: should be handled by "
 681               "lower_instructions::bitfield_insert_to_bfm_bfi");
 682       break;
 683
 684    case ir_unop_bit_not:
 685       emit(NOT(this->result, op[0]));
 686       break;
 687    case ir_binop_bit_and:
 688       emit(AND(this->result, op[0], op[1]));
 689       break;
 690    case ir_binop_bit_xor:
 691       emit(XOR(this->result, op[0], op[1]));
 692       break;
 693    case ir_binop_bit_or:
 694       emit(OR(this->result, op[0], op[1]));
 695       break;
 696
 697    case ir_binop_lshift:
 698       emit(SHL(this->result, op[0], op[1]));
 699       break;
 700
 701    case ir_binop_rshift:
 702       if (ir->type->base_type == GLSL_TYPE_INT)
 703          emit(ASR(this->result, op[0], op[1]));
 704       else
 705          emit(SHR(this->result, op[0], op[1]));
 706       break;
 707    case ir_binop_pack_half_2x16_split:
 708       emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
 709       break;
 710    case ir_binop_ubo_load: {
 711       /* This IR node takes a constant uniform block and a constant or
 712        * variable byte offset within the block and loads a vector from that.
 713        */
 714       ir_constant *uniform_block = ir->operands[0]->as_constant();
 715       ir_constant *const_offset = ir->operands[1]->as_constant();
 716       fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.ubo_start +
 717                                  uniform_block->value.u[0]);
 718       if (const_offset) {
 719          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
 720          packed_consts.type = result.type;
 721
 722          fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
 723          emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
 724                       packed_consts, surf_index, const_offset_reg));
 725
 726          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
 727          for (int i = 0; i < ir->type->vector_elements; i++) {
 728             /* UBO bools are any nonzero value.  We consider bools to be
 729              * values with the low bit set to 1.  Convert them using CMP.
 730              */
 731             if (ir->type->base_type == GLSL_TYPE_BOOL) {
 732                emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
 733             } else {
 734                emit(MOV(result, packed_consts));
 735             }
 736
 737             packed_consts.smear++;
 738             result.reg_offset++;
 739
 740             /* The std140 packing rules don't allow vectors to cross 16-byte
 741              * boundaries, and a reg is 32 bytes.
 742              */
 743             assert(packed_consts.smear < 8);
 744          }
 745       } else {
 746          /* Turn the byte offset into a dword offset. */
 747          fs_reg base_offset = fs_reg(this, glsl_type::int_type);
 748          emit(SHR(base_offset, op[1], fs_reg(2)));
 749
 750          for (int i = 0; i < ir->type->vector_elements; i++) {
 751             emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
 752                                             base_offset, i));
 753
 754             if (ir->type->base_type == GLSL_TYPE_BOOL)
 755                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
 756
 757             result.reg_offset++;
 758          }
 759       }
 760
 761       result.reg_offset = 0;
 762       break;
 763    }
 764
 765    case ir_triop_fma:
 766       /* Note that the instruction's argument order is reversed from GLSL
 767        * and the IR.
 768        */
 769       emit(MAD(this->result, op[2], op[1], op[0]));
 770       break;
 771
 772    case ir_triop_lrp:
 773       emit_lrp(this->result, op[0], op[1], op[2]);
 774       break;
 775
 776    case ir_triop_csel:
 777       emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 778       inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]);
 779       inst->predicate = BRW_PREDICATE_NORMAL;
 780       break;
 781    }
 782 }
 783
 784 void
 785 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
 786                                    const glsl_type *type, bool predicated)
 787 {
 788    switch (type->base_type) {
 789    case GLSL_TYPE_FLOAT:
 790    case GLSL_TYPE_UINT:
 791    case GLSL_TYPE_INT:
 792    case GLSL_TYPE_BOOL:
 793       for (unsigned int i = 0; i < type->components(); i++) {
 794          l.type = brw_type_for_base_type(type);
 795          r.type = brw_type_for_base_type(type);
 796
 797          if (predicated || !l.equals(r)) {
 798             fs_inst *inst = emit(MOV(l, r));
 799             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
 800          }
 801
 802          l.reg_offset++;
 803          r.reg_offset++;
 804       }
 805       break;
 806    case GLSL_TYPE_ARRAY:
 807       for (unsigned int i = 0; i < type->length; i++) {
 808          emit_assignment_writes(l, r, type->fields.array, predicated);
 809       }
 810       break;
 811
 812    case GLSL_TYPE_STRUCT:
 813       for (unsigned int i = 0; i < type->length; i++) {
 814          emit_assignment_writes(l, r, type->fields.structure[i].type,
 815                                 predicated);
 816       }
 817       break;
 818
 819    case GLSL_TYPE_SAMPLER:
 820    case GLSL_TYPE_ATOMIC_UINT:
 821       break;
 822
 823    case GLSL_TYPE_VOID:
 824    case GLSL_TYPE_ERROR:
 825    case GLSL_TYPE_INTERFACE:
 826       assert(!"not reached");
 827       break;
 828    }
 829 }
 830
 831 /* If the RHS processing resulted in an instruction generating a
 832  * temporary value, and it would be easy to rewrite the instruction to
 833  * generate its result right into the LHS instead, do so.  This ends
 834  * up reliably removing instructions where it can be tricky to do so
 835  * later without real UD chain information.
 836  */
 837 bool
 838 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
 839                                    fs_reg dst,
 840                                    fs_reg src,
 841                                    fs_inst *pre_rhs_inst,
 842                                    fs_inst *last_rhs_inst)
 843 {
 844    /* Only attempt if we're doing a direct assignment. */
 845    if (ir->condition ||
 846        !(ir->lhs->type->is_scalar() ||
 847         (ir->lhs->type->is_vector() &&
 848          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
 849       return false;
 850
 851    /* Make sure the last instruction generated our source reg. */
 852    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
 853                                                     last_rhs_inst,
 854                                                     src);
 855    if (!modify)
 856       return false;
 857
 858    /* If last_rhs_inst wrote a different number of components than our LHS,
 859     * we can't safely rewrite it.
 860     */
 861    if (virtual_grf_sizes[dst.reg] != modify->regs_written)
 862       return false;
 863
 864    /* Success!  Rewrite the instruction. */
 865    modify->dst = dst;
 866
 867    return true;
 868 }
 869
 870 void
 871 fs_visitor::visit(ir_assignment *ir)
 872 {
 873    fs_reg l, r;
 874    fs_inst *inst;
 875
 876    /* FINISHME: arrays on the lhs */
 877    ir->lhs->accept(this);
 878    l = this->result;
 879
 880    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
 881
 882    ir->rhs->accept(this);
 883    r = this->result;
 884
 885    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
 886
 887    assert(l.file != BAD_FILE);
 888    assert(r.file != BAD_FILE);
 889
 890    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
 891       return;
 892
 893    if (ir->condition) {
 894       emit_bool_to_cond_code(ir->condition);
 895    }
 896
 897    if (ir->lhs->type->is_scalar() ||
 898        ir->lhs->type->is_vector()) {
 899       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
 900          if (ir->write_mask & (1 << i)) {
 901             inst = emit(MOV(l, r));
 902             if (ir->condition)
 903                inst->predicate = BRW_PREDICATE_NORMAL;
 904             r.reg_offset++;
 905          }
 906          l.reg_offset++;
 907       }
 908    } else {
 909       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
 910    }
 911 }
 912
 913 fs_inst *
 914 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
 915                               fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
 916 {
 917    int mlen;
 918    int base_mrf = 1;
 919    bool simd16 = false;
 920    fs_reg orig_dst;
 921
 922    /* g0 header. */
 923    mlen = 1;
 924
 925    if (ir->shadow_comparitor) {
 926       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 927          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
 928          coordinate.reg_offset++;
 929       }
 930
 931       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
 932        * the unused slots must be zeroed.
 933        */
 934       for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
 935          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
 936       }
 937       mlen += 3;
 938
 939       if (ir->op == ir_tex) {
 940          /* There's no plain shadow compare message, so we use shadow
 941           * compare with a bias of 0.0.
 942           */
 943          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
 944          mlen++;
 945       } else if (ir->op == ir_txb || ir->op == ir_txl) {
 946          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
 947          mlen++;
 948       } else {
 949          assert(!"Should not get here.");
 950       }
 951
 952       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
 953       mlen++;
 954    } else if (ir->op == ir_tex) {
 955       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 956          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
 957          coordinate.reg_offset++;
 958       }
 959       /* zero the others. */
 960       for (int i = ir->coordinate->type->vector_elements; i<3; i++) {
 961          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
 962       }
 963       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
 964       mlen += 3;
 965    } else if (ir->op == ir_txd) {
 966       fs_reg &dPdx = lod;
 967
 968       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 969          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
 970          coordinate.reg_offset++;
 971       }
 972       /* the slots for u and v are always present, but r is optional */
 973       mlen += MAX2(ir->coordinate->type->vector_elements, 2);
 974
 975       /*  P   = u, v, r
 976        * dPdx = dudx, dvdx, drdx
 977        * dPdy = dudy, dvdy, drdy
 978        *
 979        * 1-arg: Does not exist.
 980        *
 981        * 2-arg: dudx   dvdx   dudy   dvdy
 982        *        dPdx.x dPdx.y dPdy.x dPdy.y
 983        *        m4     m5     m6     m7
 984        *
 985        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
 986        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
 987        *        m5     m6     m7     m8     m9     m10
 988        */
 989       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
 990          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
 991          dPdx.reg_offset++;
 992       }
 993       mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
 994
 995       for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
 996          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
 997          dPdy.reg_offset++;
 998       }
 999       mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
1000    } else if (ir->op == ir_txs) {
1001       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
1002       simd16 = true;
1003       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1004       mlen += 2;
1005    } else {
1006       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1007        * instructions.  We'll need to do SIMD16 here.
1008        */
1009       simd16 = true;
1010       assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
1011
1012       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1013          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1014                   coordinate));
1015          coordinate.reg_offset++;
1016       }
1017
1018       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
1019        * be necessary for TXF (ld), but seems wise to do for all messages.
1020        */
1021       for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
1022          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1023       }
1024
1025       /* lod/bias appears after u/v/r. */
1026       mlen += 6;
1027
1028       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1029       mlen++;
1030
1031       /* The unused upper half. */
1032       mlen++;
1033    }
1034
1035    if (simd16) {
1036       /* Now, since we're doing simd16, the return is 2 interleaved
1037        * vec4s where the odd-indexed ones are junk. We'll need to move
1038        * this weirdness around to the expected layout.
1039        */
1040       orig_dst = dst;
1041       dst = fs_reg(GRF, virtual_grf_alloc(8),
1042                    (brw->is_g4x ?
1043                     brw_type_for_base_type(ir->type) :
1044                     BRW_REGISTER_TYPE_F));
1045    }
1046
1047    fs_inst *inst = NULL;
1048    switch (ir->op) {
1049    case ir_tex:
1050       inst = emit(SHADER_OPCODE_TEX, dst);
1051       break;
1052    case ir_txb:
1053       inst = emit(FS_OPCODE_TXB, dst);
1054       break;
1055    case ir_txl:
1056       inst = emit(SHADER_OPCODE_TXL, dst);
1057       break;
1058    case ir_txd:
1059       inst = emit(SHADER_OPCODE_TXD, dst);
1060       break;
1061    case ir_txs:
1062       inst = emit(SHADER_OPCODE_TXS, dst);
1063       break;
1064    case ir_txf:
1065       inst = emit(SHADER_OPCODE_TXF, dst);
1066       break;
1067    default:
1068       fail("unrecognized texture opcode");
1069    }
1070    inst->base_mrf = base_mrf;
1071    inst->mlen = mlen;
1072    inst->header_present = true;
1073    inst->regs_written = simd16 ? 8 : 4;
1074
1075    if (simd16) {
1076       for (int i = 0; i < 4; i++) {
1077          emit(MOV(orig_dst, dst));
1078          orig_dst.reg_offset++;
1079          dst.reg_offset += 2;
1080       }
1081    }
1082
1083    return inst;
1084 }
1085
1086 /* gen5's sampler has slots for u, v, r, array index, then optional
1087  * parameters like shadow comparitor or LOD bias.  If optional
1088  * parameters aren't present, those base slots are optional and don't
1089  * need to be included in the message.
1090  *
1091  * We don't fill in the unnecessary slots regardless, which may look
1092  * surprising in the disassembly.
1093  */
1094 fs_inst *
1095 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1096                               fs_reg shadow_c, fs_reg lod, fs_reg lod2,
1097                               fs_reg sample_index)
1098 {
1099    int mlen = 0;
1100    int base_mrf = 2;
1101    int reg_width = dispatch_width / 8;
1102    bool header_present = false;
1103    const int vector_elements =
1104       ir->coordinate ? ir->coordinate->type->vector_elements : 0;
1105
1106    if (ir->offset) {
1107       /* The offsets set up by the ir_texture visitor are in the
1108        * m1 header, so we can't go headerless.
1109        */
1110       header_present = true;
1111       mlen++;
1112       base_mrf--;
1113    }
1114
1115    for (int i = 0; i < vector_elements; i++) {
1116       emit(MOV(fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
1117                coordinate));
1118       coordinate.reg_offset++;
1119    }
1120    mlen += vector_elements * reg_width;
1121
1122    if (ir->shadow_comparitor) {
1123       mlen = MAX2(mlen, header_present + 4 * reg_width);
1124
1125       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1126       mlen += reg_width;
1127    }
1128
1129    fs_inst *inst = NULL;
1130    switch (ir->op) {
1131    case ir_tex:
1132       inst = emit(SHADER_OPCODE_TEX, dst);
1133       break;
1134    case ir_txb:
1135       mlen = MAX2(mlen, header_present + 4 * reg_width);
1136       emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1137       mlen += reg_width;
1138
1139       inst = emit(FS_OPCODE_TXB, dst);
1140       break;
1141    case ir_txl:
1142       mlen = MAX2(mlen, header_present + 4 * reg_width);
1143       emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1144       mlen += reg_width;
1145
1146       inst = emit(SHADER_OPCODE_TXL, dst);
1147       break;
1148    case ir_txd: {
1149       mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
1150
1151       /**
1152        *  P   =  u,    v,    r
1153        * dPdx = dudx, dvdx, drdx
1154        * dPdy = dudy, dvdy, drdy
1155        *
1156        * Load up these values:
1157        * - dudx   dudy   dvdx   dvdy   drdx   drdy
1158        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1159        */
1160       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
1161          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1162          lod.reg_offset++;
1163          mlen += reg_width;
1164
1165          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
1166          lod2.reg_offset++;
1167          mlen += reg_width;
1168       }
1169
1170       inst = emit(SHADER_OPCODE_TXD, dst);
1171       break;
1172    }
1173    case ir_txs:
1174       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1175       mlen += reg_width;
1176       inst = emit(SHADER_OPCODE_TXS, dst);
1177       break;
1178    case ir_query_levels:
1179       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1180       mlen += reg_width;
1181       inst = emit(SHADER_OPCODE_TXS, dst);
1182       break;
1183    case ir_txf:
1184       mlen = header_present + 4 * reg_width;
1185       emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD), lod));
1186       inst = emit(SHADER_OPCODE_TXF, dst);
1187       break;
1188    case ir_txf_ms:
1189       mlen = header_present + 4 * reg_width;
1190
1191       /* lod */
1192       emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD), fs_reg(0)));
1193       /* sample index */
1194       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), sample_index));
1195       mlen += reg_width;
1196       inst = emit(SHADER_OPCODE_TXF_MS, dst);
1197       break;
1198    case ir_lod:
1199       inst = emit(SHADER_OPCODE_LOD, dst);
1200       break;
1201    case ir_tg4:
1202       inst = emit(SHADER_OPCODE_TG4, dst);
1203       break;
1204    default:
1205       fail("unrecognized texture opcode");
1206       break;
1207    }
1208    inst->base_mrf = base_mrf;
1209    inst->mlen = mlen;
1210    inst->header_present = header_present;
1211    inst->regs_written = 4;
1212
1213    if (mlen > 11) {
1214       fail("Message length >11 disallowed by hardware\n");
1215    }
1216
1217    return inst;
1218 }
1219
1220 fs_inst *
1221 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1222                               fs_reg shadow_c, fs_reg lod, fs_reg lod2,
1223                               fs_reg sample_index)
1224 {
1225    int reg_width = dispatch_width / 8;
1226    bool header_present = false;
1227
1228    fs_reg payload = fs_reg(this, glsl_type::float_type);
1229    fs_reg next = payload;
1230
1231    if (ir->op == ir_tg4 || (ir->offset && ir->op != ir_txf)) {
1232       /* For general texture offsets (no txf workaround), we need a header to
1233        * put them in.  Note that for 16-wide we're making space for two actual
1234        * hardware registers here, so the emit will have to fix up for this.
1235        *
1236        * * ir4_tg4 needs to place its channel select in the header,
1237        * for interaction with ARB_texture_swizzle
1238        */
1239       header_present = true;
1240       next.reg_offset++;
1241    }
1242
1243    if (ir->shadow_comparitor) {
1244       emit(MOV(next, shadow_c));
1245       next.reg_offset++;
1246    }
1247
1248    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
1249    bool coordinate_done = false;
1250
1251    /* Set up the LOD info */
1252    switch (ir->op) {
1253    case ir_tex:
1254    case ir_lod:
1255       break;
1256    case ir_txb:
1257       emit(MOV(next, lod));
1258       next.reg_offset++;
1259       break;
1260    case ir_txl:
1261       emit(MOV(next, lod));
1262       next.reg_offset++;
1263       break;
1264    case ir_txd: {
1265       if (dispatch_width == 16)
1266          fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1267
1268       /* Load dPdx and the coordinate together:
1269        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1270        */
1271       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1272          emit(MOV(next, coordinate));
1273          coordinate.reg_offset++;
1274          next.reg_offset++;
1275
1276          /* For cube map array, the coordinate is (u,v,r,ai) but there are
1277           * only derivatives for (u, v, r).
1278           */
1279          if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
1280             emit(MOV(next, lod));
1281             lod.reg_offset++;
1282             next.reg_offset++;
1283
1284             emit(MOV(next, lod2));
1285             lod2.reg_offset++;
1286             next.reg_offset++;
1287          }
1288       }
1289
1290       coordinate_done = true;
1291       break;
1292    }
1293    case ir_txs:
1294       emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), lod));
1295       next.reg_offset++;
1296       break;
1297    case ir_query_levels:
1298       emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1299       next.reg_offset++;
1300       break;
1301    case ir_txf:
1302       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1303       emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
1304       coordinate.reg_offset++;
1305       next.reg_offset++;
1306
1307       emit(MOV(next.retype(BRW_REGISTER_TYPE_D), lod));
1308       next.reg_offset++;
1309
1310       for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
1311          emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
1312          coordinate.reg_offset++;
1313          next.reg_offset++;
1314       }
1315
1316       coordinate_done = true;
1317       break;
1318    case ir_txf_ms:
1319       emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), sample_index));
1320       next.reg_offset++;
1321
1322       /* constant zero MCS; we arrange to never actually have a compressed
1323        * multisample surface here for now. TODO: issue ld_mcs to get this first,
1324        * if we ever support texturing from compressed multisample surfaces
1325        */
1326       emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1327       next.reg_offset++;
1328
1329       /* there is no offsetting for this message; just copy in the integer
1330        * texture coordinates
1331        */
1332       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1333          emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
1334          coordinate.reg_offset++;
1335          next.reg_offset++;
1336       }
1337
1338       coordinate_done = true;
1339       break;
1340    case ir_tg4:
1341       if (has_nonconstant_offset) {
1342          if (ir->shadow_comparitor && dispatch_width == 16)
1343             fail("Gen7 does not support gather4_po_c in SIMD16 mode.");
1344
1345          /* More crazy intermixing */
1346          ir->offset->accept(this);
1347          fs_reg offset_value = this->result;
1348
1349          for (int i = 0; i < 2; i++) { /* u, v */
1350             emit(MOV(next, coordinate));
1351             coordinate.reg_offset++;
1352             next.reg_offset++;
1353          }
1354
1355          for (int i = 0; i < 2; i++) { /* offu, offv */
1356             emit(MOV(next.retype(BRW_REGISTER_TYPE_D), offset_value));
1357             offset_value.reg_offset++;
1358             next.reg_offset++;
1359          }
1360
1361          if (ir->coordinate->type->vector_elements == 3) { /* r if present */
1362             emit(MOV(next, coordinate));
1363             coordinate.reg_offset++;
1364             next.reg_offset++;
1365          }
1366
1367          coordinate_done = true;
1368       }
1369       break;
1370    }
1371
1372    /* Set up the coordinate (except for cases where it was done above) */
1373    if (ir->coordinate && !coordinate_done) {
1374       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1375          emit(MOV(next, coordinate));
1376          coordinate.reg_offset++;
1377          next.reg_offset++;
1378       }
1379    }
1380
1381    /* Generate the SEND */
1382    fs_inst *inst = NULL;
1383    switch (ir->op) {
1384    case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst, payload); break;
1385    case ir_txb: inst = emit(FS_OPCODE_TXB, dst, payload); break;
1386    case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst, payload); break;
1387    case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst, payload); break;
1388    case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst, payload); break;
1389    case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst, payload); break;
1390    case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
1391    case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
1392    case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst, payload); break;
1393    case ir_tg4:
1394       if (has_nonconstant_offset)
1395          inst = emit(SHADER_OPCODE_TG4_OFFSET, dst, payload);
1396       else
1397          inst = emit(SHADER_OPCODE_TG4, dst, payload);
1398       break;
1399    }
1400    inst->base_mrf = -1;
1401    if (reg_width == 2)
1402       inst->mlen = next.reg_offset * reg_width - header_present;
1403    else
1404       inst->mlen = next.reg_offset * reg_width;
1405    inst->header_present = header_present;
1406    inst->regs_written = 4;
1407
1408    virtual_grf_sizes[payload.reg] = next.reg_offset;
1409    if (inst->mlen > 11) {
1410       fail("Message length >11 disallowed by hardware\n");
1411    }
1412
1413    return inst;
1414 }
1415
1416 fs_reg
1417 fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
1418                              bool is_rect, int sampler, int texunit)
1419 {
1420    fs_inst *inst = NULL;
1421    bool needs_gl_clamp = true;
1422    fs_reg scale_x, scale_y;
1423
1424    /* The 965 requires the EU to do the normalization of GL rectangle
1425     * texture coordinates.  We use the program parameter state
1426     * tracking to get the scaling factor.
1427     */
1428    if (is_rect &&
1429        (brw->gen < 6 ||
1430         (brw->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
1431                              c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
1432       struct gl_program_parameter_list *params = prog->Parameters;
1433       int tokens[STATE_LENGTH] = {
1434          STATE_INTERNAL,
1435          STATE_TEXRECT_SCALE,
1436          texunit,
1437          0,
1438          0
1439       };
1440
1441       if (dispatch_width == 16) {
1442          fail("rectangle scale uniform setup not supported on 16-wide\n");
1443          return coordinate;
1444       }
1445
1446       scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1447       scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1448
1449       GLuint index = _mesa_add_state_reference(params,
1450                                                (gl_state_index *)tokens);
1451       c->prog_data.param[c->prog_data.nr_params++] =
1452          &prog->Parameters->ParameterValues[index][0].f;
1453       c->prog_data.param[c->prog_data.nr_params++] =
1454          &prog->Parameters->ParameterValues[index][1].f;
1455    }
1456
1457    /* The 965 requires the EU to do the normalization of GL rectangle
1458     * texture coordinates.  We use the program parameter state
1459     * tracking to get the scaling factor.
1460     */
1461    if (brw->gen < 6 && is_rect) {
1462       fs_reg dst = fs_reg(this, ir->coordinate->type);
1463       fs_reg src = coordinate;
1464       coordinate = dst;
1465
1466       emit(MUL(dst, src, scale_x));
1467       dst.reg_offset++;
1468       src.reg_offset++;
1469       emit(MUL(dst, src, scale_y));
1470    } else if (is_rect) {
1471       /* On gen6+, the sampler handles the rectangle coordinates
1472        * natively, without needing rescaling.  But that means we have
1473        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1474        * not [0, 1] like the default case below.
1475        */
1476       needs_gl_clamp = false;
1477
1478       for (int i = 0; i < 2; i++) {
1479          if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1480             fs_reg chan = coordinate;
1481             chan.reg_offset += i;
1482
1483             inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
1484             inst->conditional_mod = BRW_CONDITIONAL_G;
1485
1486             /* Our parameter comes in as 1.0/width or 1.0/height,
1487              * because that's what people normally want for doing
1488              * texture rectangle handling.  We need width or height
1489              * for clamping, but we don't care enough to make a new
1490              * parameter type, so just invert back.
1491              */
1492             fs_reg limit = fs_reg(this, glsl_type::float_type);
1493             emit(MOV(limit, i == 0 ? scale_x : scale_y));
1494             emit(SHADER_OPCODE_RCP, limit, limit);
1495
1496             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1497             inst->conditional_mod = BRW_CONDITIONAL_L;
1498          }
1499       }
1500    }
1501
1502    if (ir->coordinate && needs_gl_clamp) {
1503       for (unsigned int i = 0;
1504            i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
1505          if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1506             fs_reg chan = coordinate;
1507             chan.reg_offset += i;
1508
1509             fs_inst *inst = emit(MOV(chan, chan));
1510             inst->saturate = true;
1511          }
1512       }
1513    }
1514    return coordinate;
1515 }
1516
1517 void
1518 fs_visitor::visit(ir_texture *ir)
1519 {
1520    fs_inst *inst = NULL;
1521
1522    int sampler =
1523       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
1524    /* FINISHME: We're failing to recompile our programs when the sampler is
1525     * updated.  This only matters for the texture rectangle scale parameters
1526     * (pre-gen6, or gen6+ with GL_CLAMP).
1527     */
1528    int texunit = prog->SamplerUnits[sampler];
1529
1530    if (ir->op == ir_tg4) {
1531       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
1532        * emitting anything other than setting up the constant result.
1533        */
1534       ir_constant *chan = ir->lod_info.component->as_constant();
1535       int swiz = GET_SWZ(c->key.tex.swizzles[sampler], chan->value.i[0]);
1536       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
1537
1538          fs_reg res = fs_reg(this, glsl_type::vec4_type);
1539          this->result = res;
1540
1541          for (int i=0; i<4; i++) {
1542             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
1543             res.reg_offset++;
1544          }
1545          return;
1546       }
1547    }
1548
1549    /* Should be lowered by do_lower_texture_projection */
1550    assert(!ir->projector);
1551
1552    /* Should be lowered */
1553    assert(!ir->offset || !ir->offset->type->is_array());
1554
1555    /* Generate code to compute all the subexpression trees.  This has to be
1556     * done before loading any values into MRFs for the sampler message since
1557     * generating these values may involve SEND messages that need the MRFs.
1558     */
1559    fs_reg coordinate;
1560    if (ir->coordinate) {
1561       ir->coordinate->accept(this);
1562
1563       coordinate = rescale_texcoord(ir, this->result,
1564                                     ir->sampler->type->sampler_dimensionality ==
1565                                     GLSL_SAMPLER_DIM_RECT,
1566                                     sampler, texunit);
1567    }
1568
1569    fs_reg shadow_comparitor;
1570    if (ir->shadow_comparitor) {
1571       ir->shadow_comparitor->accept(this);
1572       shadow_comparitor = this->result;
1573    }
1574
1575    fs_reg lod, lod2, sample_index;
1576    switch (ir->op) {
1577    case ir_tex:
1578    case ir_lod:
1579    case ir_tg4:
1580    case ir_query_levels:
1581       break;
1582    case ir_txb:
1583       ir->lod_info.bias->accept(this);
1584       lod = this->result;
1585       break;
1586    case ir_txd:
1587       ir->lod_info.grad.dPdx->accept(this);
1588       lod = this->result;
1589
1590       ir->lod_info.grad.dPdy->accept(this);
1591       lod2 = this->result;
1592       break;
1593    case ir_txf:
1594    case ir_txl:
1595    case ir_txs:
1596       ir->lod_info.lod->accept(this);
1597       lod = this->result;
1598       break;
1599    case ir_txf_ms:
1600       ir->lod_info.sample_index->accept(this);
1601       sample_index = this->result;
1602       break;
1603    default:
1604       assert(!"Unrecognized texture opcode");
1605    };
1606
1607    /* Writemasking doesn't eliminate channels on SIMD8 texture
1608     * samples, so don't worry about them.
1609     */
1610    fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1611
1612    if (brw->gen >= 7) {
1613       inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1614                                lod, lod2, sample_index);
1615    } else if (brw->gen >= 5) {
1616       inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1617                                lod, lod2, sample_index);
1618    } else {
1619       inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1620                                lod, lod2);
1621    }
1622
1623    if (ir->offset != NULL && ir->op != ir_txf)
1624       inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
1625
1626    if (ir->op == ir_tg4)
1627       inst->texture_offset |= gather_channel(ir, sampler) << 16; // M0.2:16-17
1628
1629    inst->sampler = sampler;
1630
1631    if (ir->shadow_comparitor)
1632       inst->shadow_compare = true;
1633
1634    /* fixup #layers for cube map arrays */
1635    if (ir->op == ir_txs) {
1636       glsl_type const *type = ir->sampler->type;
1637       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
1638           type->sampler_array) {
1639          fs_reg depth = dst;
1640          depth.reg_offset = 2;
1641          emit_math(SHADER_OPCODE_INT_QUOTIENT, depth, depth, fs_reg(6));
1642       }
1643    }
1644
1645    swizzle_result(ir, dst, sampler);
1646 }
1647
1648 /**
1649  * Set up the gather channel based on the swizzle, for gather4.
1650  */
1651 uint32_t
1652 fs_visitor::gather_channel(ir_texture *ir, int sampler)
1653 {
1654    ir_constant *chan = ir->lod_info.component->as_constant();
1655    int swiz = GET_SWZ(c->key.tex.swizzles[sampler], chan->value.i[0]);
1656    switch (swiz) {
1657       case SWIZZLE_X: return 0;
1658       case SWIZZLE_Y:
1659          /* gather4 sampler is broken for green channel on RG32F --
1660           * we must ask for blue instead.
1661           */
1662          if (c->key.tex.gather_channel_quirk_mask & (1<<sampler))
1663             return 2;
1664          return 1;
1665       case SWIZZLE_Z: return 2;
1666       case SWIZZLE_W: return 3;
1667       default:
1668          assert(!"Not reached"); /* zero, one swizzles handled already */
1669          return 0;
1670    }
1671 }
1672
1673 /**
1674  * Swizzle the result of a texture result.  This is necessary for
1675  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
1676  */
1677 void
1678 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
1679 {
1680    if (ir->op == ir_query_levels) {
1681       /* # levels is in .w */
1682       orig_val.reg_offset += 3;
1683       this->result = orig_val;
1684       return;
1685    }
1686
1687    this->result = orig_val;
1688
1689    /* txs,lod don't actually sample the texture, so swizzling the result
1690     * makes no sense.
1691     */
1692    if (ir->op == ir_txs || ir->op == ir_lod || ir->op == ir_tg4)
1693       return;
1694
1695    if (ir->type == glsl_type::float_type) {
1696       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1697       assert(ir->sampler->type->sampler_shadow);
1698    } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
1699       fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
1700
1701       for (int i = 0; i < 4; i++) {
1702          int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
1703          fs_reg l = swizzled_result;
1704          l.reg_offset += i;
1705
1706          if (swiz == SWIZZLE_ZERO) {
1707             emit(MOV(l, fs_reg(0.0f)));
1708          } else if (swiz == SWIZZLE_ONE) {
1709             emit(MOV(l, fs_reg(1.0f)));
1710          } else {
1711             fs_reg r = orig_val;
1712             r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
1713             emit(MOV(l, r));
1714          }
1715       }
1716       this->result = swizzled_result;
1717    }
1718 }
1719
1720 void
1721 fs_visitor::visit(ir_swizzle *ir)
1722 {
1723    ir->val->accept(this);
1724    fs_reg val = this->result;
1725
1726    if (ir->type->vector_elements == 1) {
1727       this->result.reg_offset += ir->mask.x;
1728       return;
1729    }
1730
1731    fs_reg result = fs_reg(this, ir->type);
1732    this->result = result;
1733
1734    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1735       fs_reg channel = val;
1736       int swiz = 0;
1737
1738       switch (i) {
1739       case 0:
1740          swiz = ir->mask.x;
1741          break;
1742       case 1:
1743          swiz = ir->mask.y;
1744          break;
1745       case 2:
1746          swiz = ir->mask.z;
1747          break;
1748       case 3:
1749          swiz = ir->mask.w;
1750          break;
1751       }
1752
1753       channel.reg_offset += swiz;
1754       emit(MOV(result, channel));
1755       result.reg_offset++;
1756    }
1757 }
1758
1759 void
1760 fs_visitor::visit(ir_discard *ir)
1761 {
1762    assert(ir->condition == NULL); /* FINISHME */
1763
1764    /* We track our discarded pixels in f0.1.  By predicating on it, we can
1765     * update just the flag bits that aren't yet discarded.  By emitting a
1766     * CMP of g0 != g0, all our currently executing channels will get turned
1767     * off.
1768     */
1769    fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
1770                                    BRW_REGISTER_TYPE_UW));
1771    fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
1772                            BRW_CONDITIONAL_NZ));
1773    cmp->predicate = BRW_PREDICATE_NORMAL;
1774    cmp->flag_subreg = 1;
1775
1776    if (brw->gen >= 6) {
1777       /* For performance, after a discard, jump to the end of the shader.
1778        * However, many people will do foliage by discarding based on a
1779        * texture's alpha mask, and then continue on to texture with the
1780        * remaining pixels.  To avoid trashing the derivatives for those
1781        * texture samples, we'll only jump if all of the pixels in the subspan
1782        * have been discarded.
1783        */
1784       fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1785       discard_jump->flag_subreg = 1;
1786       discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
1787       discard_jump->predicate_inverse = true;
1788    }
1789 }
1790
1791 void
1792 fs_visitor::visit(ir_constant *ir)
1793 {
1794    /* Set this->result to reg at the bottom of the function because some code
1795     * paths will cause this visitor to be applied to other fields.  This will
1796     * cause the value stored in this->result to be modified.
1797     *
1798     * Make reg constant so that it doesn't get accidentally modified along the
1799     * way.  Yes, I actually had this problem. :(
1800     */
1801    const fs_reg reg(this, ir->type);
1802    fs_reg dst_reg = reg;
1803
1804    if (ir->type->is_array()) {
1805       const unsigned size = type_size(ir->type->fields.array);
1806
1807       for (unsigned i = 0; i < ir->type->length; i++) {
1808          ir->array_elements[i]->accept(this);
1809          fs_reg src_reg = this->result;
1810
1811          dst_reg.type = src_reg.type;
1812          for (unsigned j = 0; j < size; j++) {
1813             emit(MOV(dst_reg, src_reg));
1814             src_reg.reg_offset++;
1815             dst_reg.reg_offset++;
1816          }
1817       }
1818    } else if (ir->type->is_record()) {
1819       foreach_list(node, &ir->components) {
1820          ir_constant *const field = (ir_constant *) node;
1821          const unsigned size = type_size(field->type);
1822
1823          field->accept(this);
1824          fs_reg src_reg = this->result;
1825
1826          dst_reg.type = src_reg.type;
1827          for (unsigned j = 0; j < size; j++) {
1828             emit(MOV(dst_reg, src_reg));
1829             src_reg.reg_offset++;
1830             dst_reg.reg_offset++;
1831          }
1832       }
1833    } else {
1834       const unsigned size = type_size(ir->type);
1835
1836       for (unsigned i = 0; i < size; i++) {
1837          switch (ir->type->base_type) {
1838          case GLSL_TYPE_FLOAT:
1839             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
1840             break;
1841          case GLSL_TYPE_UINT:
1842             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
1843             break;
1844          case GLSL_TYPE_INT:
1845             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
1846             break;
1847          case GLSL_TYPE_BOOL:
1848             emit(MOV(dst_reg, fs_reg((int)ir->value.b[i])));
1849             break;
1850          default:
1851             assert(!"Non-float/uint/int/bool constant");
1852          }
1853          dst_reg.reg_offset++;
1854       }
1855    }
1856
1857    this->result = reg;
1858 }
1859
1860 void
1861 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1862 {
1863    ir_expression *expr = ir->as_expression();
1864
1865    if (expr &&
1866        expr->operation != ir_binop_logic_and &&
1867        expr->operation != ir_binop_logic_or &&
1868        expr->operation != ir_binop_logic_xor) {
1869       fs_reg op[2];
1870       fs_inst *inst;
1871
1872       assert(expr->get_num_operands() <= 2);
1873       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1874          assert(expr->operands[i]->type->is_scalar());
1875
1876          expr->operands[i]->accept(this);
1877          op[i] = this->result;
1878
1879          resolve_ud_negate(&op[i]);
1880       }
1881
1882       switch (expr->operation) {
1883       case ir_unop_logic_not:
1884          inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
1885          inst->conditional_mod = BRW_CONDITIONAL_Z;
1886          break;
1887
1888       case ir_unop_f2b:
1889          if (brw->gen >= 6) {
1890             emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
1891          } else {
1892             inst = emit(MOV(reg_null_f, op[0]));
1893             inst->conditional_mod = BRW_CONDITIONAL_NZ;
1894          }
1895          break;
1896
1897       case ir_unop_i2b:
1898          if (brw->gen >= 6) {
1899             emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1900          } else {
1901             inst = emit(MOV(reg_null_d, op[0]));
1902             inst->conditional_mod = BRW_CONDITIONAL_NZ;
1903          }
1904          break;
1905
1906       case ir_binop_greater:
1907       case ir_binop_gequal:
1908       case ir_binop_less:
1909       case ir_binop_lequal:
1910       case ir_binop_equal:
1911       case ir_binop_all_equal:
1912       case ir_binop_nequal:
1913       case ir_binop_any_nequal:
1914          resolve_bool_comparison(expr->operands[0], &op[0]);
1915          resolve_bool_comparison(expr->operands[1], &op[1]);
1916
1917          emit(CMP(reg_null_d, op[0], op[1],
1918                   brw_conditional_for_comparison(expr->operation)));
1919          break;
1920
1921       default:
1922          assert(!"not reached");
1923          fail("bad cond code\n");
1924          break;
1925       }
1926       return;
1927    }
1928
1929    ir->accept(this);
1930
1931    fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
1932    inst->conditional_mod = BRW_CONDITIONAL_NZ;
1933 }
1934
1935 /**
1936  * Emit a gen6 IF statement with the comparison folded into the IF
1937  * instruction.
1938  */
1939 void
1940 fs_visitor::emit_if_gen6(ir_if *ir)
1941 {
1942    ir_expression *expr = ir->condition->as_expression();
1943
1944    if (expr) {
1945       fs_reg op[2];
1946       fs_inst *inst;
1947       fs_reg temp;
1948
1949       assert(expr->get_num_operands() <= 2);
1950       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1951          assert(expr->operands[i]->type->is_scalar());
1952
1953          expr->operands[i]->accept(this);
1954          op[i] = this->result;
1955       }
1956
1957       switch (expr->operation) {
1958       case ir_unop_logic_not:
1959       case ir_binop_logic_xor:
1960       case ir_binop_logic_or:
1961       case ir_binop_logic_and:
1962          /* For operations on bool arguments, only the low bit of the bool is
1963           * valid, and the others are undefined.  Fall back to the condition
1964           * code path.
1965           */
1966          break;
1967
1968       case ir_unop_f2b:
1969          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1970          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1971          return;
1972
1973       case ir_unop_i2b:
1974          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1975          return;
1976
1977       case ir_binop_greater:
1978       case ir_binop_gequal:
1979       case ir_binop_less:
1980       case ir_binop_lequal:
1981       case ir_binop_equal:
1982       case ir_binop_all_equal:
1983       case ir_binop_nequal:
1984       case ir_binop_any_nequal:
1985          resolve_bool_comparison(expr->operands[0], &op[0]);
1986          resolve_bool_comparison(expr->operands[1], &op[1]);
1987
1988          emit(IF(op[0], op[1],
1989                  brw_conditional_for_comparison(expr->operation)));
1990          return;
1991       default:
1992          assert(!"not reached");
1993          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1994          fail("bad condition\n");
1995          return;
1996       }
1997    }
1998
1999    emit_bool_to_cond_code(ir->condition);
2000    fs_inst *inst = emit(BRW_OPCODE_IF);
2001    inst->predicate = BRW_PREDICATE_NORMAL;
2002 }
2003
2004 /**
2005  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2006  *
2007  * Many GLSL shaders contain the following pattern:
2008  *
2009  *    x = condition ? foo : bar
2010  *
2011  * The compiler emits an ir_if tree for this, since each subexpression might be
2012  * a complex tree that could have side-effects or short-circuit logic.
2013  *
2014  * However, the common case is to simply select one of two constants or
2015  * variable values---which is exactly what SEL is for.  In this case, the
2016  * assembly looks like:
2017  *
2018  *    (+f0) IF
2019  *    MOV dst src0
2020  *    ELSE
2021  *    MOV dst src1
2022  *    ENDIF
2023  *
2024  * which can be easily translated into:
2025  *
2026  *    (+f0) SEL dst src0 src1
2027  *
2028  * If src0 is an immediate value, we promote it to a temporary GRF.
2029  */
2030 void
2031 fs_visitor::try_replace_with_sel()
2032 {
2033    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2034    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2035
2036    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2037    int opcodes[] = {
2038       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
2039    };
2040
2041    fs_inst *match = (fs_inst *) endif_inst->prev;
2042    for (int i = 0; i < 4; i++) {
2043       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
2044          return;
2045       match = (fs_inst *) match->prev;
2046    }
2047
2048    /* The opcodes match; it looks like the right sequence of instructions. */
2049    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
2050    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
2051    fs_inst *if_inst = (fs_inst *) then_mov->prev;
2052
2053    /* Check that the MOVs are the right form. */
2054    if (then_mov->dst.equals(else_mov->dst) &&
2055        !then_mov->is_partial_write() &&
2056        !else_mov->is_partial_write()) {
2057
2058       /* Remove the matched instructions; we'll emit a SEL to replace them. */
2059       while (!if_inst->next->is_tail_sentinel())
2060          if_inst->next->remove();
2061       if_inst->remove();
2062
2063       /* Only the last source register can be a constant, so if the MOV in
2064        * the "then" clause uses a constant, we need to put it in a temporary.
2065        */
2066       fs_reg src0(then_mov->src[0]);
2067       if (src0.file == IMM) {
2068          src0 = fs_reg(this, glsl_type::float_type);
2069          src0.type = then_mov->src[0].type;
2070          emit(MOV(src0, then_mov->src[0]));
2071       }
2072
2073       fs_inst *sel;
2074       if (if_inst->conditional_mod) {
2075          /* Sandybridge-specific IF with embedded comparison */
2076          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
2077                   if_inst->conditional_mod));
2078          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2079          sel->predicate = BRW_PREDICATE_NORMAL;
2080       } else {
2081          /* Separate CMP and IF instructions */
2082          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2083          sel->predicate = if_inst->predicate;
2084          sel->predicate_inverse = if_inst->predicate_inverse;
2085       }
2086    }
2087 }
2088
2089 void
2090 fs_visitor::visit(ir_if *ir)
2091 {
2092    if (brw->gen < 6 && dispatch_width == 16) {
2093       fail("Can't support (non-uniform) control flow on 16-wide\n");
2094    }
2095
2096    /* Don't point the annotation at the if statement, because then it plus
2097     * the then and else blocks get printed.
2098     */
2099    this->base_ir = ir->condition;
2100
2101    if (brw->gen == 6) {
2102       emit_if_gen6(ir);
2103    } else {
2104       emit_bool_to_cond_code(ir->condition);
2105
2106       emit(IF(BRW_PREDICATE_NORMAL));
2107    }
2108
2109    foreach_list(node, &ir->then_instructions) {
2110       ir_instruction *ir = (ir_instruction *)node;
2111       this->base_ir = ir;
2112
2113       ir->accept(this);
2114    }
2115
2116    if (!ir->else_instructions.is_empty()) {
2117       emit(BRW_OPCODE_ELSE);
2118
2119       foreach_list(node, &ir->else_instructions) {
2120          ir_instruction *ir = (ir_instruction *)node;
2121          this->base_ir = ir;
2122
2123          ir->accept(this);
2124       }
2125    }
2126
2127    emit(BRW_OPCODE_ENDIF);
2128
2129    try_replace_with_sel();
2130 }
2131
2132 void
2133 fs_visitor::visit(ir_loop *ir)
2134 {
2135    fs_reg counter = reg_undef;
2136
2137    if (brw->gen < 6 && dispatch_width == 16) {
2138       fail("Can't support (non-uniform) control flow on 16-wide\n");
2139    }
2140
2141    if (ir->counter) {
2142       this->base_ir = ir->counter;
2143       ir->counter->accept(this);
2144       counter = *(variable_storage(ir->counter));
2145
2146       if (ir->from) {
2147          this->base_ir = ir->from;
2148          ir->from->accept(this);
2149
2150          emit(MOV(counter, this->result));
2151       }
2152    }
2153
2154    this->base_ir = NULL;
2155    emit(BRW_OPCODE_DO);
2156
2157    if (ir->to) {
2158       this->base_ir = ir->to;
2159       ir->to->accept(this);
2160
2161       emit(CMP(reg_null_d, counter, this->result,
2162                brw_conditional_for_comparison(ir->cmp)));
2163
2164       fs_inst *inst = emit(BRW_OPCODE_BREAK);
2165       inst->predicate = BRW_PREDICATE_NORMAL;
2166    }
2167
2168    foreach_list(node, &ir->body_instructions) {
2169       ir_instruction *ir = (ir_instruction *)node;
2170
2171       this->base_ir = ir;
2172       ir->accept(this);
2173    }
2174
2175    if (ir->increment) {
2176       this->base_ir = ir->increment;
2177       ir->increment->accept(this);
2178       emit(ADD(counter, counter, this->result));
2179    }
2180
2181    this->base_ir = NULL;
2182    emit(BRW_OPCODE_WHILE);
2183 }
2184
2185 void
2186 fs_visitor::visit(ir_loop_jump *ir)
2187 {
2188    switch (ir->mode) {
2189    case ir_loop_jump::jump_break:
2190       emit(BRW_OPCODE_BREAK);
2191       break;
2192    case ir_loop_jump::jump_continue:
2193       emit(BRW_OPCODE_CONTINUE);
2194       break;
2195    }
2196 }
2197
2198 void
2199 fs_visitor::visit(ir_call *ir)
2200 {
2201    assert(!"FINISHME");
2202 }
2203
2204 void
2205 fs_visitor::visit(ir_return *ir)
2206 {
2207    assert(!"FINISHME");
2208 }
2209
2210 void
2211 fs_visitor::visit(ir_function *ir)
2212 {
2213    /* Ignore function bodies other than main() -- we shouldn't see calls to
2214     * them since they should all be inlined before we get to ir_to_mesa.
2215     */
2216    if (strcmp(ir->name, "main") == 0) {
2217       const ir_function_signature *sig;
2218       exec_list empty;
2219
2220       sig = ir->matching_signature(NULL, &empty);
2221
2222       assert(sig);
2223
2224       foreach_list(node, &sig->body) {
2225          ir_instruction *ir = (ir_instruction *)node;
2226          this->base_ir = ir;
2227
2228          ir->accept(this);
2229       }
2230    }
2231 }
2232
2233 void
2234 fs_visitor::visit(ir_function_signature *ir)
2235 {
2236    assert(!"not reached");
2237    (void)ir;
2238 }
2239
2240 void
2241 fs_visitor::visit(ir_emit_vertex *)
2242 {
2243    assert(!"not reached");
2244 }
2245
2246 void
2247 fs_visitor::visit(ir_end_primitive *)
2248 {
2249    assert(!"not reached");
2250 }
2251
2252 fs_inst *
2253 fs_visitor::emit(fs_inst inst)
2254 {
2255    fs_inst *list_inst = new(mem_ctx) fs_inst;
2256    *list_inst = inst;
2257    emit(list_inst);
2258    return list_inst;
2259 }
2260
2261 fs_inst *
2262 fs_visitor::emit(fs_inst *inst)
2263 {
2264    if (force_uncompressed_stack > 0)
2265       inst->force_uncompressed = true;
2266    else if (force_sechalf_stack > 0)
2267       inst->force_sechalf = true;
2268
2269    inst->annotation = this->current_annotation;
2270    inst->ir = this->base_ir;
2271
2272    this->instructions.push_tail(inst);
2273
2274    return inst;
2275 }
2276
2277 void
2278 fs_visitor::emit(exec_list list)
2279 {
2280    foreach_list_safe(node, &list) {
2281       fs_inst *inst = (fs_inst *)node;
2282       inst->remove();
2283       emit(inst);
2284    }
2285 }
2286
2287 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
2288 void
2289 fs_visitor::emit_dummy_fs()
2290 {
2291    int reg_width = dispatch_width / 8;
2292
2293    /* Everyone's favorite color. */
2294    emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)));
2295    emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)));
2296    emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)));
2297    emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)));
2298
2299    fs_inst *write;
2300    write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
2301    write->base_mrf = 2;
2302    write->mlen = 4 * reg_width;
2303    write->eot = true;
2304 }
2305
2306 /* The register location here is relative to the start of the URB
2307  * data.  It will get adjusted to be a real location before
2308  * generate_code() time.
2309  */
2310 struct brw_reg
2311 fs_visitor::interp_reg(int location, int channel)
2312 {
2313    int regnr = c->prog_data.urb_setup[location] * 2 + channel / 2;
2314    int stride = (channel & 1) * 4;
2315
2316    assert(c->prog_data.urb_setup[location] != -1);
2317
2318    return brw_vec1_grf(regnr, stride);
2319 }
2320
2321 /** Emits the interpolation for the varying inputs. */
2322 void
2323 fs_visitor::emit_interpolation_setup_gen4()
2324 {
2325    this->current_annotation = "compute pixel centers";
2326    this->pixel_x = fs_reg(this, glsl_type::uint_type);
2327    this->pixel_y = fs_reg(this, glsl_type::uint_type);
2328    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2329    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2330
2331    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2332    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2333
2334    this->current_annotation = "compute pixel deltas from v0";
2335    if (brw->has_pln) {
2336       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2337          fs_reg(this, glsl_type::vec2_type);
2338       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2339          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
2340       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
2341    } else {
2342       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2343          fs_reg(this, glsl_type::float_type);
2344       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2345          fs_reg(this, glsl_type::float_type);
2346    }
2347    emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2348             this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
2349    emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2350             this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
2351
2352    this->current_annotation = "compute pos.w and 1/pos.w";
2353    /* Compute wpos.w.  It's always in our setup, since it's needed to
2354     * interpolate the other attributes.
2355     */
2356    this->wpos_w = fs_reg(this, glsl_type::float_type);
2357    emit(FS_OPCODE_LINTERP, wpos_w,
2358         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2359         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2360         interp_reg(VARYING_SLOT_POS, 3));
2361    /* Compute the pixel 1/W value from wpos.w. */
2362    this->pixel_w = fs_reg(this, glsl_type::float_type);
2363    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
2364    this->current_annotation = NULL;
2365 }
2366
2367 /** Emits the interpolation for the varying inputs. */
2368 void
2369 fs_visitor::emit_interpolation_setup_gen6()
2370 {
2371    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2372
2373    /* If the pixel centers end up used, the setup is the same as for gen4. */
2374    this->current_annotation = "compute pixel centers";
2375    fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2376    fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2377    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2378    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2379    emit(ADD(int_pixel_x,
2380             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2381             fs_reg(brw_imm_v(0x10101010))));
2382    emit(ADD(int_pixel_y,
2383             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2384             fs_reg(brw_imm_v(0x11001100))));
2385
2386    /* As of gen6, we can no longer mix float and int sources.  We have
2387     * to turn the integer pixel centers into floats for their actual
2388     * use.
2389     */
2390    this->pixel_x = fs_reg(this, glsl_type::float_type);
2391    this->pixel_y = fs_reg(this, glsl_type::float_type);
2392    emit(MOV(this->pixel_x, int_pixel_x));
2393    emit(MOV(this->pixel_y, int_pixel_y));
2394
2395    this->current_annotation = "compute pos.w";
2396    this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2397    this->wpos_w = fs_reg(this, glsl_type::float_type);
2398    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
2399
2400    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2401       uint8_t reg = c->barycentric_coord_reg[i];
2402       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
2403       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
2404    }
2405
2406    this->current_annotation = NULL;
2407 }
2408
2409 void
2410 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
2411 {
2412    int reg_width = dispatch_width / 8;
2413    fs_inst *inst;
2414    fs_reg color = outputs[target];
2415    fs_reg mrf;
2416
2417    /* If there's no color data to be written, skip it. */
2418    if (color.file == BAD_FILE)
2419       return;
2420
2421    color.reg_offset += index;
2422
2423    if (dispatch_width == 8 || brw->gen >= 6) {
2424       /* SIMD8 write looks like:
2425        * m + 0: r0
2426        * m + 1: r1
2427        * m + 2: g0
2428        * m + 3: g1
2429        *
2430        * gen6 SIMD16 DP write looks like:
2431        * m + 0: r0
2432        * m + 1: r1
2433        * m + 2: g0
2434        * m + 3: g1
2435        * m + 4: b0
2436        * m + 5: b1
2437        * m + 6: a0
2438        * m + 7: a1
2439        */
2440       inst = emit(MOV(fs_reg(MRF, first_color_mrf + index * reg_width,
2441                              color.type),
2442                       color));
2443       inst->saturate = c->key.clamp_fragment_color;
2444    } else {
2445       /* pre-gen6 SIMD16 single source DP write looks like:
2446        * m + 0: r0
2447        * m + 1: g0
2448        * m + 2: b0
2449        * m + 3: a0
2450        * m + 4: r1
2451        * m + 5: g1
2452        * m + 6: b1
2453        * m + 7: a1
2454        */
2455       if (brw->has_compr4) {
2456          /* By setting the high bit of the MRF register number, we
2457           * indicate that we want COMPR4 mode - instead of doing the
2458           * usual destination + 1 for the second half we get
2459           * destination + 4.
2460           */
2461          inst = emit(MOV(fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
2462                                 color.type),
2463                          color));
2464          inst->saturate = c->key.clamp_fragment_color;
2465       } else {
2466          push_force_uncompressed();
2467          inst = emit(MOV(fs_reg(MRF, first_color_mrf + index, color.type),
2468                          color));
2469          inst->saturate = c->key.clamp_fragment_color;
2470          pop_force_uncompressed();
2471
2472          push_force_sechalf();
2473          color.sechalf = true;
2474          inst = emit(MOV(fs_reg(MRF, first_color_mrf + index + 4, color.type),
2475                          color));
2476          inst->saturate = c->key.clamp_fragment_color;
2477          pop_force_sechalf();
2478          color.sechalf = false;
2479       }
2480    }
2481 }
2482
2483 void
2484 fs_visitor::emit_fb_writes()
2485 {
2486    this->current_annotation = "FB write header";
2487    bool header_present = true;
2488    /* We can potentially have a message length of up to 15, so we have to set
2489     * base_mrf to either 0 or 1 in order to fit in m0..m15.
2490     */
2491    int base_mrf = 1;
2492    int nr = base_mrf;
2493    int reg_width = dispatch_width / 8;
2494    bool do_dual_src = this->dual_src_output.file != BAD_FILE;
2495    bool src0_alpha_to_render_target = false;
2496
2497    if (dispatch_width == 16 && do_dual_src) {
2498       fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
2499       do_dual_src = false;
2500    }
2501
2502    /* From the Sandy Bridge PRM, volume 4, page 198:
2503     *
2504     *     "Dispatched Pixel Enables. One bit per pixel indicating
2505     *      which pixels were originally enabled when the thread was
2506     *      dispatched. This field is only required for the end-of-
2507     *      thread message and on all dual-source messages."
2508     */
2509    if (brw->gen >= 6 &&
2510        !this->fp->UsesKill &&
2511        !do_dual_src &&
2512        c->key.nr_color_regions == 1) {
2513       header_present = false;
2514    }
2515
2516    if (header_present) {
2517       src0_alpha_to_render_target = brw->gen >= 6 &&
2518                                     !do_dual_src &&
2519                                     c->key.replicate_alpha;
2520       /* m2, m3 header */
2521       nr += 2;
2522    }
2523
2524    if (c->aa_dest_stencil_reg) {
2525       push_force_uncompressed();
2526       emit(MOV(fs_reg(MRF, nr++),
2527                fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2528       pop_force_uncompressed();
2529    }
2530
2531    c->prog_data.uses_omask =
2532       fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
2533    if(c->prog_data.uses_omask) {
2534       this->current_annotation = "FB write oMask";
2535       assert(this->sample_mask.file != BAD_FILE);
2536       /* Hand over gl_SampleMask. Only lower 16 bits are relevant. */
2537       emit(FS_OPCODE_SET_OMASK, fs_reg(MRF, nr, BRW_REGISTER_TYPE_UW), this->sample_mask);
2538       nr += 1;
2539    }
2540
2541    /* Reserve space for color. It'll be filled in per MRT below. */
2542    int color_mrf = nr;
2543    nr += 4 * reg_width;
2544    if (do_dual_src)
2545       nr += 4;
2546    if (src0_alpha_to_render_target)
2547       nr += reg_width;
2548
2549    if (c->source_depth_to_render_target) {
2550       if (brw->gen == 6 && dispatch_width == 16) {
2551          /* For outputting oDepth on gen6, SIMD8 writes have to be
2552           * used.  This would require 8-wide moves of each half to
2553           * message regs, kind of like pre-gen5 SIMD16 FB writes.
2554           * Just bail on doing so for now.
2555           */
2556          fail("Missing support for simd16 depth writes on gen6\n");
2557       }
2558
2559       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2560          /* Hand over gl_FragDepth. */
2561          assert(this->frag_depth.file != BAD_FILE);
2562          emit(MOV(fs_reg(MRF, nr), this->frag_depth));
2563       } else {
2564          /* Pass through the payload depth. */
2565          emit(MOV(fs_reg(MRF, nr),
2566                   fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2567       }
2568       nr += reg_width;
2569    }
2570
2571    if (c->dest_depth_reg) {
2572       emit(MOV(fs_reg(MRF, nr),
2573                fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2574       nr += reg_width;
2575    }
2576
2577    if (do_dual_src) {
2578       fs_reg src0 = this->outputs[0];
2579       fs_reg src1 = this->dual_src_output;
2580
2581       this->current_annotation = ralloc_asprintf(this->mem_ctx,
2582                                                  "FB write src0");
2583       for (int i = 0; i < 4; i++) {
2584          fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + i, src0.type), src0));
2585          src0.reg_offset++;
2586          inst->saturate = c->key.clamp_fragment_color;
2587       }
2588
2589       this->current_annotation = ralloc_asprintf(this->mem_ctx,
2590                                                  "FB write src1");
2591       for (int i = 0; i < 4; i++) {
2592          fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + 4 + i, src1.type),
2593                                   src1));
2594          src1.reg_offset++;
2595          inst->saturate = c->key.clamp_fragment_color;
2596       }
2597
2598       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2599          emit_shader_time_end();
2600
2601       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2602       inst->target = 0;
2603       inst->base_mrf = base_mrf;
2604       inst->mlen = nr - base_mrf;
2605       inst->eot = true;
2606       inst->header_present = header_present;
2607
2608       c->prog_data.dual_src_blend = true;
2609       this->current_annotation = NULL;
2610       return;
2611    }
2612
2613    for (int target = 0; target < c->key.nr_color_regions; target++) {
2614       this->current_annotation = ralloc_asprintf(this->mem_ctx,
2615                                                  "FB write target %d",
2616                                                  target);
2617       /* If src0_alpha_to_render_target is true, include source zero alpha
2618        * data in RenderTargetWrite message for targets > 0.
2619        */
2620       int write_color_mrf = color_mrf;
2621       if (src0_alpha_to_render_target && target != 0) {
2622          fs_inst *inst;
2623          fs_reg color = outputs[0];
2624          color.reg_offset += 3;
2625
2626          inst = emit(MOV(fs_reg(MRF, write_color_mrf, color.type),
2627                          color));
2628          inst->saturate = c->key.clamp_fragment_color;
2629          write_color_mrf = color_mrf + reg_width;
2630       }
2631
2632       for (unsigned i = 0; i < this->output_components[target]; i++)
2633          emit_color_write(target, i, write_color_mrf);
2634
2635       bool eot = false;
2636       if (target == c->key.nr_color_regions - 1) {
2637          eot = true;
2638
2639          if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2640             emit_shader_time_end();
2641       }
2642
2643       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2644       inst->target = target;
2645       inst->base_mrf = base_mrf;
2646       if (src0_alpha_to_render_target && target == 0)
2647          inst->mlen = nr - base_mrf - reg_width;
2648       else
2649          inst->mlen = nr - base_mrf;
2650       inst->eot = eot;
2651       inst->header_present = header_present;
2652    }
2653
2654    if (c->key.nr_color_regions == 0) {
2655       /* Even if there's no color buffers enabled, we still need to send
2656        * alpha out the pipeline to our null renderbuffer to support
2657        * alpha-testing, alpha-to-coverage, and so on.
2658        */
2659       emit_color_write(0, 3, color_mrf);
2660
2661       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2662          emit_shader_time_end();
2663
2664       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2665       inst->base_mrf = base_mrf;
2666       inst->mlen = nr - base_mrf;
2667       inst->eot = true;
2668       inst->header_present = header_present;
2669    }
2670
2671    this->current_annotation = NULL;
2672 }
2673
2674 void
2675 fs_visitor::resolve_ud_negate(fs_reg *reg)
2676 {
2677    if (reg->type != BRW_REGISTER_TYPE_UD ||
2678        !reg->negate)
2679       return;
2680
2681    fs_reg temp = fs_reg(this, glsl_type::uint_type);
2682    emit(MOV(temp, *reg));
2683    *reg = temp;
2684 }
2685
2686 void
2687 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2688 {
2689    if (rvalue->type != glsl_type::bool_type)
2690       return;
2691
2692    fs_reg temp = fs_reg(this, glsl_type::bool_type);
2693    emit(AND(temp, *reg, fs_reg(1)));
2694    *reg = temp;
2695 }
2696
2697 fs_visitor::fs_visitor(struct brw_context *brw,
2698                        struct brw_wm_compile *c,
2699                        struct gl_shader_program *shader_prog,
2700                        struct gl_fragment_program *fp,
2701                        unsigned dispatch_width)
2702    : dispatch_width(dispatch_width)
2703 {
2704    this->c = c;
2705    this->brw = brw;
2706    this->fp = fp;
2707    this->prog = &fp->Base;
2708    this->shader_prog = shader_prog;
2709    this->prog = &fp->Base;
2710    this->stage_prog_data = &c->prog_data.base;
2711    this->ctx = &brw->ctx;
2712    this->mem_ctx = ralloc_context(NULL);
2713    if (shader_prog)
2714       shader = (struct brw_shader *)
2715          shader_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2716    else
2717       shader = NULL;
2718    this->failed = false;
2719    this->variable_ht = hash_table_ctor(0,
2720                                        hash_table_pointer_hash,
2721                                        hash_table_pointer_compare);
2722
2723    memset(this->outputs, 0, sizeof(this->outputs));
2724    memset(this->output_components, 0, sizeof(this->output_components));
2725    this->first_non_payload_grf = 0;
2726    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2727
2728    this->current_annotation = NULL;
2729    this->base_ir = NULL;
2730
2731    this->virtual_grf_sizes = NULL;
2732    this->virtual_grf_count = 0;
2733    this->virtual_grf_array_size = 0;
2734    this->virtual_grf_start = NULL;
2735    this->virtual_grf_end = NULL;
2736    this->live_intervals = NULL;
2737
2738    this->params_remap = NULL;
2739    this->nr_params_remap = 0;
2740
2741    this->force_uncompressed_stack = 0;
2742    this->force_sechalf_stack = 0;
2743
2744    this->spilled_any_registers = false;
2745
2746    memset(&this->param_size, 0, sizeof(this->param_size));
2747 }
2748
2749 fs_visitor::~fs_visitor()
2750 {
2751    ralloc_free(this->mem_ctx);
2752    hash_table_dtor(this->variable_ht);
2753 }