/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "brw_vec4.h"
#include "glsl/ir_uniform.h"
extern "C" {
#include "main/context.h"
#include "main/macros.h"
#include "program/prog_parameter.h"
#include "program/sampler.h"
}

namespace brw {
vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, dst_reg dst,
                                   src_reg src0, src_reg src1, src_reg src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->ir = v->base_ir;
   this->annotation = v->current_annotation;
}
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   this->instructions.push_tail(inst);

   return inst;
}
vec4_instruction *
vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(new_inst);

   return inst;
}
vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst,
                   src_reg src0, src_reg src1, src_reg src2)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
                                             src0, src1, src2));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
}
#define ALU1(op)                                                          \
   vec4_instruction *                                                     \
   vec4_visitor::op(dst_reg dst, src_reg src0)                            \
   {                                                                      \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,    \
                                           src0);                         \
   }

#define ALU2(op)                                                          \
   vec4_instruction *                                                     \
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)              \
   {                                                                      \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,    \
                                           src0, src1);                   \
   }

#define ALU3(op)                                                          \
   vec4_instruction *                                                     \
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
   {                                                                      \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,    \
                                           src0, src1, src2);             \
   }

ALU1(NOT)    ALU1(MOV)    ALU1(FRC)     ALU1(RNDD)
ALU1(RNDE)   ALU1(RNDZ)   ALU1(F32TO16) ALU1(F16TO32)
ALU2(ADD)    ALU2(MUL)    ALU2(MACH)    ALU2(AND)
ALU2(OR)     ALU2(XOR)    ALU2(DP3)     ALU2(DP4)
ALU2(DPH)    ALU2(SHL)    ALU2(SHR)     ALU2(ASR)
ALU3(LRP)    ALU1(BFREV)  ALU3(BFE)     ALU2(BFI1)
ALU3(BFI2)   ALU1(FBH)    ALU1(FBL)     ALU1(CBIT)
/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(uint32_t predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6+ IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
{
   vec4_instruction *inst;

   /* original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
vec4_instruction *
vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = 14;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = 13;
   inst->mlen = 3;

   return inst;
}
void
vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
{
   static enum opcode dot_opcodes[] = {
      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
   };

   emit(dot_opcodes[elements - 2], dst, src0, src1);
}
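/* For example, emit_dp(dst, a, b, 3) maps elements == 3 to dot_opcodes[1]
 * and emits a DP3; elements == 2 and 4 select DP2 and DP4 the same way.
 */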
src_reg
vec4_visitor::fix_3src_operand(src_reg src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}
src_reg
vec4_visitor::fix_math_operand(src_reg src)
{
   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */
   if (intel->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}
void
vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
{
   src = fix_math_operand(src);

   if (dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);

      emit(opcode, temp_dst, src);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src);
   }
}

void
vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
{
   vec4_instruction *inst = emit(opcode, dst, src);
   inst->base_mrf = 1;
   inst->mlen = 1;
}
void
vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return;
   }

   if (intel->gen >= 6) {
      return emit_math1_gen6(opcode, dst, src);
   } else {
      return emit_math1_gen4(opcode, dst, src);
   }
}
void
vec4_visitor::emit_math2_gen6(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   src0 = fix_math_operand(src0);
   src1 = fix_math_operand(src1);

   if (dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
      temp_dst.type = dst.type;

      emit(opcode, temp_dst, src0, src1);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src0, src1);
   }
}

void
vec4_visitor::emit_math2_gen4(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(opcode, dst, src0, src1);
   inst->base_mrf = 1;
   inst->mlen = 2;
}

void
vec4_visitor::emit_math(enum opcode opcode,
                        dst_reg dst, src_reg src0, src_reg src1)
{
   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode");
      return;
   }

   if (intel->gen >= 6) {
      return emit_math2_gen6(opcode, dst, src0, src1);
   } else {
      return emit_math2_gen4(opcode, dst, src0, src1);
   }
}
void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (intel->gen < 7)
      assert(!"ir_unop_pack_half_2x16 should be lowered");

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs.)
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */
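   /* Worked example of the sequence below: packHalf2x16(vec2(1.0, -2.0)).
    * f32to16 leaves 0x00003C00 (half(1.0)) in the X channel and 0x0000C000
    * (half(-2.0)) in the Y channel; the SHL then forms 0xC0000000 from Y,
    * and the OR merges in X, giving the packed result 0xC0003C00.
    */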
   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, src_reg(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = SWIZZLE_Y;
   emit(SHL(dst, tmp_src, src_reg(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = SWIZZLE_X;
   emit(OR(dst, src_reg(dst), tmp_src));
}
void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (intel->gen < 7)
      assert(!"ir_unop_unpack_half_2x16 should be lowered");

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */
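   /* Worked example of the sequence below: unpackHalf2x16(0xC0003C00u).
    * The AND extracts the low word 0x3C00 (half(1.0)) into X, the SHR
    * extracts the high word 0xC000 (half(-2.0)) into Y, and f16to32
    * converts both, yielding vec2(1.0, -2.0).
    */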
   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, src_reg(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, src_reg(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}
void
vec4_visitor::visit_instructions(const exec_list *list)
{
   foreach_list(node, list) {
      ir_instruction *ir = (ir_instruction *)node;

      base_ir = ir;
      ir->accept(this);
   }
}
static int
type_size(const struct glsl_type *type)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      if (type->is_matrix()) {
         return type->matrix_columns;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up one slot in UNIFORMS[], but they're baked in
       * at link time.
       */
      return 1;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(0);
      break;
   }

   return 0;
}
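/* Illustrative sizes under the rules above: float, vec2, and vec4 each
 * take one vec4 slot; mat3 takes three (one per column); float[8] takes
 * eight; and a struct { vec3 a; float b[2]; } takes 1 + 2 = 3.
 */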
int
vec4_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
      virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
                                     virtual_grf_array_size);
   }
   virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
   virtual_grf_reg_count += size;
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
vec4_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      gl_constant_value *components = storage->storage;
      unsigned vector_count = (MAX2(storage->array_elements, 1) *
                               storage->type->matrix_columns);

      for (unsigned s = 0; s < vector_count; s++) {
         uniform_vector_size[uniforms] = storage->type->vector_elements;

         int i;
         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
            prog_data->param[uniforms * 4 + i] = &components->f;
            components++;
         }
         for (; i < 4; i++) {
            static float zero = 0;
            prog_data->param[uniforms * 4 + i] = &zero;
         }

         uniforms++;
      }
   }
}
void
vec4_visitor::setup_uniform_clipplane_values()
{
   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);

   if (intel->gen < 6) {
      /* Pre-Gen6, we compact clip planes.  For example, if the user
       * enables just clip planes 0, 1, and 3, we will enable clip planes
       * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
       * plane 2.  This simplifies the implementation of the Gen6 clip
       * distance workaround.
       */
      int compacted_clipplane_index = 0;
      for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
         if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
            continue;

         this->uniform_vector_size[this->uniforms] = 4;
         this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
         this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
         for (int j = 0; j < 4; ++j) {
            prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
         }
         ++compacted_clipplane_index;
         ++this->uniforms;
      }
   } else {
      /* In Gen6 and later, we don't compact clip planes, because this
       * simplifies the implementation of gl_ClipDistance.
       */
      for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
         this->uniform_vector_size[this->uniforms] = 4;
         this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
         this->userplane[i].type = BRW_REGISTER_TYPE_F;
         for (int j = 0; j < 4; ++j) {
            prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
         }
         ++this->uniforms;
      }
   }
}
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa,
       * but we'll get the same index back here.  We can reference
       * ParameterValues directly, since unlike brw_fs.cpp, we never
       * add new state references during compile.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);
      float *values = &this->prog->Parameters->ParameterValues[index][0].f;

      this->uniform_vector_size[this->uniforms] = 0;
      /* Add each of the unique swizzled channels of the element.
       * This will end up matching the size of the glsl_type of this field.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         prog_data->param[this->uniforms * 4 + j] = &values[swiz];
         if (swiz <= last_swiz)
            this->uniform_vector_size[this->uniforms]++;
      }
      this->uniforms++;
   }
}
dst_reg *
vec4_visitor::variable_storage(ir_variable *var)
{
   return (dst_reg *)hash_table_find(this->variable_ht, var);
}
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_f(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_d(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_all_equal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
         break;

      case ir_binop_any_nequal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_unop_any:
         inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(CMP(dst_null_d(), op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         assert(!"not reached");
         break;
      }
      return;
   }

   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (intel->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
                                        this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
         return;

      case ir_binop_logic_xor:
         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_or:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(OR(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_and:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(AND(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_f2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_i2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(IF(op[0], op[1],
                 brw_conditional_for_comparison(expr->operation)));
         return;

      case ir_binop_all_equal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
         return;

      case ir_binop_any_nequal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      case ir_unop_any:
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      default:
         assert(!"not reached");
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;
      }
   }

   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}
static dst_reg
with_writemask(dst_reg const & r, int mask)
{
   dst_reg result = r;
   result.writemask = mask;
   return result;
}
void
vec4_vs_visitor::emit_prolog()
{
   dst_reg sign_recovery_shift;
   dst_reg normalize_factor;
   dst_reg es3_normalize_factor;

   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
         dst_reg reg(ATTR, i);
         dst_reg reg_d = reg;
         reg_d.type = BRW_REGISTER_TYPE_D;
         dst_reg reg_ud = reg;
         reg_ud.type = BRW_REGISTER_TYPE_UD;

         /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
          * come in as floating point conversions of the integer values.
          */
         if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
            dst_reg dst = reg;
            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
            dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
            emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
         }

         /* Do sign recovery for 2101010 formats if required. */
         if (wa_flags & BRW_ATTRIB_WA_SIGN) {
            if (sign_recovery_shift.file == BAD_FILE) {
               /* shift constant: <22,22,22,30> */
               sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
               emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
               emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
            }

            emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
            emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
         }

         /* Apply BGRA swizzle if required. */
         if (wa_flags & BRW_ATTRIB_WA_BGRA) {
            src_reg temp = src_reg(reg);
            temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
            emit(MOV(reg, temp));
         }

         if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
            /* ES 3.0 has different rules for converting signed normalized
             * fixed-point numbers than desktop GL.
             */
            if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
               /* According to equation 2.2 of the ES 3.0 specification,
                * signed normalization conversion is done by:
                *
                * f = c / (2^(b-1)-1)
                */
               if (es3_normalize_factor.file == BAD_FILE) {
                  /* mul constant: 1 / (2^(b-1) - 1) */
                  es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
                  emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
                           src_reg(1.0f / ((1<<9) - 1))));
                  emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
                           src_reg(1.0f / ((1<<1) - 1))));
               }

               dst_reg dst = reg;
               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
               emit(MOV(dst, src_reg(reg_d)));
               emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
               emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
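               /* Worked example for a signed 10-bit component (b = 10, so
                * the divisor is 2^9 - 1 = 511): c = 511 maps to 511/511 =
                * 1.0, while c = -512 maps to -512/511 ~= -1.002, which the
                * emit_minmax above clamps to -1.0.
                */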
            } else {
               /* The following equations are from the OpenGL 3.2 specification:
                *
                * 2.1 unsigned normalization
                * f = c/(2^n-1)
                *
                * 2.2 signed normalization
                * f = (2c+1)/(2^n-1)
                *
                * Both of these share a common divisor, which is represented by
                * "normalize_factor" in the code below.
                */
               if (normalize_factor.file == BAD_FILE) {
                  /* 1 / (2^b - 1) for b=<10,10,10,2> */
                  normalize_factor = dst_reg(this, glsl_type::vec4_type);
                  emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
                           src_reg(1.0f / ((1<<10) - 1))));
                  emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
                           src_reg(1.0f / ((1<<2) - 1))));
               }

               dst_reg dst = reg;
               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
               emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));

               /* For signed normalization, we want the numerator to be 2c+1. */
               if (wa_flags & BRW_ATTRIB_WA_SIGN) {
                  emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
                  emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
               }

               emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
            }
         }
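         /* Worked example for the signed 10-bit case (n = 10, divisor
          * 2^10 - 1 = 1023): c = 511 gives (2*511 + 1)/1023 = 1.0 and
          * c = -512 gives (2*(-512) + 1)/1023 = -1.0 exactly.  For the
          * unsigned 2-bit W component (n = 2), c = 3 gives 3/3 = 1.0.
          */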
         if (wa_flags & BRW_ATTRIB_WA_SCALE) {
            dst_reg dst = reg;
            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
            emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
         }
      }
   }
}
dst_reg *
vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
{
   /* VertexID is stored by the VF as the last vertex element, but
    * we don't represent it with a flag in inputs_read, so we call
    * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
    */
   dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
   vs_prog_data->uses_vertexid = true;

   switch (ir->location) {
   case SYSTEM_VALUE_VERTEX_ID:
      reg->writemask = WRITEMASK_X;
      break;
   case SYSTEM_VALUE_INSTANCE_ID:
      reg->writemask = WRITEMASK_Y;
      break;
   default:
      assert(!"not reached");
      break;
   }

   return reg;
}
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   switch (ir->mode) {
   case ir_var_shader_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
      break;

   case ir_var_shader_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      for (int i = 0; i < type_size(ir->type); i++) {
         output_reg[ir->location + i] = *reg;
         output_reg[ir->location + i].reg_offset = i;
         output_reg[ir->location + i].type =
            brw_type_for_base_type(ir->type->get_scalar_type());
         output_reg_annotation[ir->location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       */
      if (ir->is_in_uniform_block())
         return;

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      this->uniform_size[this->uniforms] = type_size(ir->type);

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir);
      }
      break;

   case ir_var_system_value:
      reg = make_reg_for_system_value(ir);
      break;

   default:
      assert(!"not reached");
   }

   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}
void
vec4_visitor::visit(ir_loop *ir)
{
   dst_reg counter;

   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   if (ir->counter != NULL) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from != NULL) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(MOV(counter, this->result));
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      emit(CMP(dst_null_d(), src_reg(counter), this->result,
               brw_conditional_for_comparison(ir->cmp)));

      vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }

   visit_instructions(&ir->body_instructions);

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(ADD(counter, src_reg(counter), this->result));
   }

   emit(BRW_OPCODE_WHILE);
}
void
vec4_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}
void
vec4_visitor::visit(ir_function_signature *ir)
{
   assert(0);
   (void)ir;
}

void
vec4_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      visit_instructions(&sig->body);
   }
}
bool
vec4_visitor::try_emit_sat(ir_expression *ir)
{
   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
   if (!sat_src)
      return false;

   sat_src->accept(this);
   src_reg src = this->result;

   this->result = src_reg(this, ir->type);
   vec4_instruction *inst;
   inst = emit(MOV(dst_reg(this->result), src));
   inst->saturate = true;

   return true;
}
bool
vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
{
   /* 3-src instructions were introduced in gen6. */
   if (intel->gen < 6)
      return false;

   /* MAD can only handle floating-point data. */
   if (ir->type->base_type != GLSL_TYPE_FLOAT)
      return false;

   ir_rvalue *nonmul = ir->operands[1 - mul_arg];
   ir_expression *mul = ir->operands[mul_arg]->as_expression();

   if (!mul || mul->operation != ir_binop_mul)
      return false;

   nonmul->accept(this);
   src_reg src0 = fix_3src_operand(this->result);

   mul->operands[0]->accept(this);
   src_reg src1 = fix_3src_operand(this->result);

   mul->operands[1]->accept(this);
   src_reg src2 = fix_3src_operand(this->result);

   this->result = src_reg(this, ir->type);
   emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);

   return true;
}
void
vec4_visitor::emit_bool_comparison(unsigned int op,
                                   dst_reg dst, src_reg src0, src_reg src1)
{
   /* original gen4 does destination conversion before comparison. */
   if (intel->gen < 5)
      dst.type = src0.type;

   emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));

   dst.type = BRW_REGISTER_TYPE_D;
   emit(AND(dst, src_reg(dst), src_reg(0x1)));
}
void
vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst;

   if (intel->gen >= 6) {
      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->conditional_mod = conditionalmod;
   } else {
      emit(CMP(dst, src0, src1, conditionalmod));

      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }
}
static bool
is_16bit_constant(ir_rvalue *rvalue)
{
   ir_constant *constant = rvalue->as_constant();
   if (!constant)
      return false;

   if (constant->type != glsl_type::int_type &&
       constant->type != glsl_type::uint_type)
      return false;

   return constant->value.u[0] < (1 << 16);
}
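/* This predicate feeds the integer-multiply lowering in
 * visit(ir_expression) below: for something like "a * 3" the constant
 * fits in 16 bits, so a single MUL suffices, while "a * b" for arbitrary
 * 32-bit operands needs the full MUL/MACH/MOV sequence.
 */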
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   if (try_emit_sat(ir))
      return;

   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
         return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->print();
         exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(XOR(result_dst, op[0], src_reg(1)));
      break;

   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;

   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;

   case ir_unop_sign:
      emit(MOV(result_dst, src_reg(0.0f)));

      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
      inst = emit(MOV(result_dst, src_reg(1.0f)));
      inst->predicate = BRW_PREDICATE_NORMAL;

      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
      inst = emit(MOV(result_dst, src_reg(-1.0f)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;

   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdy:
      assert(!"derivatives not valid in vertex shader");
      break;

   case ir_unop_bitfield_reverse:
      emit(BFREV(result_dst, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(result_dst, op[0]));
      break;
   case ir_unop_find_msb: {
      src_reg temp = src_reg(this, glsl_type::uint_type);

      inst = emit(FBH(dst_reg(temp), op[0]));
      inst->dst.writemask = WRITEMASK_XYZW;

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      temp.swizzle = BRW_SWIZZLE_NOOP;
      emit(MOV(result_dst, temp));

      src_reg src_tmp = src_reg(result_dst);
      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));

      src_tmp.negate = true;
      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
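      /* Worked example: findMSB(0x00010000).  FBH returns 15 (15 leading
       * zero bits before the top set bit), the CMP sees a result != -1,
       * and the predicated ADD computes 31 - 15 = 16, the LSB-side index
       * GLSL expects.
       */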
      break;
   }

   case ir_unop_find_lsb:
      emit(FBL(result_dst, op[0]));
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits of one of
          * the operands (src0 through SNB, src1 on IVB and later).  The MACH
          * accumulates in the contribution of the upper 16 bits of that
          * operand.  If we can determine that one of the args is in the low
          * 16 bits, though, we can just emit a single MUL.
          */
         if (is_16bit_constant(ir->operands[0])) {
            if (intel->gen < 7)
               emit(MUL(result_dst, op[0], op[1]));
            else
               emit(MUL(result_dst, op[1], op[0]));
         } else if (is_16bit_constant(ir->operands[1])) {
            if (intel->gen < 7)
               emit(MUL(result_dst, op[1], op[0]));
            else
               emit(MUL(result_dst, op[0], op[1]));
         } else {
            struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);

            emit(MUL(acc, op[0], op[1]));
            emit(MACH(dst_null_d(), op[0], op[1]));
            emit(MOV(result_dst, src_reg(acc)));
         }
      } else {
         emit(MUL(result_dst, op[0], op[1]));
      }
      break;
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
      break;
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      emit(CMP(result_dst, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      emit(AND(result_dst, result_src, src_reg(0x1)));
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
         emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(1)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;

   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_F;
      break;

   case ir_unop_bitcast_f2i:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_D;
      break;

   case ir_unop_bitcast_f2u:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_UD;
      break;

   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(result_dst, op[0]));
      break;

   case ir_unop_f2b:
   case ir_unop_i2b: {
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      emit(AND(result_dst, result_src, src_reg(1)));
      break;
   }

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
      break;
   case ir_binop_max:
      emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
      inst = emit(SHL(result_dst, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         inst = emit(ASR(result_dst, op[0], op[1]));
      else
         inst = emit(SHR(result_dst, op[0], op[1]));
      break;

   case ir_binop_bfm:
      emit(BFI1(result_dst, op[0], op[1]));
      break;

   case ir_binop_ubo_load: {
      ir_constant *uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
      src_reg offset = op[1];

      /* Now, load the vector from that offset. */
      assert(ir->type->is_vector() || ir->type->is_scalar());

      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
      packed_consts.type = result.type;
      src_reg surf_index =
         src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
      if (const_offset_ir) {
         offset = src_reg(const_offset / 16);
      } else {
         emit(SHR(dst_reg(offset), offset, src_reg(4)));
      }

      vec4_instruction *pull =
         emit(new(mem_ctx) vec4_instruction(this,
                                            VS_OPCODE_PULL_CONSTANT_LOAD,
                                            dst_reg(packed_consts),
                                            surf_index,
                                            offset));
      pull->base_mrf = 14;
      pull->mlen = 1;

      packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4);

      /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
      if (ir->type->base_type == GLSL_TYPE_BOOL) {
         emit(CMP(result_dst, packed_consts, src_reg(0u),
                  BRW_CONDITIONAL_NZ));
         emit(AND(result_dst, result, src_reg(0x1)));
      } else {
         emit(MOV(result_dst, packed_consts));
      }
      break;
   }

   case ir_binop_vector_extract:
      assert(!"should have been lowered by vec_index_to_cond_assign");
      break;

   case ir_triop_lrp:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(LRP(result_dst, op[2], op[1], op[0]));
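      /* i.e. GLSL's mix(x, y, a) arrives as op[0] = x, op[1] = y,
       * op[2] = a, while the hardware LRP computes
       * src0 * src1 + (1 - src0) * src2, so the interpolant must be
       * source 0.
       */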
      break;

   case ir_triop_bfi:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      emit(BFI2(result_dst, op[0], op[1], op[2]));
      break;

   case ir_triop_bitfield_extract:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_vector_insert:
      assert(!"should have been lowered by lower_vector_insert");
      break;

   case ir_quadop_bitfield_insert:
      assert(!"not reached: should be handled by "
              "bitfield_insert_to_bfm_bfi\n");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_pack_half_2x16:
      emit_pack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_unpack_half_2x16:
      emit_unpack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
      assert(!"not reached: should be handled by lower_packing_builtins");
      break;
   case ir_unop_unpack_half_2x16_split_x:
   case ir_unop_unpack_half_2x16_split_y:
   case ir_binop_pack_half_2x16_split:
      assert(!"not reached: should not occur in vertex shader");
      break;
   }
}
void
vec4_visitor::visit(ir_swizzle *ir)
{
   src_reg src;
   int i = 0;
   int swizzle[4];

   /* Note that this is only swizzles in expressions, not those on the left
    * hand side of an assignment, which do write masking.  See ir_assignment
    * for that.
    */

   ir->val->accept(this);
   src = this->result;
   assert(src.file != BAD_FILE);

   for (i = 0; i < ir->type->vector_elements; i++) {
      switch (i) {
      case 0:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
         break;
      case 1:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
         break;
      case 2:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
         break;
      case 3:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
         break;
      }
   }
   for (; i < 4; i++) {
      /* Replicate the last channel out. */
      swizzle[i] = swizzle[ir->type->vector_elements - 1];
   }

   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   this->result = src;
}
void
vec4_visitor::visit(ir_dereference_variable *ir)
{
   const struct glsl_type *type = ir->type;
   dst_reg *reg = variable_storage(ir->var);

   if (!reg) {
      fail("Failed to find variable storage for %s\n", ir->var->name);
      this->result = src_reg(brw_null_reg());
      return;
   }

   this->result = src_reg(*reg);

   /* System values get their swizzle from the dst_reg writemask */
   if (ir->var->mode == ir_var_system_value)
      return;

   if (type->is_scalar() || type->is_vector() || type->is_matrix())
      this->result.swizzle = swizzle_for_size(type->vector_elements);
}
int
vec4_visitor::compute_array_stride(ir_dereference_array *ir)
{
   /* Under normal circumstances array elements are stored consecutively, so
    * the stride is equal to the size of the array element.
    */
   return type_size(ir->type);
}

void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int array_stride = compute_array_stride(ir);

   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * array_stride;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (array_stride == 1) {
         index_reg = this->result;
      } else {
         index_reg = src_reg(this, glsl_type::int_type);

         emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
      }

      if (src.reladdr) {
         src_reg temp = src_reg(this, glsl_type::int_type);

         emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

         index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}
void
vec4_visitor::visit(ir_dereference_record *ir)
{
   unsigned int i;
   const glsl_type *struct_type = ir->record->type;
   int offset = 0;

   ir->record->accept(this);

   for (i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      this->result.swizzle = BRW_SWIZZLE_NOOP;
   this->result.type = brw_type_for_base_type(ir->type);

   this->result.reg_offset += offset;
}
/**
 * We want to be careful in assignment setup to hit the actual storage
 * instead of potentially using a temporary like we might with the
 * ir_dereference handler.
 */
static dst_reg
get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
{
   /* The LHS must be a dereference.  If the LHS is a variable indexed array
    * access of a vector, it must be separated into a series of conditional
    * moves before reaching this point (see ir_vec_index_to_cond_assign).
    */
   assert(ir->as_dereference());
   ir_dereference_array *deref_array = ir->as_dereference_array();
   if (deref_array) {
      assert(!deref_array->array->type->is_vector());
   }

   /* Use the rvalue deref handler for the most part.  We'll ignore
    * swizzles in it and write swizzles using writemask, though.
    */
   ir->accept(v);
   return dst_reg(v->result);
}
void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
                              const struct glsl_type *type, uint32_t predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                         type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
         emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   dst->writemask = (1 << type->vector_elements) - 1;

   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                     dst_reg dst,
                                     src_reg src,
                                     vec4_instruction *pre_rhs_inst,
                                     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that the last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */
   for (unsigned i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         if (!(last_rhs_inst->dst.writemask & (1 << i)))
            return false;

         if (BRW_GET_SWZ(src.swizzle, i) != i)
            return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   uint32_t predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
         emit_bool_to_cond_code(ir->condition, &predicate);
      }

      /* emit_block_move doesn't account for swizzles in the source register.
       * This should be ok, since the source register is a structure or an
       * array, and those can't be swizzled.  But double-check to be sure.
       */
      assert(src.swizzle ==
             (ir->rhs->type->is_matrix()
              ? swizzle_for_size(ir->rhs->type->vector_elements)
              : BRW_SWIZZLE_NOOP));

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i = 0;

   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
          ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
         break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
         swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
         swizzles[i] = first_enabled_chan;
   }

   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                              swizzles[2], swizzles[3]);
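   /* For example, "v.zw = u.xy" arrives with write_mask ZW and a vec2 RHS
    * swizzled xyyy; the remap above produces yyxy, so channel Z reads u.x,
    * channel W reads u.y, and the unwritten channels just replicate an
    * enabled channel.
    */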
   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}
void
vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
{
   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
      foreach_list(node, &ir->components) {
         ir_constant *field_value = (ir_constant *)node;

         emit_constant_values(dst, field_value);
      }
      return;
   }

   if (ir->type->is_array()) {
      for (unsigned int i = 0; i < ir->type->length; i++) {
         emit_constant_values(dst, ir->array_elements[i]);
      }
      return;
   }

   if (ir->type->is_matrix()) {
      for (int i = 0; i < ir->type->matrix_columns; i++) {
         float *vec = &ir->value.f[i * ir->type->vector_elements];

         for (int j = 0; j < ir->type->vector_elements; j++) {
            dst->writemask = 1 << j;
            dst->type = BRW_REGISTER_TYPE_F;

            emit(MOV(*dst, src_reg(vec[j])));
         }
         dst->reg_offset++;
      }
      return;
   }

   int remaining_writemask = (1 << ir->type->vector_elements) - 1;

   for (int i = 0; i < ir->type->vector_elements; i++) {
      if (!(remaining_writemask & (1 << i)))
         continue;

      dst->writemask = 1 << i;
      dst->type = brw_type_for_base_type(ir->type);

      /* Find other components that match the one we're about to
       * write.  Emits fewer instructions for things like vec4(0.5,
       * 1.5, 0.5, 1.5).
       */
      for (int j = i + 1; j < ir->type->vector_elements; j++) {
         if (ir->type->base_type == GLSL_TYPE_BOOL) {
            if (ir->value.b[i] == ir->value.b[j])
               dst->writemask |= (1 << j);
         } else {
            /* u, i, and f storage all line up, so no need for a
             * switch case for comparing each type.
             */
            if (ir->value.u[i] == ir->value.u[j])
               dst->writemask |= (1 << j);
         }
      }

      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
         emit(MOV(*dst, src_reg(ir->value.f[i])));
         break;
      case GLSL_TYPE_INT:
         emit(MOV(*dst, src_reg(ir->value.i[i])));
         break;
      case GLSL_TYPE_UINT:
         emit(MOV(*dst, src_reg(ir->value.u[i])));
         break;
      case GLSL_TYPE_BOOL:
         emit(MOV(*dst, src_reg(ir->value.b[i])));
         break;
      default:
         assert(!"Non-float/uint/int/bool constant");
         break;
      }

      remaining_writemask &= ~dst->writemask;
   }
   dst->reg_offset++;
}
void
vec4_visitor::visit(ir_constant *ir)
{
   dst_reg dst = dst_reg(this, ir->type);
   this->result = src_reg(dst);

   emit_constant_values(&dst, ir);
}
void
vec4_visitor::visit(ir_call *ir)
{
   assert(!"not reached");
}
void
vec4_visitor::visit(ir_texture *ir)
{
   int sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   src_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);
      coordinate = this->result;
   }

   src_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
   src_reg lod, dPdx, dPdy, sample_index;
   switch (ir->op) {
   case ir_tex:
      lod = src_reg(0.0f);
      lod_type = glsl_type::float_type;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      lod_type = ir->lod_info.lod->type;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      sample_index_type = ir->lod_info.sample_index->type;
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      dPdx = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      dPdy = this->result;

      lod_type = ir->lod_info.grad.dPdx->type;
      break;
   default:
      break;
   }
   vec4_instruction *inst = NULL;
   switch (ir->op) {
   case ir_tex:
   case ir_txl:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
      break;
   case ir_txd:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
      break;
   case ir_txf:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
      break;
   case ir_txf_ms:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
      break;
   case ir_txs:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_txb:
      assert(!"TXB is not valid for vertex shaders.");
      break;
   case ir_lod:
      assert(!"LOD is not valid for vertex shaders.");
      break;
   }

   bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;

   /* Texel offsets go in the message header; Gen4 also requires headers. */
   inst->header_present = use_texture_offset || intel->gen < 5;
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->sampler = sampler;
   inst->dst = dst_reg(this, ir->type);
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   if (use_texture_offset)
      inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2369 /* MRF for the first parameter */
2370 int param_base = inst->base_mrf + inst->header_present;
   if (ir->op == ir_txs) {
      int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   } else {
      int i, coord_mask = 0, zero_mask = 0;
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      for (i = 0; i < ir->coordinate->type->vector_elements; i++)
         coord_mask |= (1 << i);
      for (; i < 4; i++)
         zero_mask |= (1 << i);
      if (ir->offset && ir->op == ir_txf) {
         /* It appears that the ld instruction used for txf does its
          * address bounds check before adding in the offset.  To work
          * around this, just add the integer offset to the integer
          * texel coordinate, and don't put the offset in the header.
          */
         ir_constant *offset = ir->offset->as_constant();
         assert(offset);

         for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
            src_reg src = coordinate;
            src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
                                       BRW_GET_SWZ(src.swizzle, j),
                                       BRW_GET_SWZ(src.swizzle, j),
                                       BRW_GET_SWZ(src.swizzle, j));
            emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
                     src, offset->value.i[j]));
         }
      } else {
         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
                  coordinate));
      }
      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
               src_reg(0)));

      /* Load the shadow comparitor */
      if (ir->shadow_comparitor && ir->op != ir_txd) {
         emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }
      /* Load the LOD info */
      if (ir->op == ir_tex || ir->op == ir_txl) {
         int mrf, writemask;
         if (intel->gen >= 5) {
            mrf = param_base + 1;
            if (ir->shadow_comparitor) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* intel->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_Z;
         }
         emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
      } else if (ir->op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
      } else if (ir->op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
                  sample_index));
         inst->mlen++;

         /* on Gen7, there is an additional MCS parameter here after SI,
          * but we don't bother to emit it since it's always zero. If
          * we start supporting texturing from CMS surfaces, this will have
          * to change
          */
      } else if (ir->op == ir_txd) {
         const glsl_type *type = lod_type;

         if (intel->gen >= 5) {
            dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
            inst->mlen++;

            if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
               dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
               dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
               inst->mlen++;

               if (ir->shadow_comparitor) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   ir->shadow_comparitor->type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* intel->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
            inst->mlen += 2;
         }
      }
   }

   emit(inst);
   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         emit_math(SHADER_OPCODE_INT_QUOTIENT,
                   with_writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), src_reg(6));
      }
   }

   swizzle_result(ir, src_reg(inst->dst), sampler);
}
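/**
 * Apply the sampler's swizzle key (e.g. from GL_EXT_texture_swizzle state)
 * to a sampled result, emitting MOVs for the copied, zeroed, and one-filled
 * channels as needed.
 */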
void
vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
{
   int s = key->tex.swizzles[sampler];

   this->result = src_reg(this, ir->type);
   dst_reg swizzled_result(this->result);

   if (ir->op == ir_txs || ir->type == glsl_type::float_type
       || s == SWIZZLE_NOOP) {
      emit(MOV(swizzled_result, orig_val));
      return;
   }

   int zero_mask = 0, one_mask = 0, copy_mask = 0;
   int swizzle[4] = {0};

   for (int i = 0; i < 4; i++) {
      switch (GET_SWZ(s, i)) {
      case SWIZZLE_ZERO:
         zero_mask |= (1 << i);
         break;
      case SWIZZLE_ONE:
         one_mask |= (1 << i);
         break;
      default:
         copy_mask |= (1 << i);
         swizzle[i] = GET_SWZ(s, i);
         break;
      }
   }

   if (copy_mask) {
      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
      swizzled_result.writemask = copy_mask;
      emit(MOV(swizzled_result, orig_val));
   }

   if (zero_mask) {
      swizzled_result.writemask = zero_mask;
      emit(MOV(swizzled_result, src_reg(0.0f)));
   }

   if (one_mask) {
      swizzled_result.writemask = one_mask;
      emit(MOV(swizzled_result, src_reg(1.0f)));
   }
}
void
vec4_visitor::visit(ir_return *ir)
{
   assert(!"not reached");
}

void
vec4_visitor::visit(ir_discard *ir)
{
   assert(!"not reached");
}
void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen == 6) {
      emit_if_gen6(ir);
   } else {
      uint32_t predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}
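/**
 * Compute the normalized device coordinate slot consumed by the fixed
 * function units on pre-Gen6 hardware: (x/w, y/w, z/w, 1/w).
 */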
void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC] = ndc;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
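/**
 * Fill the VUE header slot that carries point size and flags: on pre-Gen6
 * this packs the point size with the user-clip and negative-RHW workaround
 * flags, while on Gen6+ it holds point size in .w and (when written) the
 * layer index in .y.
 */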
void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (intel->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        key->userclip_active || brw->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;
      GLuint i;

      emit(MOV(header1, 0u));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
      }

      current_annotation = "Clipping flags";
      for (i = 0; i < key->nr_userclip_plane_consts; i++) {
         vec4_instruction *inst;
         gl_varying_slot slot = (prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)
            ? VARYING_SLOT_CLIP_VERTEX : VARYING_SLOT_POS;

         inst = emit(DP4(dst_null_f(), src_reg(output_reg[slot]),
                         src_reg(this->userplane[i])));
         inst->conditional_mod = BRW_CONDITIONAL_L;

         inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (intel->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VARYING_SLOT_PSIZ])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_LAYER])));
      }
   }
}
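/**
 * Compute a group of four gl_ClipDistance values from the user clip planes.
 * Only used when the shader itself doesn't write gl_ClipDistance; @offset
 * selects whether this slot holds distances 0-3 or 4-7.
 */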
void
vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
{
   if (intel->gen < 6) {
      /* Clip distance slots are set aside in gen5, but they are not used.  It
       * is not clear whether we actually need to set aside space for them,
       * but the performance cost is negligible.
       */
      return;
   }

   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */
   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
      clip_vertex = VARYING_SLOT_POS;
   }

   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
        ++i) {
      emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
               src_reg(output_reg[clip_vertex]),
               src_reg(this->userplane[i + offset])));
   }
}
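/**
 * Copy an ordinary varying from its staging register into the URB slot,
 * saturating front/back colors when the key asks for clamped vertex colors.
 */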
void
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
{
   assert(varying < VARYING_SLOT_MAX);
   reg.type = output_reg[varying].type;
   current_annotation = output_reg_annotation[varying];
   /* Copy the register, saturating if necessary */
   vec4_instruction *inst = emit(MOV(reg,
                                     src_reg(output_reg[varying])));
   if ((varying == VARYING_SLOT_COL0 ||
        varying == VARYING_SLOT_COL1 ||
        varying == VARYING_SLOT_BFC0 ||
        varying == VARYING_SLOT_BFC1) &&
       key->clamp_vertex_color) {
      inst->saturate = true;
   }
}
void
vec4_visitor::emit_urb_slot(int mrf, int varying)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
      break;
   case BRW_VARYING_SLOT_POS_DUPLICATE:
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
      break;
   case VARYING_SLOT_CLIP_DIST0:
   case VARYING_SLOT_CLIP_DIST1:
      if (this->key->uses_clip_distance) {
         emit_generic_urb_slot(reg, varying);
      } else {
         current_annotation = "user clip distances";
         emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
      }
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, varying);
      break;
   }
}
static int
align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
{
   struct intel_context *intel = &brw->intel;

   if (intel->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
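/* Example: a write using the header in MRF 1 plus payload in MRFs 2-4 has
 * mlen 4, i.e. three payload registers.  Three is odd, so on Gen6+ the
 * length is padded to 5, making the URB data written an even two-register
 * multiple as interleaved writes require.
 */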
void
vec4_vs_visitor::emit_urb_write_header(int mrf)
{
   /* No need to do anything for VS; an implied write to this MRF will be
    * performed by VS_OPCODE_URB_WRITE.
    */
   (void) mrf;
}

vec4_instruction *
vec4_vs_visitor::emit_urb_write_opcode(bool complete)
{
   /* For VS, the URB writes end the thread. */
   if (complete) {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();
   }

   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
   inst->eot = complete;

   return inst;
}
/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (intel->gen < 6) {
      emit_ndc_computation();
   }

   /* Set up the VUE data for the first URB write */
   int slot;
   for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
      emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);

      /* If this was max_usable_mrf, we can't fit anything more into this URB
       * WRITE.
       */
      if (mrf > max_usable_mrf) {
         slot++;
         break;
      }
   }

   bool complete = slot >= prog_data->vue_map.num_slots;
   current_annotation = "URB write";
   vec4_instruction *inst = emit_urb_write_opcode(complete);
   inst->base_mrf = base_mrf;
   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);

   /* Optional second URB write */
   if (!complete) {
      mrf = base_mrf + 1;

      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         assert(mrf < max_usable_mrf);

         emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
      }

      current_annotation = "URB write";
      inst = emit_urb_write_opcode(true /* complete */);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
      /* URB destination offset.  In the previous write, we got MRFs
       * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
       * URB row increments, and each of our MRFs is half of one of
       * those, since we're doing interleaved writes.
       */
      inst->offset = (max_usable_mrf - base_mrf) / 2;
   }
}
void
vec4_vs_visitor::emit_thread_end()
{
   /* For VS, we always end the thread by emitting a single vertex.
    * emit_urb_write_opcode() will take care of setting the eot flag on the
    * SEND instruction.
    */
   emit_vertex();
}
src_reg
vec4_visitor::get_scratch_offset(vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (intel->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
      emit_before(inst, MUL(dst_reg(index),
                            index, src_reg(message_header_scale)));

      return index;
   } else {
      return src_reg(reg_offset * message_header_scale);
   }
}
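/* Example: a constant reg_offset of 3 with no reladdr becomes an immediate
 * offset of 6 on Gen6+ (vec4 units, doubled for interleaving) or 96 on
 * pre-Gen6 (byte units: 3 * 2 * 16).
 */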
src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (intel->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else {
      int message_header_scale = intel->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);

   emit_before(inst, SCRATCH_READ(temp, index));
}
/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
{
   int reg_offset = base_offset + inst->dst.reg_offset;
   src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   src_reg temp = src_reg(this, glsl_type::vec4_type);
   temp.type = inst->dst.type;
   int first_writemask_chan = ffs(inst->dst.writemask) - 1;
   int swizzles[4];
   for (int i = 0; i < 4; i++)
      if (inst->dst.writemask & (1 << i))
         swizzles[i] = i;
      else
         swizzles[i] = first_writemask_chan;
   temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                               swizzles[2], swizzles[3]);

   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                       inst->dst.writemask));
   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
   write->predicate = inst->predicate;
   write->ir = inst->ir;
   write->annotation = inst->annotation;
   inst->insert_after(write);

   inst->dst.file = temp.file;
   inst->dst.reg = temp.reg;
   inst->dst.reg_offset = temp.reg_offset;
   inst->dst.reladdr = NULL;
}
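/* Example: if the instruction only writes .y, the temporary is read back
 * with swizzle .yyyy, so the scratch write never sources an uninitialized
 * channel.
 */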
/* We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->virtual_grf_count];

   for (int i = 0; i < this->virtual_grf_count; i++) {
      scratch_loc[i] = -1;
   }

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF && inst->dst.reladdr &&
          scratch_loc[inst->dst.reg] == -1) {
         scratch_loc[inst->dst.reg] = c->last_scratch;
         c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
      }

      for (int i = 0 ; i < 3; i++) {
         src_reg *src = &inst->src[i];

         if (src->file == GRF && src->reladdr &&
             scratch_loc[src->reg] == -1) {
            scratch_loc[src->reg] = c->last_scratch;
            c->last_scratch += this->virtual_grf_sizes[src->reg];
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
         emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
      }

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
            continue;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_scratch_read(inst, temp, inst->src[i],
                           scratch_loc[inst->src[i].reg]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
   src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   if (intel->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
      grf_offset.type = offset.type;
      emit_before(inst, MOV(grf_offset, offset));

      load = new(mem_ctx) vec4_instruction(this,
                                           VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           temp, index, src_reg(grf_offset));
   } else {
      load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                           temp, index, offset);
      load->base_mrf = 14;
      load->mlen = 1;
   }
   emit_before(inst, load);
}
/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++]
                  = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}
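/**
 * Source modifiers don't give the desired result for negated unsigned-dword
 * operands in some contexts (e.g. the sources of a Gen6+ IF with embedded
 * comparison), so resolve the negation into a temporary with an explicit
 * MOV first.
 */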
void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}
vec4_visitor::vec4_visitor(struct brw_context *brw,
                           struct brw_vec4_compile *c,
                           struct gl_program *prog,
                           const struct brw_vec4_prog_key *key,
                           struct brw_vec4_prog_data *prog_data,
                           struct gl_shader_program *shader_prog,
                           struct brw_shader *shader,
                           void *mem_ctx,
                           bool debug_flag)
   : debug_flag(debug_flag)
{
   this->brw = brw;
   this->intel = &brw->intel;
   this->ctx = &intel->ctx;
   this->shader_prog = shader_prog;
   this->shader = shader;

   this->mem_ctx = mem_ctx;
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   this->c = c;
   this->prog = prog;
   this->key = key;
   this->prog_data = prog_data;

   this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
}
vec4_visitor::~vec4_visitor()
{
   hash_table_dtor(this->variable_ht);
}
vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
                                 struct brw_vs_compile *vs_compile,
                                 struct brw_vs_prog_data *vs_prog_data,
                                 struct gl_shader_program *prog,
                                 struct brw_shader *shader,
                                 void *mem_ctx)
   : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
                  &vs_compile->key.base, &vs_prog_data->base, prog, shader,
                  mem_ctx, INTEL_DEBUG & DEBUG_VS),
     vs_compile(vs_compile),
     vs_prog_data(vs_prog_data)
{
}
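/**
 * Mark the compile as failed, recording the first failure message for the
 * caller and printing it when the stage's debug flag is set.
 */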
void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (debug_flag) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */