src/intel/compiler/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "compiler/glsl/ir.h"
  25 #include "brw_fs.h"
  26 #include "brw_fs_surface_builder.h"
  27 #include "brw_nir.h"
  28 #include "util/u_math.h"
  29
  30 using namespace brw;
  31 using namespace brw::surface_access;
  32
  33 void
  34 fs_visitor::emit_nir_code()
  35 {
  36    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  37     * be converted to reads/writes of these arrays
  38     */
  39    nir_setup_outputs();
  40    nir_setup_uniforms();
  41    nir_emit_system_values();
  42
  43    /* get the main function and emit it */
  44    nir_foreach_function(function, nir) {
  45       assert(strcmp(function->name, "main") == 0);
  46       assert(function->impl);
  47       nir_emit_impl(function->impl);
  48    }
  49 }
  50
  51 void
  52 fs_visitor::nir_setup_outputs()
  53 {
  54    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
  55       return;
  56
  57    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
  58
  59    /* Calculate the size of output registers in a separate pass, before
  60     * allocating them.  With ARB_enhanced_layouts, multiple output variables
  61     * may occupy the same slot, but have different type sizes.
  62     */
  63    nir_foreach_variable(var, &nir->outputs) {
  64       const int loc = var->data.driver_location;
  65       const unsigned var_vec4s =
  66          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
  67                            : type_size_vec4(var->type);
  68       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
  69    }
  70
  71    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
  72       if (vec4s[loc] == 0) {
  73          loc++;
  74          continue;
  75       }
  76
  77       unsigned reg_size = vec4s[loc];
  78
  79       /* Check if there are any ranges that start within this range and extend
  80        * past it. If so, include them in this allocation.
  81        */
  82       for (unsigned i = 1; i < reg_size; i++)
  83          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
  84
  85       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
  86       for (unsigned i = 0; i < reg_size; i++)
  87          outputs[loc + i] = offset(reg, bld, 4 * i);
  88
  89       loc += reg_size;
  90    }
  91 }
  92
  93 void
  94 fs_visitor::nir_setup_uniforms()
  95 {
  96    /* Only the first compile gets to set up uniforms. */
  97    if (push_constant_loc) {
  98       assert(pull_constant_loc);
  99       return;
 100    }
 101
 102    uniforms = nir->num_uniforms / 4;
 103
 104    if (stage == MESA_SHADER_COMPUTE) {
 105       /* Add a uniform for the thread local id.  It must be the last uniform
 106        * on the list.
 107        */
 108       assert(uniforms == prog_data->nr_params);
 109       uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
 110       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
 111       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
 112    }
 113 }
 114
 115 static bool
 116 emit_system_values_block(nir_block *block, fs_visitor *v)
 117 {
 118    fs_reg *reg;
 119
 120    nir_foreach_instr(instr, block) {
 121       if (instr->type != nir_instr_type_intrinsic)
 122          continue;
 123
 124       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 125       switch (intrin->intrinsic) {
 126       case nir_intrinsic_load_vertex_id:
 127       case nir_intrinsic_load_base_vertex:
 128          unreachable("should be lowered by nir_lower_system_values().");
 129
 130       case nir_intrinsic_load_vertex_id_zero_base:
 131       case nir_intrinsic_load_is_indexed_draw:
 132       case nir_intrinsic_load_first_vertex:
 133       case nir_intrinsic_load_instance_id:
 134       case nir_intrinsic_load_base_instance:
 135       case nir_intrinsic_load_draw_id:
 136          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 137
 138       case nir_intrinsic_load_invocation_id:
 139          if (v->stage == MESA_SHADER_TESS_CTRL)
 140             break;
 141          assert(v->stage == MESA_SHADER_GEOMETRY);
 142          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 143          if (reg->file == BAD_FILE) {
 144             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 145             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 146             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 147             abld.SHR(iid, g1, brw_imm_ud(27u));
 148             *reg = iid;
 149          }
 150          break;
 151
 152       case nir_intrinsic_load_sample_pos:
 153          assert(v->stage == MESA_SHADER_FRAGMENT);
 154          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 155          if (reg->file == BAD_FILE)
 156             *reg = *v->emit_samplepos_setup();
 157          break;
 158
 159       case nir_intrinsic_load_sample_id:
 160          assert(v->stage == MESA_SHADER_FRAGMENT);
 161          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 162          if (reg->file == BAD_FILE)
 163             *reg = *v->emit_sampleid_setup();
 164          break;
 165
 166       case nir_intrinsic_load_sample_mask_in:
 167          assert(v->stage == MESA_SHADER_FRAGMENT);
 168          assert(v->devinfo->gen >= 7);
 169          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 170          if (reg->file == BAD_FILE)
 171             *reg = *v->emit_samplemaskin_setup();
 172          break;
 173
 174       case nir_intrinsic_load_work_group_id:
 175          assert(v->stage == MESA_SHADER_COMPUTE);
 176          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 177          if (reg->file == BAD_FILE)
 178             *reg = *v->emit_cs_work_group_id_setup();
 179          break;
 180
 181       case nir_intrinsic_load_helper_invocation:
 182          assert(v->stage == MESA_SHADER_FRAGMENT);
 183          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 184          if (reg->file == BAD_FILE) {
 185             const fs_builder abld =
 186                v->bld.annotate("gl_HelperInvocation", NULL);
 187
 188             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 189              * pixel mask is in g1.7 of the thread payload.
 190              *
 191              * We move the per-channel pixel enable bit to the low bit of each
 192              * channel by shifting the byte containing the pixel mask by the
 193              * vector immediate 0x76543210UV.
 194              *
 195              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 196              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 197              * masks for 2 and 3) in SIMD16.
 198              */
 199             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 200
 201             for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
 202                const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
 203                hbld.SHR(offset(shifted, hbld, i),
 204                         stride(retype(brw_vec1_grf(1 + i, 7),
 205                                       BRW_REGISTER_TYPE_UB),
 206                                1, 8, 0),
 207                         brw_imm_v(0x76543210));
 208             }
 209
 210             /* A set bit in the pixel mask means the channel is enabled, but
 211              * that is the opposite of gl_HelperInvocation so we need to invert
 212              * the mask.
 213              *
 214              * The negate source-modifier bit of logical instructions on Gen8+
 215              * performs 1's complement negation, so we can use that instead of
 216              * a NOT instruction.
 217              */
 218             fs_reg inverted = negate(shifted);
 219             if (v->devinfo->gen < 8) {
 220                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 221                abld.NOT(inverted, shifted);
 222             }
 223
 224             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 225              * with 1 and negating.
 226              */
 227             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 228             abld.AND(anded, inverted, brw_imm_uw(1));
 229
 230             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 231             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 232             *reg = dst;
 233          }
 234          break;
 235
 236       default:
 237          break;
 238       }
 239    }
 240
 241    return true;
 242 }
 243
 244 void
 245 fs_visitor::nir_emit_system_values()
 246 {
 247    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 248    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 249       nir_system_values[i] = fs_reg();
 250    }
 251
 252    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
 253     * never end up using it.
 254     */
 255    {
 256       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
 257       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
 258       reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
 259
 260       const fs_builder allbld8 = abld.group(8, 0).exec_all();
 261       allbld8.MOV(reg, brw_imm_v(0x76543210));
 262       if (dispatch_width > 8)
 263          allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
 264       if (dispatch_width > 16) {
 265          const fs_builder allbld16 = abld.group(16, 0).exec_all();
 266          allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
 267       }
 268    }
 269
 270    nir_foreach_function(function, nir) {
 271       assert(strcmp(function->name, "main") == 0);
 272       assert(function->impl);
 273       nir_foreach_block(block, function->impl) {
 274          emit_system_values_block(block, this);
 275       }
 276    }
 277 }
 278
 279 /*
 280  * Returns a type based on a reference_type (word, float, half-float) and a
 281  * given bit_size.
 282  *
 283  * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
 284  *
 285  * @FIXME: 64-bit return types are always DF on integer types to maintain
 286  * compability with uses of DF previously to the introduction of int64
 287  * support.
 288  */
 289 static brw_reg_type
 290 brw_reg_type_from_bit_size(const unsigned bit_size,
 291                            const brw_reg_type reference_type)
 292 {
 293    switch(reference_type) {
 294    case BRW_REGISTER_TYPE_HF:
 295    case BRW_REGISTER_TYPE_F:
 296    case BRW_REGISTER_TYPE_DF:
 297       switch(bit_size) {
 298       case 16:
 299          return BRW_REGISTER_TYPE_HF;
 300       case 32:
 301          return BRW_REGISTER_TYPE_F;
 302       case 64:
 303          return BRW_REGISTER_TYPE_DF;
 304       default:
 305          unreachable("Invalid bit size");
 306       }
 307    case BRW_REGISTER_TYPE_B:
 308    case BRW_REGISTER_TYPE_W:
 309    case BRW_REGISTER_TYPE_D:
 310    case BRW_REGISTER_TYPE_Q:
 311       switch(bit_size) {
 312       case 8:
 313          return BRW_REGISTER_TYPE_B;
 314       case 16:
 315          return BRW_REGISTER_TYPE_W;
 316       case 32:
 317          return BRW_REGISTER_TYPE_D;
 318       case 64:
 319          return BRW_REGISTER_TYPE_Q;
 320       default:
 321          unreachable("Invalid bit size");
 322       }
 323    case BRW_REGISTER_TYPE_UB:
 324    case BRW_REGISTER_TYPE_UW:
 325    case BRW_REGISTER_TYPE_UD:
 326    case BRW_REGISTER_TYPE_UQ:
 327       switch(bit_size) {
 328       case 8:
 329          return BRW_REGISTER_TYPE_UB;
 330       case 16:
 331          return BRW_REGISTER_TYPE_UW;
 332       case 32:
 333          return BRW_REGISTER_TYPE_UD;
 334       case 64:
 335          return BRW_REGISTER_TYPE_UQ;
 336       default:
 337          unreachable("Invalid bit size");
 338       }
 339    default:
 340       unreachable("Unknown type");
 341    }
 342 }
 343
 344 void
 345 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 346 {
 347    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 348    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 349       nir_locals[i] = fs_reg();
 350    }
 351
 352    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 353       unsigned array_elems =
 354          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 355       unsigned size = array_elems * reg->num_components;
 356       const brw_reg_type reg_type =
 357          brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
 358       nir_locals[reg->index] = bld.vgrf(reg_type, size);
 359    }
 360
 361    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 362                              impl->ssa_alloc);
 363
 364    nir_emit_cf_list(&impl->body);
 365 }
 366
 367 void
 368 fs_visitor::nir_emit_cf_list(exec_list *list)
 369 {
 370    exec_list_validate(list);
 371    foreach_list_typed(nir_cf_node, node, node, list) {
 372       switch (node->type) {
 373       case nir_cf_node_if:
 374          nir_emit_if(nir_cf_node_as_if(node));
 375          break;
 376
 377       case nir_cf_node_loop:
 378          nir_emit_loop(nir_cf_node_as_loop(node));
 379          break;
 380
 381       case nir_cf_node_block:
 382          nir_emit_block(nir_cf_node_as_block(node));
 383          break;
 384
 385       default:
 386          unreachable("Invalid CFG node block");
 387       }
 388    }
 389 }
 390
 391 void
 392 fs_visitor::nir_emit_if(nir_if *if_stmt)
 393 {
 394    /* first, put the condition into f0 */
 395    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 396                             retype(get_nir_src(if_stmt->condition),
 397                                    BRW_REGISTER_TYPE_D));
 398    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 399
 400    bld.IF(BRW_PREDICATE_NORMAL);
 401
 402    nir_emit_cf_list(&if_stmt->then_list);
 403
 404    /* note: if the else is empty, dead CF elimination will remove it */
 405    bld.emit(BRW_OPCODE_ELSE);
 406
 407    nir_emit_cf_list(&if_stmt->else_list);
 408
 409    bld.emit(BRW_OPCODE_ENDIF);
 410
 411    if (devinfo->gen < 7)
 412       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 413                            "in SIMD32 mode.");
 414 }
 415
 416 void
 417 fs_visitor::nir_emit_loop(nir_loop *loop)
 418 {
 419    bld.emit(BRW_OPCODE_DO);
 420
 421    nir_emit_cf_list(&loop->body);
 422
 423    bld.emit(BRW_OPCODE_WHILE);
 424
 425    if (devinfo->gen < 7)
 426       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 427                            "in SIMD32 mode.");
 428 }
 429
 430 void
 431 fs_visitor::nir_emit_block(nir_block *block)
 432 {
 433    nir_foreach_instr(instr, block) {
 434       nir_emit_instr(instr);
 435    }
 436 }
 437
 438 void
 439 fs_visitor::nir_emit_instr(nir_instr *instr)
 440 {
 441    const fs_builder abld = bld.annotate(NULL, instr);
 442
 443    switch (instr->type) {
 444    case nir_instr_type_alu:
 445       nir_emit_alu(abld, nir_instr_as_alu(instr));
 446       break;
 447
 448    case nir_instr_type_deref:
 449       /* Derefs can exist for images but they do nothing */
 450       break;
 451
 452    case nir_instr_type_intrinsic:
 453       switch (stage) {
 454       case MESA_SHADER_VERTEX:
 455          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 456          break;
 457       case MESA_SHADER_TESS_CTRL:
 458          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 459          break;
 460       case MESA_SHADER_TESS_EVAL:
 461          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
 462          break;
 463       case MESA_SHADER_GEOMETRY:
 464          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 465          break;
 466       case MESA_SHADER_FRAGMENT:
 467          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 468          break;
 469       case MESA_SHADER_COMPUTE:
 470          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 471          break;
 472       default:
 473          unreachable("unsupported shader stage");
 474       }
 475       break;
 476
 477    case nir_instr_type_tex:
 478       nir_emit_texture(abld, nir_instr_as_tex(instr));
 479       break;
 480
 481    case nir_instr_type_load_const:
 482       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 483       break;
 484
 485    case nir_instr_type_ssa_undef:
 486       /* We create a new VGRF for undefs on every use (by handling
 487        * them in get_nir_src()), rather than for each definition.
 488        * This helps register coalescing eliminate MOVs from undef.
 489        */
 490       break;
 491
 492    case nir_instr_type_jump:
 493       nir_emit_jump(abld, nir_instr_as_jump(instr));
 494       break;
 495
 496    default:
 497       unreachable("unknown instruction type");
 498    }
 499 }
 500
 501 /**
 502  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 503  * match instr.
 504  */
 505 bool
 506 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 507                                       const fs_reg &result)
 508 {
 509    if (!instr->src[0].src.is_ssa ||
 510        !instr->src[0].src.ssa->parent_instr)
 511       return false;
 512
 513    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 514       return false;
 515
 516    nir_alu_instr *src0 =
 517       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 518
 519    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
 520        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
 521       return false;
 522
 523    nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
 524    assert(element != NULL);
 525
 526    /* Element type to extract.*/
 527    const brw_reg_type type = brw_int_type(
 528       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
 529       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
 530
 531    fs_reg op0 = get_nir_src(src0->src[0].src);
 532    op0.type = brw_type_for_nir_type(devinfo,
 533       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
 534                      nir_src_bit_size(src0->src[0].src)));
 535    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 536
 537    set_saturate(instr->dest.saturate,
 538                 bld.MOV(result, subscript(op0, type, element->u32[0])));
 539    return true;
 540 }
 541
 542 bool
 543 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 544                                          const fs_reg &result)
 545 {
 546    if (!instr->src[0].src.is_ssa ||
 547        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 548       return false;
 549
 550    nir_intrinsic_instr *src0 =
 551       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 552
 553    if (src0->intrinsic != nir_intrinsic_load_front_face)
 554       return false;
 555
 556    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
 557    if (!value1 || fabsf(value1->f32[0]) != 1.0f)
 558       return false;
 559
 560    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
 561    if (!value2 || fabsf(value2->f32[0]) != 1.0f)
 562       return false;
 563
 564    fs_reg tmp = vgrf(glsl_type::int_type);
 565
 566    if (devinfo->gen >= 6) {
 567       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 568       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 569
 570       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 571        *
 572        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 573        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 574        *
 575        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 576        *
 577        * This negation looks like it's safe in practice, because bits 0:4 will
 578        * surely be TRIANGLES
 579        */
 580
 581       if (value1->f32[0] == -1.0f) {
 582          g0.negate = true;
 583       }
 584
 585       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 586              g0, brw_imm_uw(0x3f80));
 587    } else {
 588       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 589       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 590
 591       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 592        *
 593        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 594        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 595        *
 596        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 597        *
 598        * This negation looks like it's safe in practice, because bits 0:4 will
 599        * surely be TRIANGLES
 600        */
 601
 602       if (value1->f32[0] == -1.0f) {
 603          g1_6.negate = true;
 604       }
 605
 606       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 607    }
 608    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 609
 610    return true;
 611 }
 612
 613 static void
 614 emit_find_msb_using_lzd(const fs_builder &bld,
 615                         const fs_reg &result,
 616                         const fs_reg &src,
 617                         bool is_signed)
 618 {
 619    fs_inst *inst;
 620    fs_reg temp = src;
 621
 622    if (is_signed) {
 623       /* LZD of an absolute value source almost always does the right
 624        * thing.  There are two problem values:
 625        *
 626        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
 627        *   0.  However, findMSB(int(0x80000000)) == 30.
 628        *
 629        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
 630        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
 631        *
 632        *    For a value of zero or negative one, -1 will be returned.
 633        *
 634        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
 635        *   findMSB(-(1<<x)) should return x-1.
 636        *
 637        * For all negative number cases, including 0x80000000 and
 638        * 0xffffffff, the correct value is obtained from LZD if instead of
 639        * negating the (already negative) value the logical-not is used.  A
 640        * conditonal logical-not can be achieved in two instructions.
 641        */
 642       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
 643
 644       bld.ASR(temp, src, brw_imm_d(31));
 645       bld.XOR(temp, temp, src);
 646    }
 647
 648    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
 649            retype(temp, BRW_REGISTER_TYPE_UD));
 650
 651    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
 652     * from the LSB side. Subtract the result from 31 to convert the MSB
 653     * count into an LSB count.  If no bits are set, LZD will return 32.
 654     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
 655     */
 656    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
 657    inst->src[0].negate = true;
 658 }
 659
 660 static brw_rnd_mode
 661 brw_rnd_mode_from_nir_op (const nir_op op) {
 662    switch (op) {
 663    case nir_op_f2f16_rtz:
 664       return BRW_RND_MODE_RTZ;
 665    case nir_op_f2f16_rtne:
 666       return BRW_RND_MODE_RTNE;
 667    default:
 668       unreachable("Operation doesn't support rounding mode");
 669    }
 670 }
 671
 672 void
 673 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 674 {
 675    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 676    fs_inst *inst;
 677
 678    fs_reg result = get_nir_dest(instr->dest.dest);
 679    result.type = brw_type_for_nir_type(devinfo,
 680       (nir_alu_type)(nir_op_infos[instr->op].output_type |
 681                      nir_dest_bit_size(instr->dest.dest)));
 682
 683    fs_reg op[4];
 684    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 685       op[i] = get_nir_src(instr->src[i].src);
 686       op[i].type = brw_type_for_nir_type(devinfo,
 687          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
 688                         nir_src_bit_size(instr->src[i].src)));
 689       op[i].abs = instr->src[i].abs;
 690       op[i].negate = instr->src[i].negate;
 691    }
 692
 693    /* We get a bunch of mov's out of the from_ssa pass and they may still
 694     * be vectorized.  We'll handle them as a special-case.  We'll also
 695     * handle vecN here because it's basically the same thing.
 696     */
 697    switch (instr->op) {
 698    case nir_op_imov:
 699    case nir_op_fmov:
 700    case nir_op_vec2:
 701    case nir_op_vec3:
 702    case nir_op_vec4: {
 703       fs_reg temp = result;
 704       bool need_extra_copy = false;
 705       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 706          if (!instr->src[i].src.is_ssa &&
 707              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 708             need_extra_copy = true;
 709             temp = bld.vgrf(result.type, 4);
 710             break;
 711          }
 712       }
 713
 714       for (unsigned i = 0; i < 4; i++) {
 715          if (!(instr->dest.write_mask & (1 << i)))
 716             continue;
 717
 718          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 719             inst = bld.MOV(offset(temp, bld, i),
 720                            offset(op[0], bld, instr->src[0].swizzle[i]));
 721          } else {
 722             inst = bld.MOV(offset(temp, bld, i),
 723                            offset(op[i], bld, instr->src[i].swizzle[0]));
 724          }
 725          inst->saturate = instr->dest.saturate;
 726       }
 727
 728       /* In this case the source and destination registers were the same,
 729        * so we need to insert an extra set of moves in order to deal with
 730        * any swizzling.
 731        */
 732       if (need_extra_copy) {
 733          for (unsigned i = 0; i < 4; i++) {
 734             if (!(instr->dest.write_mask & (1 << i)))
 735                continue;
 736
 737             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 738          }
 739       }
 740       return;
 741    }
 742    default:
 743       break;
 744    }
 745
 746    /* At this point, we have dealt with any instruction that operates on
 747     * more than a single channel.  Therefore, we can just adjust the source
 748     * and destination registers for that channel and emit the instruction.
 749     */
 750    unsigned channel = 0;
 751    if (nir_op_infos[instr->op].output_size == 0) {
 752       /* Since NIR is doing the scalarizing for us, we should only ever see
 753        * vectorized operations with a single channel.
 754        */
 755       assert(util_bitcount(instr->dest.write_mask) == 1);
 756       channel = ffs(instr->dest.write_mask) - 1;
 757
 758       result = offset(result, bld, channel);
 759    }
 760
 761    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 762       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 763       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 764    }
 765
 766    switch (instr->op) {
 767    case nir_op_i2f32:
 768    case nir_op_u2f32:
 769       if (optimize_extract_to_float(instr, result))
 770          return;
 771       inst = bld.MOV(result, op[0]);
 772       inst->saturate = instr->dest.saturate;
 773       break;
 774
 775    case nir_op_f2f16_rtne:
 776    case nir_op_f2f16_rtz:
 777       bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
 778                brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
 779       /* fallthrough */
 780
 781       /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
 782        * on the HW gen, it is a special hw opcode or just a MOV, and
 783        * brw_F32TO16 (at brw_eu_emit) would do the work to chose.
 784        *
 785        * But if we want to use that opcode, we need to provide support on
 786        * different optimizations and lowerings. As right now HF support is
 787        * only for gen8+, it will be better to use directly the MOV, and use
 788        * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
 789        */
 790
 791    case nir_op_f2f16:
 792       inst = bld.MOV(result, op[0]);
 793       inst->saturate = instr->dest.saturate;
 794       break;
 795
 796    case nir_op_f2f64:
 797    case nir_op_f2i64:
 798    case nir_op_f2u64:
 799    case nir_op_i2f64:
 800    case nir_op_i2i64:
 801    case nir_op_u2f64:
 802    case nir_op_u2u64:
 803       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
 804        *
 805        *    "When source or destination is 64b (...), regioning in Align1
 806        *     must follow these rules:
 807        *
 808        *     1. Source and destination horizontal stride must be aligned to
 809        *        the same qword.
 810        *     (...)"
 811        *
 812        * This means that conversions from bit-sizes smaller than 64-bit to
 813        * 64-bit need to have the source data elements aligned to 64-bit.
 814        * This restriction does not apply to BDW and later.
 815        */
 816       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
 817           nir_src_bit_size(instr->src[0].src) < 64 &&
 818           (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
 819          fs_reg tmp = bld.vgrf(result.type, 1);
 820          tmp = subscript(tmp, op[0].type, 0);
 821          inst = bld.MOV(tmp, op[0]);
 822          inst = bld.MOV(result, tmp);
 823          inst->saturate = instr->dest.saturate;
 824          break;
 825       }
 826       /* fallthrough */
 827    case nir_op_f2f32:
 828    case nir_op_f2i32:
 829    case nir_op_f2u32:
 830    case nir_op_f2i16:
 831    case nir_op_f2u16:
 832    case nir_op_i2i32:
 833    case nir_op_u2u32:
 834    case nir_op_i2i16:
 835    case nir_op_u2u16:
 836    case nir_op_i2f16:
 837    case nir_op_u2f16:
 838    case nir_op_i2i8:
 839    case nir_op_u2u8:
 840       inst = bld.MOV(result, op[0]);
 841       inst->saturate = instr->dest.saturate;
 842       break;
 843
 844    case nir_op_fsign: {
 845       if (op[0].abs) {
 846          /* Straightforward since the source can be assumed to be either
 847           * strictly >= 0 or strictly <= 0 depending on the setting of the
 848           * negate flag.
 849           */
 850          set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
 851
 852          inst = (op[0].negate)
 853             ? bld.MOV(result, brw_imm_f(-1.0f))
 854             : bld.MOV(result, brw_imm_f(1.0f));
 855
 856          set_predicate(BRW_PREDICATE_NORMAL, inst);
 857
 858          if (instr->dest.saturate)
 859             inst->saturate = true;
 860
 861       } else if (type_sz(op[0].type) < 8) {
 862          /* AND(val, 0x80000000) gives the sign bit.
 863           *
 864           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 865           * zero.
 866           */
 867          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 868
 869          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 870          op[0].type = BRW_REGISTER_TYPE_UD;
 871          result.type = BRW_REGISTER_TYPE_UD;
 872          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
 873
 874          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 875          inst->predicate = BRW_PREDICATE_NORMAL;
 876          if (instr->dest.saturate) {
 877             inst = bld.MOV(result, result);
 878             inst->saturate = true;
 879          }
 880       } else {
 881          /* For doubles we do the same but we need to consider:
 882           *
 883           * - 2-src instructions can't operate with 64-bit immediates
 884           * - The sign is encoded in the high 32-bit of each DF
 885           * - We need to produce a DF result.
 886           */
 887
 888          fs_reg zero = vgrf(glsl_type::double_type);
 889          bld.MOV(zero, setup_imm_df(bld, 0.0));
 890          bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 891
 892          bld.MOV(result, zero);
 893
 894          fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
 895          bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
 896                  brw_imm_ud(0x80000000u));
 897
 898          set_predicate(BRW_PREDICATE_NORMAL,
 899                        bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 900
 901          if (instr->dest.saturate) {
 902             inst = bld.MOV(result, result);
 903             inst->saturate = true;
 904          }
 905       }
 906       break;
 907    }
 908
 909    case nir_op_isign: {
 910       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 911        *               -> non-negative val generates 0x00000000.
 912        *  Predicated OR sets 1 if val is positive.
 913        */
 914       uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
 915       assert(bit_size == 32 || bit_size == 16);
 916
 917       fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
 918       fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
 919       fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
 920
 921       bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
 922       bld.ASR(result, op[0], shift);
 923       inst = bld.OR(result, result, one);
 924       inst->predicate = BRW_PREDICATE_NORMAL;
 925       break;
 926    }
 927
 928    case nir_op_frcp:
 929       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 930       inst->saturate = instr->dest.saturate;
 931       break;
 932
 933    case nir_op_fexp2:
 934       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 935       inst->saturate = instr->dest.saturate;
 936       break;
 937
 938    case nir_op_flog2:
 939       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
 940       inst->saturate = instr->dest.saturate;
 941       break;
 942
 943    case nir_op_fsin:
 944       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
 945       inst->saturate = instr->dest.saturate;
 946       break;
 947
 948    case nir_op_fcos:
 949       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
 950       inst->saturate = instr->dest.saturate;
 951       break;
 952
 953    case nir_op_fddx:
 954       if (fs_key->high_quality_derivatives) {
 955          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 956       } else {
 957          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 958       }
 959       inst->saturate = instr->dest.saturate;
 960       break;
 961    case nir_op_fddx_fine:
 962       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 963       inst->saturate = instr->dest.saturate;
 964       break;
 965    case nir_op_fddx_coarse:
 966       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 967       inst->saturate = instr->dest.saturate;
 968       break;
 969    case nir_op_fddy:
 970       if (fs_key->high_quality_derivatives) {
 971          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
 972       } else {
 973          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
 974       }
 975       inst->saturate = instr->dest.saturate;
 976       break;
 977    case nir_op_fddy_fine:
 978       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
 979       inst->saturate = instr->dest.saturate;
 980       break;
 981    case nir_op_fddy_coarse:
 982       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
 983       inst->saturate = instr->dest.saturate;
 984       break;
 985
 986    case nir_op_iadd:
 987    case nir_op_fadd:
 988       inst = bld.ADD(result, op[0], op[1]);
 989       inst->saturate = instr->dest.saturate;
 990       break;
 991
 992    case nir_op_fmul:
 993       inst = bld.MUL(result, op[0], op[1]);
 994       inst->saturate = instr->dest.saturate;
 995       break;
 996
 997    case nir_op_imul:
 998       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 999       bld.MUL(result, op[0], op[1]);
1000       break;
1001
1002    case nir_op_imul_high:
1003    case nir_op_umul_high:
1004       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1005       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1006       break;
1007
1008    case nir_op_idiv:
1009    case nir_op_udiv:
1010       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1011       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1012       break;
1013
1014    case nir_op_uadd_carry:
1015       unreachable("Should have been lowered by carry_to_arith().");
1016
1017    case nir_op_usub_borrow:
1018       unreachable("Should have been lowered by borrow_to_arith().");
1019
1020    case nir_op_umod:
1021    case nir_op_irem:
1022       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1023        * appears that our hardware just does the right thing for signed
1024        * remainder.
1025        */
1026       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1027       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1028       break;
1029
1030    case nir_op_imod: {
1031       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1032       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1033
1034       /* Math instructions don't support conditional mod */
1035       inst = bld.MOV(bld.null_reg_d(), result);
1036       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1037
1038       /* Now, we need to determine if signs of the sources are different.
1039        * When we XOR the sources, the top bit is 0 if they are the same and 1
1040        * if they are different.  We can then use a conditional modifier to
1041        * turn that into a predicate.  This leads us to an XOR.l instruction.
1042        *
1043        * Technically, according to the PRM, you're not allowed to use .l on a
1044        * XOR instruction.  However, empirical experiments and Curro's reading
1045        * of the simulator source both indicate that it's safe.
1046        */
1047       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1048       inst = bld.XOR(tmp, op[0], op[1]);
1049       inst->predicate = BRW_PREDICATE_NORMAL;
1050       inst->conditional_mod = BRW_CONDITIONAL_L;
1051
1052       /* If the result of the initial remainder operation is non-zero and the
1053        * two sources have different signs, add in a copy of op[1] to get the
1054        * final integer modulus value.
1055        */
1056       inst = bld.ADD(result, result, op[1]);
1057       inst->predicate = BRW_PREDICATE_NORMAL;
1058       break;
1059    }
1060
1061    case nir_op_flt:
1062    case nir_op_fge:
1063    case nir_op_feq:
1064    case nir_op_fne: {
1065       fs_reg dest = result;
1066
1067       const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1068       if (bit_size != 32)
1069          dest = bld.vgrf(op[0].type, 1);
1070
1071       brw_conditional_mod cond;
1072       switch (instr->op) {
1073       case nir_op_flt:
1074          cond = BRW_CONDITIONAL_L;
1075          break;
1076       case nir_op_fge:
1077          cond = BRW_CONDITIONAL_GE;
1078          break;
1079       case nir_op_feq:
1080          cond = BRW_CONDITIONAL_Z;
1081          break;
1082       case nir_op_fne:
1083          cond = BRW_CONDITIONAL_NZ;
1084          break;
1085       default:
1086          unreachable("bad opcode");
1087       }
1088
1089       bld.CMP(dest, op[0], op[1], cond);
1090
1091       if (bit_size > 32) {
1092          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1093       } else if(bit_size < 32) {
1094          /* When we convert the result to 32-bit we need to be careful and do
1095           * it as a signed conversion to get sign extension (for 32-bit true)
1096           */
1097          const brw_reg_type src_type =
1098             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1099
1100          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1101       }
1102       break;
1103    }
1104
1105    case nir_op_ilt:
1106    case nir_op_ult:
1107    case nir_op_ige:
1108    case nir_op_uge:
1109    case nir_op_ieq:
1110    case nir_op_ine: {
1111       fs_reg dest = result;
1112
1113       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1114       if (bit_size != 32)
1115          dest = bld.vgrf(op[0].type, 1);
1116
1117       brw_conditional_mod cond;
1118       switch (instr->op) {
1119       case nir_op_ilt:
1120       case nir_op_ult:
1121          cond = BRW_CONDITIONAL_L;
1122          break;
1123       case nir_op_ige:
1124       case nir_op_uge:
1125          cond = BRW_CONDITIONAL_GE;
1126          break;
1127       case nir_op_ieq:
1128          cond = BRW_CONDITIONAL_Z;
1129          break;
1130       case nir_op_ine:
1131          cond = BRW_CONDITIONAL_NZ;
1132          break;
1133       default:
1134          unreachable("bad opcode");
1135       }
1136       bld.CMP(dest, op[0], op[1], cond);
1137
1138       if (bit_size > 32) {
1139          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1140       } else if (bit_size < 32) {
1141          /* When we convert the result to 32-bit we need to be careful and do
1142           * it as a signed conversion to get sign extension (for 32-bit true)
1143           */
1144          const brw_reg_type src_type =
1145             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1146
1147          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1148       }
1149       break;
1150    }
1151
1152    case nir_op_inot:
1153       if (devinfo->gen >= 8) {
1154          op[0] = resolve_source_modifiers(op[0]);
1155       }
1156       bld.NOT(result, op[0]);
1157       break;
1158    case nir_op_ixor:
1159       if (devinfo->gen >= 8) {
1160          op[0] = resolve_source_modifiers(op[0]);
1161          op[1] = resolve_source_modifiers(op[1]);
1162       }
1163       bld.XOR(result, op[0], op[1]);
1164       break;
1165    case nir_op_ior:
1166       if (devinfo->gen >= 8) {
1167          op[0] = resolve_source_modifiers(op[0]);
1168          op[1] = resolve_source_modifiers(op[1]);
1169       }
1170       bld.OR(result, op[0], op[1]);
1171       break;
1172    case nir_op_iand:
1173       if (devinfo->gen >= 8) {
1174          op[0] = resolve_source_modifiers(op[0]);
1175          op[1] = resolve_source_modifiers(op[1]);
1176       }
1177       bld.AND(result, op[0], op[1]);
1178       break;
1179
1180    case nir_op_fdot2:
1181    case nir_op_fdot3:
1182    case nir_op_fdot4:
1183    case nir_op_ball_fequal2:
1184    case nir_op_ball_iequal2:
1185    case nir_op_ball_fequal3:
1186    case nir_op_ball_iequal3:
1187    case nir_op_ball_fequal4:
1188    case nir_op_ball_iequal4:
1189    case nir_op_bany_fnequal2:
1190    case nir_op_bany_inequal2:
1191    case nir_op_bany_fnequal3:
1192    case nir_op_bany_inequal3:
1193    case nir_op_bany_fnequal4:
1194    case nir_op_bany_inequal4:
1195       unreachable("Lowered by nir_lower_alu_reductions");
1196
1197    case nir_op_fnoise1_1:
1198    case nir_op_fnoise1_2:
1199    case nir_op_fnoise1_3:
1200    case nir_op_fnoise1_4:
1201    case nir_op_fnoise2_1:
1202    case nir_op_fnoise2_2:
1203    case nir_op_fnoise2_3:
1204    case nir_op_fnoise2_4:
1205    case nir_op_fnoise3_1:
1206    case nir_op_fnoise3_2:
1207    case nir_op_fnoise3_3:
1208    case nir_op_fnoise3_4:
1209    case nir_op_fnoise4_1:
1210    case nir_op_fnoise4_2:
1211    case nir_op_fnoise4_3:
1212    case nir_op_fnoise4_4:
1213       unreachable("not reached: should be handled by lower_noise");
1214
1215    case nir_op_ldexp:
1216       unreachable("not reached: should be handled by ldexp_to_arith()");
1217
1218    case nir_op_fsqrt:
1219       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1220       inst->saturate = instr->dest.saturate;
1221       break;
1222
1223    case nir_op_frsq:
1224       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1225       inst->saturate = instr->dest.saturate;
1226       break;
1227
1228    case nir_op_b2i:
1229    case nir_op_b2f:
1230       bld.MOV(result, negate(op[0]));
1231       break;
1232
1233    case nir_op_i2b:
1234    case nir_op_f2b: {
1235       uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1236       if (bit_size == 64) {
1237          /* two-argument instructions can't take 64-bit immediates */
1238          fs_reg zero;
1239          fs_reg tmp;
1240
1241          if (instr->op == nir_op_f2b) {
1242             zero = vgrf(glsl_type::double_type);
1243             tmp = vgrf(glsl_type::double_type);
1244             bld.MOV(zero, setup_imm_df(bld, 0.0));
1245          } else {
1246             zero = vgrf(glsl_type::int64_t_type);
1247             tmp = vgrf(glsl_type::int64_t_type);
1248             bld.MOV(zero, brw_imm_q(0));
1249          }
1250
1251          /* A SIMD16 execution needs to be split in two instructions, so use
1252           * a vgrf instead of the flag register as dst so instruction splitting
1253           * works
1254           */
1255          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1256          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1257       } else {
1258          fs_reg zero;
1259          if (bit_size == 32) {
1260             zero = instr->op == nir_op_f2b ? brw_imm_f(0.0f) : brw_imm_d(0);
1261          } else {
1262             assert(bit_size == 16);
1263             zero = instr->op == nir_op_f2b ?
1264                retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1265          }
1266          bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1267       }
1268       break;
1269    }
1270
1271    case nir_op_ftrunc:
1272       inst = bld.RNDZ(result, op[0]);
1273       inst->saturate = instr->dest.saturate;
1274       break;
1275
1276    case nir_op_fceil: {
1277       op[0].negate = !op[0].negate;
1278       fs_reg temp = vgrf(glsl_type::float_type);
1279       bld.RNDD(temp, op[0]);
1280       temp.negate = true;
1281       inst = bld.MOV(result, temp);
1282       inst->saturate = instr->dest.saturate;
1283       break;
1284    }
1285    case nir_op_ffloor:
1286       inst = bld.RNDD(result, op[0]);
1287       inst->saturate = instr->dest.saturate;
1288       break;
1289    case nir_op_ffract:
1290       inst = bld.FRC(result, op[0]);
1291       inst->saturate = instr->dest.saturate;
1292       break;
1293    case nir_op_fround_even:
1294       inst = bld.RNDE(result, op[0]);
1295       inst->saturate = instr->dest.saturate;
1296       break;
1297
1298    case nir_op_fquantize2f16: {
1299       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1300       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1301       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1302
1303       /* The destination stride must be at least as big as the source stride. */
1304       tmp16.type = BRW_REGISTER_TYPE_W;
1305       tmp16.stride = 2;
1306
1307       /* Check for denormal */
1308       fs_reg abs_src0 = op[0];
1309       abs_src0.abs = true;
1310       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1311               BRW_CONDITIONAL_L);
1312       /* Get the appropriately signed zero */
1313       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1314               retype(op[0], BRW_REGISTER_TYPE_UD),
1315               brw_imm_ud(0x80000000));
1316       /* Do the actual F32 -> F16 -> F32 conversion */
1317       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1318       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1319       /* Select that or zero based on normal status */
1320       inst = bld.SEL(result, zero, tmp32);
1321       inst->predicate = BRW_PREDICATE_NORMAL;
1322       inst->saturate = instr->dest.saturate;
1323       break;
1324    }
1325
1326    case nir_op_imin:
1327    case nir_op_umin:
1328    case nir_op_fmin:
1329       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1330       inst->saturate = instr->dest.saturate;
1331       break;
1332
1333    case nir_op_imax:
1334    case nir_op_umax:
1335    case nir_op_fmax:
1336       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1337       inst->saturate = instr->dest.saturate;
1338       break;
1339
1340    case nir_op_pack_snorm_2x16:
1341    case nir_op_pack_snorm_4x8:
1342    case nir_op_pack_unorm_2x16:
1343    case nir_op_pack_unorm_4x8:
1344    case nir_op_unpack_snorm_2x16:
1345    case nir_op_unpack_snorm_4x8:
1346    case nir_op_unpack_unorm_2x16:
1347    case nir_op_unpack_unorm_4x8:
1348    case nir_op_unpack_half_2x16:
1349    case nir_op_pack_half_2x16:
1350       unreachable("not reached: should be handled by lower_packing_builtins");
1351
1352    case nir_op_unpack_half_2x16_split_x:
1353       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
1354       inst->saturate = instr->dest.saturate;
1355       break;
1356    case nir_op_unpack_half_2x16_split_y:
1357       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
1358       inst->saturate = instr->dest.saturate;
1359       break;
1360
1361    case nir_op_pack_64_2x32_split:
1362    case nir_op_pack_32_2x16_split:
1363       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1364       break;
1365
1366    case nir_op_unpack_64_2x32_split_x:
1367    case nir_op_unpack_64_2x32_split_y: {
1368       if (instr->op == nir_op_unpack_64_2x32_split_x)
1369          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1370       else
1371          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1372       break;
1373    }
1374
1375    case nir_op_unpack_32_2x16_split_x:
1376    case nir_op_unpack_32_2x16_split_y: {
1377       if (instr->op == nir_op_unpack_32_2x16_split_x)
1378          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1379       else
1380          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1381       break;
1382    }
1383
1384    case nir_op_fpow:
1385       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1386       inst->saturate = instr->dest.saturate;
1387       break;
1388
1389    case nir_op_bitfield_reverse:
1390       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1391       bld.BFREV(result, op[0]);
1392       break;
1393
1394    case nir_op_bit_count:
1395       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1396       bld.CBIT(result, op[0]);
1397       break;
1398
1399    case nir_op_ufind_msb: {
1400       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1401       emit_find_msb_using_lzd(bld, result, op[0], false);
1402       break;
1403    }
1404
1405    case nir_op_ifind_msb: {
1406       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1407
1408       if (devinfo->gen < 7) {
1409          emit_find_msb_using_lzd(bld, result, op[0], true);
1410       } else {
1411          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1412
1413          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1414           * count from the LSB side. If FBH didn't return an error
1415           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1416           * count into an LSB count.
1417           */
1418          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1419
1420          inst = bld.ADD(result, result, brw_imm_d(31));
1421          inst->predicate = BRW_PREDICATE_NORMAL;
1422          inst->src[0].negate = true;
1423       }
1424       break;
1425    }
1426
1427    case nir_op_find_lsb:
1428       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1429
1430       if (devinfo->gen < 7) {
1431          fs_reg temp = vgrf(glsl_type::int_type);
1432
1433          /* (x & -x) generates a value that consists of only the LSB of x.
1434           * For all powers of 2, findMSB(y) == findLSB(y).
1435           */
1436          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1437          fs_reg negated_src = src;
1438
1439          /* One must be negated, and the other must be non-negated.  It
1440           * doesn't matter which is which.
1441           */
1442          negated_src.negate = true;
1443          src.negate = false;
1444
1445          bld.AND(temp, src, negated_src);
1446          emit_find_msb_using_lzd(bld, result, temp, false);
1447       } else {
1448          bld.FBL(result, op[0]);
1449       }
1450       break;
1451
1452    case nir_op_ubitfield_extract:
1453    case nir_op_ibitfield_extract:
1454       unreachable("should have been lowered");
1455    case nir_op_ubfe:
1456    case nir_op_ibfe:
1457       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1458       bld.BFE(result, op[2], op[1], op[0]);
1459       break;
1460    case nir_op_bfm:
1461       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1462       bld.BFI1(result, op[0], op[1]);
1463       break;
1464    case nir_op_bfi:
1465       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1466       bld.BFI2(result, op[0], op[1], op[2]);
1467       break;
1468
1469    case nir_op_bitfield_insert:
1470       unreachable("not reached: should have been lowered");
1471
1472    case nir_op_ishl:
1473    case nir_op_ishr:
1474    case nir_op_ushr: {
1475       fs_reg shift_count = op[1];
1476
1477       if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) {
1478          if (op[1].file == VGRF &&
1479              (result.type == BRW_REGISTER_TYPE_Q ||
1480               result.type == BRW_REGISTER_TYPE_UQ)) {
1481             shift_count = fs_reg(VGRF, alloc.allocate(dispatch_width / 4),
1482                                  BRW_REGISTER_TYPE_UD);
1483             shift_count.stride = 2;
1484             bld.MOV(shift_count, op[1]);
1485          }
1486       }
1487
1488       switch (instr->op) {
1489       case nir_op_ishl:
1490          bld.SHL(result, op[0], shift_count);
1491          break;
1492       case nir_op_ishr:
1493          bld.ASR(result, op[0], shift_count);
1494          break;
1495       case nir_op_ushr:
1496          bld.SHR(result, op[0], shift_count);
1497          break;
1498       default:
1499          unreachable("not reached");
1500       }
1501       break;
1502    }
1503
1504    case nir_op_pack_half_2x16_split:
1505       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1506       break;
1507
1508    case nir_op_ffma:
1509       inst = bld.MAD(result, op[2], op[1], op[0]);
1510       inst->saturate = instr->dest.saturate;
1511       break;
1512
1513    case nir_op_flrp:
1514       inst = bld.LRP(result, op[0], op[1], op[2]);
1515       inst->saturate = instr->dest.saturate;
1516       break;
1517
1518    case nir_op_bcsel:
1519       if (optimize_frontfacing_ternary(instr, result))
1520          return;
1521
1522       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1523       inst = bld.SEL(result, op[1], op[2]);
1524       inst->predicate = BRW_PREDICATE_NORMAL;
1525       break;
1526
1527    case nir_op_extract_u8:
1528    case nir_op_extract_i8: {
1529       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
1530       assert(byte != NULL);
1531
1532       /* The PRMs say:
1533        *
1534        *    BDW+
1535        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1536        *    Use two instructions and a word or DWord intermediate integer type.
1537        */
1538       if (nir_dest_bit_size(instr->dest.dest) == 64) {
1539          const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8);
1540
1541          if (instr->op == nir_op_extract_i8) {
1542             /* If we need to sign extend, extract to a word first */
1543             fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1544             bld.MOV(w_temp, subscript(op[0], type, byte->u32[0]));
1545             bld.MOV(result, w_temp);
1546          } else {
1547             /* Otherwise use an AND with 0xff and a word type */
1548             bld.AND(result, subscript(op[0], type, byte->u32[0] / 2), brw_imm_uw(0xff));
1549          }
1550       } else {
1551          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1552          bld.MOV(result, subscript(op[0], type, byte->u32[0]));
1553       }
1554       break;
1555    }
1556
1557    case nir_op_extract_u16:
1558    case nir_op_extract_i16: {
1559       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1560       nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
1561       assert(word != NULL);
1562       bld.MOV(result, subscript(op[0], type, word->u32[0]));
1563       break;
1564    }
1565
1566    default:
1567       unreachable("unhandled instruction");
1568    }
1569
1570    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1571     * to sign extend the low bit to 0/~0
1572     */
1573    if (devinfo->gen <= 5 &&
1574        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1575       fs_reg masked = vgrf(glsl_type::int_type);
1576       bld.AND(masked, result, brw_imm_d(1));
1577       masked.negate = true;
1578       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1579    }
1580 }
1581
1582 void
1583 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1584                                 nir_load_const_instr *instr)
1585 {
1586    const brw_reg_type reg_type =
1587       brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1588    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1589
1590    switch (instr->def.bit_size) {
1591    case 8:
1592       for (unsigned i = 0; i < instr->def.num_components; i++)
1593          bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value.i8[i]));
1594       break;
1595
1596    case 16:
1597       for (unsigned i = 0; i < instr->def.num_components; i++)
1598          bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
1599       break;
1600
1601    case 32:
1602       for (unsigned i = 0; i < instr->def.num_components; i++)
1603          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1604       break;
1605
1606    case 64:
1607       assert(devinfo->gen >= 7);
1608       if (devinfo->gen == 7) {
1609          /* We don't get 64-bit integer types until gen8 */
1610          for (unsigned i = 0; i < instr->def.num_components; i++) {
1611             bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1612                     setup_imm_df(bld, instr->value.f64[i]));
1613          }
1614       } else {
1615          for (unsigned i = 0; i < instr->def.num_components; i++)
1616             bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i]));
1617       }
1618       break;
1619
1620    default:
1621       unreachable("Invalid bit size");
1622    }
1623
1624    nir_ssa_values[instr->def.index] = reg;
1625 }
1626
1627 fs_reg
1628 fs_visitor::get_nir_src(const nir_src &src)
1629 {
1630    fs_reg reg;
1631    if (src.is_ssa) {
1632       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1633          const brw_reg_type reg_type =
1634             brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1635          reg = bld.vgrf(reg_type, src.ssa->num_components);
1636       } else {
1637          reg = nir_ssa_values[src.ssa->index];
1638       }
1639    } else {
1640       /* We don't handle indirects on locals */
1641       assert(src.reg.indirect == NULL);
1642       reg = offset(nir_locals[src.reg.reg->index], bld,
1643                    src.reg.base_offset * src.reg.reg->num_components);
1644    }
1645
1646    if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1647       /* The only 64-bit type available on gen7 is DF, so use that. */
1648       reg.type = BRW_REGISTER_TYPE_DF;
1649    } else {
1650       /* To avoid floating-point denorm flushing problems, set the type by
1651        * default to an integer type - instructions that need floating point
1652        * semantics will set this to F if they need to
1653        */
1654       reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1655                                             BRW_REGISTER_TYPE_D);
1656    }
1657
1658    return reg;
1659 }
1660
1661 /**
1662  * Like get_nir_src(), but a constant source is returned as a D immediate
1663  * holding its first component.
1664  *
1665  * Only 32-bit sources are accepted.  64-bit is deliberately unsupported: it
1666  * could not work on gen7, and 64-bit immediates are restricted enough that
1667  * the result could not be treated like one from get_nir_src().
1668  */
1669 fs_reg
1670 fs_visitor::get_nir_src_imm(const nir_src &src)
1671 {
1672    assert(nir_src_bit_size(src) == 32);
1673    if (nir_const_value *imm = nir_src_as_const_value(src))
1674       return fs_reg(brw_imm_d(imm->i32[0]));
1675    return get_nir_src(src);
1676 }
1677
1678 fs_reg
1679 fs_visitor::get_nir_dest(const nir_dest &dest)
1680 {
1681    if (dest.is_ssa) {
1682       const brw_reg_type reg_type =
1683          brw_reg_type_from_bit_size(dest.ssa.bit_size,
1684                                     dest.ssa.bit_size == 8 ?
1685                                     BRW_REGISTER_TYPE_D :
1686                                     BRW_REGISTER_TYPE_F);
1687       nir_ssa_values[dest.ssa.index] =
1688          bld.vgrf(reg_type, dest.ssa.num_components);
1689       return nir_ssa_values[dest.ssa.index];
1690    } else {
1691       /* We don't handle indirects on locals */
1692       assert(dest.reg.indirect == NULL);
1693       return offset(nir_locals[dest.reg.reg->index], bld,
1694                     dest.reg.base_offset * dest.reg.reg->num_components);
1695    }
1696 }
1697
1698 void
1699 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1700                          unsigned wr_mask)
1701 {
1702    for (unsigned i = 0; i < 4; i++) {
1703       if (!((wr_mask >> i) & 1))
1704          continue;
1705
1706       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1707       new_inst->dst = offset(new_inst->dst, bld, i);
1708       for (unsigned j = 0; j < new_inst->sources; j++)
1709          if (new_inst->src[j].file == VGRF)
1710             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1711
1712       bld.emit(new_inst);
1713    }
1714 }
1715
1716 static fs_inst *
1717 emit_pixel_interpolater_send(const fs_builder &bld,
1718                              enum opcode opcode,
1719                              const fs_reg &dst,
1720                              const fs_reg &src,
1721                              const fs_reg &desc,
1722                              glsl_interp_mode interpolation)
1723 {
1724    struct brw_wm_prog_data *wm_prog_data =
1725       brw_wm_prog_data(bld.shader->stage_prog_data);
1726
1727    fs_inst *inst = bld.emit(opcode, dst, src, desc);
1728    /* 2 floats per slot returned */
1729    inst->size_written = 2 * dst.component_size(inst->exec_size);
1730    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1731
1732    wm_prog_data->pulls_bary = true;
1733
1734    return inst;
1735 }
1736
1737 /**
1738  * Computes 1 << x, given a D/UD register containing some value x.
1739  */
1740 static fs_reg
1741 intexp2(const fs_builder &bld, const fs_reg &x)
1742 {
1743    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1744
1745    fs_reg result = bld.vgrf(x.type, 1);
1746    fs_reg one = bld.vgrf(x.type, 1);
1747
1748    bld.MOV(one, retype(brw_imm_d(1), one.type));
1749    bld.SHL(result, one, x);
1750    return result;
1751 }
1752
1753 void
1754 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1755 {
1756    assert(stage == MESA_SHADER_GEOMETRY);
1757
1758    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1759
1760    if (gs_compile->control_data_header_size_bits == 0)
1761       return;
1762
1763    /* We can only do EndPrimitive() functionality when the control data
1764     * consists of cut bits.  Fortunately, the only time it isn't is when the
1765     * output type is points, in which case EndPrimitive() is a no-op.
1766     */
1767    if (gs_prog_data->control_data_format !=
1768        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1769       return;
1770    }
1771
1772    /* Cut bits use one bit per vertex. */
1773    assert(gs_compile->control_data_bits_per_vertex == 1);
1774
1775    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1776    vertex_count.type = BRW_REGISTER_TYPE_UD;
1777
1778    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1779     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1780     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1781     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1782     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1783     *
1784     * Note that if EndPrimitive() is called before emitting any vertices, this
1785     * will cause us to set bit 31 of the control_data_bits register to 1.
1786     * That's fine because:
1787     *
1788     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1789     *   output, so the hardware will ignore cut bit 31.
1790     *
1791     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1792     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1793     *   is automatically ended when the GS terminates).
1794     *
1795     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1796     *   control_data_bits register to 0 when the first vertex is emitted.
1797     */
1798
1799    const fs_builder abld = bld.annotate("end primitive");
1800
1801    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1802    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1803    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1804    fs_reg mask = intexp2(abld, prev_count);
1805    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1806     * attention to the lower 5 bits of its second source argument, so on this
1807     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1808     * ((vertex_count - 1) % 32).
1809     */
1810    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1811 }
1812
1813 void
1814 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1815 {
1816    assert(stage == MESA_SHADER_GEOMETRY);
1817    assert(gs_compile->control_data_bits_per_vertex != 0);
1818
1819    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1820
1821    const fs_builder abld = bld.annotate("emit control data bits");
1822    const fs_builder fwa_bld = bld.exec_all();
1823
1824    /* We use a single UD register to accumulate control data bits (32 bits
1825     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1826     * at a time.
1827     *
1828     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1829     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1830     * use the Channel Mask phase to enable/disable which DWord within that
1831     * group to write.  (Remember, different SIMD8 channels may have emitted
1832     * different numbers of vertices, so we may need per-slot offsets.)
1833     *
1834     * Channel masking presents an annoying problem: we may have to replicate
1835     * the data up to 4 times:
1836     *
1837     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1838     *
1839     * To avoid penalizing shaders that emit a small number of vertices, we
1840     * can avoid these sometimes: if the size of the control data header is
1841     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1842     * land in the same 128-bit group, so we can skip per-slot offsets.
1843     *
1844     * Similarly, if the control data header is <= 32 bits, there is only one
1845     * DWord, so we can skip channel masks.
1846     */
1847    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1848
1849    fs_reg channel_mask, per_slot_offset;
1850
1851    if (gs_compile->control_data_header_size_bits > 32) {
1852       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1853       channel_mask = vgrf(glsl_type::uint_type);
1854    }
1855
1856    if (gs_compile->control_data_header_size_bits > 128) {
1857       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1858       per_slot_offset = vgrf(glsl_type::uint_type);
1859    }
1860
1861    /* Figure out which DWord we're trying to write to using the formula:
1862     *
1863     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1864     *
1865     * Since bits_per_vertex is a power of two, and is known at compile
1866     * time, this can be optimized to:
1867     *
1868     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1869     */
1870    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1871       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1872       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1873       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1874       unsigned log2_bits_per_vertex =
1875          util_last_bit(gs_compile->control_data_bits_per_vertex);
1876       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1877
1878       if (per_slot_offset.file != BAD_FILE) {
1879          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1880           * the appropriate OWord within the control data header.
1881           */
1882          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1883       }
1884
1885       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1886        * write to the appropriate DWORD within the OWORD.
1887        */
1888       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1889       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1890       channel_mask = intexp2(fwa_bld, channel);
1891       /* Then the channel masks need to be in bits 23:16. */
1892       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1893    }
1894
1895    /* Store the control data bits in the message payload and send it. */
1896    int mlen = 2;
1897    if (channel_mask.file != BAD_FILE)
1898       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1899    if (per_slot_offset.file != BAD_FILE)
1900       mlen++;
1901
1902    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1903    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1904    int i = 0;
1905    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1906    if (per_slot_offset.file != BAD_FILE)
1907       sources[i++] = per_slot_offset;
1908    if (channel_mask.file != BAD_FILE)
1909       sources[i++] = channel_mask;
1910    while (i < mlen) {
1911       sources[i++] = this->control_data_bits;
1912    }
1913
1914    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1915    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1916    inst->mlen = mlen;
1917    /* We need to increment Global Offset by 256-bits to make room for
1918     * Broadwell's extra "Vertex Count" payload at the beginning of the
1919     * URB entry.  Since this is an OWord message, Global Offset is counted
1920     * in 128-bit units, so we must set it to 2.
1921     */
1922    if (gs_prog_data->static_vertex_count == -1)
1923       inst->offset = 2;
1924 }
1925
1926 void
1927 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1928                                             unsigned stream_id)
1929 {
1930    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1931
1932    /* Note: we are calling this *before* increasing vertex_count, so
1933     * this->vertex_count == vertex_count - 1 in the formula above.
1934     */
1935
1936    /* Stream mode uses 2 bits per vertex */
1937    assert(gs_compile->control_data_bits_per_vertex == 2);
1938
1939    /* Must be a valid stream */
1940    assert(stream_id < MAX_VERTEX_STREAMS);
1941
1942    /* Control data bits are initialized to 0 so we don't have to set any
1943     * bits when sending vertices to stream 0.
1944     */
1945    if (stream_id == 0)
1946       return;
1947
1948    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1949
1950    /* reg::sid = stream_id */
1951    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1952    abld.MOV(sid, brw_imm_ud(stream_id));
1953
1954    /* reg:shift_count = 2 * (vertex_count - 1) */
1955    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1956    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1957
1958    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1959     * attention to the lower 5 bits of its second source argument, so on this
1960     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1961     * stream_id << ((2 * (vertex_count - 1)) % 32).
1962     */
1963    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1964    abld.SHL(mask, sid, shift_count);
1965    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1966 }
1967
1968 void
1969 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1970                            unsigned stream_id)
1971 {
1972    assert(stage == MESA_SHADER_GEOMETRY);
1973
1974    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1975
1976    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1977    vertex_count.type = BRW_REGISTER_TYPE_UD;
1978
1979    /* Haswell and later hardware ignores the "Render Stream Select" bits
1980     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1981     * and instead sends all primitives down the pipeline for rasterization.
1982     * If the SOL stage is enabled, "Render Stream Select" is honored and
1983     * primitives bound to non-zero streams are discarded after stream output.
1984     *
1985     * Since the only purpose of primives sent to non-zero streams is to
1986     * be recorded by transform feedback, we can simply discard all geometry
1987     * bound to these streams when transform feedback is disabled.
1988     */
1989    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
1990       return;
1991
1992    /* If we're outputting 32 control data bits or less, then we can wait
1993     * until the shader is over to output them all.  Otherwise we need to
1994     * output them as we go.  Now is the time to do it, since we're about to
1995     * output the vertex_count'th vertex, so it's guaranteed that the
1996     * control data bits associated with the (vertex_count - 1)th vertex are
1997     * correct.
1998     */
1999    if (gs_compile->control_data_header_size_bits > 32) {
2000       const fs_builder abld =
2001          bld.annotate("emit vertex: emit control data bits");
2002
2003       /* Only emit control data bits if we've finished accumulating a batch
2004        * of 32 bits.  This is the case when:
2005        *
2006        *     (vertex_count * bits_per_vertex) % 32 == 0
2007        *
2008        * (in other words, when the last 5 bits of vertex_count *
2009        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2010        * integer n (which is always the case, since bits_per_vertex is
2011        * always 1 or 2), this is equivalent to requiring that the last 5-n
2012        * bits of vertex_count are 0:
2013        *
2014        *     vertex_count & (2^(5-n) - 1) == 0
2015        *
2016        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2017        * equivalent to:
2018        *
2019        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2020        *
2021        * TODO: If vertex_count is an immediate, we could do some of this math
2022        *       at compile time...
2023        */
2024       fs_inst *inst =
2025          abld.AND(bld.null_reg_d(), vertex_count,
2026                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2027       inst->conditional_mod = BRW_CONDITIONAL_Z;
2028
2029       abld.IF(BRW_PREDICATE_NORMAL);
2030       /* If vertex_count is 0, then no control data bits have been
2031        * accumulated yet, so we can skip emitting them.
2032        */
2033       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2034                BRW_CONDITIONAL_NEQ);
2035       abld.IF(BRW_PREDICATE_NORMAL);
2036       emit_gs_control_data_bits(vertex_count);
2037       abld.emit(BRW_OPCODE_ENDIF);
2038
2039       /* Reset control_data_bits to 0 so we can start accumulating a new
2040        * batch.
2041        *
2042        * Note: in the case where vertex_count == 0, this neutralizes the
2043        * effect of any call to EndPrimitive() that the shader may have
2044        * made before outputting its first vertex.
2045        */
2046       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2047       inst->force_writemask_all = true;
2048       abld.emit(BRW_OPCODE_ENDIF);
2049    }
2050
2051    emit_urb_writes(vertex_count);
2052
2053    /* In stream mode we have to set control data bits for all vertices
2054     * unless we have disabled control data bits completely (which we do
2055     * do for GL_POINTS outputs that don't use streams).
2056     */
2057    if (gs_compile->control_data_header_size_bits > 0 &&
2058        gs_prog_data->control_data_format ==
2059           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2060       set_gs_stream_control_data_bits(vertex_count, stream_id);
2061    }
2062 }
2063
2064 void
2065 fs_visitor::emit_gs_input_load(const fs_reg &dst,
2066                                const nir_src &vertex_src,
2067                                unsigned base_offset,
2068                                const nir_src &offset_src,
2069                                unsigned num_components,
2070                                unsigned first_component)
2071 {
2072    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2073
2074    nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
2075    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
2076    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2077
2078    /* TODO: figure out push input layout for invocations == 1 */
2079    /* TODO: make this work with 64-bit inputs */
2080    if (gs_prog_data->invocations == 1 &&
2081        type_sz(dst.type) <= 4 &&
2082        offset_const != NULL && vertex_const != NULL &&
2083        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
2084       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
2085                        vertex_const->u32[0] * push_reg_count;
2086       for (unsigned i = 0; i < num_components; i++) {
2087          bld.MOV(offset(dst, bld, i),
2088                  fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2089       }
2090       return;
2091    }
2092
2093    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2094    assert(gs_prog_data->base.include_vue_handles);
2095
2096    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2097    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2098
2099    if (gs_prog_data->invocations == 1) {
2100       if (vertex_const) {
2101          /* The vertex index is constant; just select the proper URB handle. */
2102          icp_handle =
2103             retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
2104                    BRW_REGISTER_TYPE_UD);
2105       } else {
2106          /* The vertex index is non-constant.  We need to use indirect
2107           * addressing to fetch the proper URB handle.
2108           *
2109           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2110           * indicating that channel <n> should read the handle from
2111           * DWord <n>.  We convert that to bytes by multiplying by 4.
2112           *
2113           * Next, we convert the vertex index to bytes by multiplying
2114           * by 32 (shifting by 5), and add the two together.  This is
2115           * the final indirect byte offset.
2116           */
2117          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2118          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2119          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2120          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2121
2122          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2123          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2124          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2125          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2126          /* Convert vertex_index to bytes (multiply by 32) */
2127          bld.SHL(vertex_offset_bytes,
2128                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2129                  brw_imm_ud(5u));
2130          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2131
2132          /* Use first_icp_handle as the base offset.  There is one register
2133           * of URB handles per vertex, so inform the register allocator that
2134           * we might read up to nir->info.gs.vertices_in registers.
2135           */
2136          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2137                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2138                   fs_reg(icp_offset_bytes),
2139                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2140       }
2141    } else {
2142       assert(gs_prog_data->invocations > 1);
2143
2144       if (vertex_const) {
2145          assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
2146          bld.MOV(icp_handle,
2147                  retype(brw_vec1_grf(first_icp_handle +
2148                                      vertex_const->i32[0] / 8,
2149                                      vertex_const->i32[0] % 8),
2150                         BRW_REGISTER_TYPE_UD));
2151       } else {
2152          /* The vertex index is non-constant.  We need to use indirect
2153           * addressing to fetch the proper URB handle.
2154           *
2155           */
2156          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2157
2158          /* Convert vertex_index to bytes (multiply by 4) */
2159          bld.SHL(icp_offset_bytes,
2160                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2161                  brw_imm_ud(2u));
2162
2163          /* Use first_icp_handle as the base offset.  There is one DWord
2164           * of URB handles per vertex, so inform the register allocator that
2165           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2166           */
2167          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2168                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2169                   fs_reg(icp_offset_bytes),
2170                   brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2171                              REG_SIZE));
2172       }
2173    }
2174
2175    fs_inst *inst;
2176
2177    fs_reg tmp_dst = dst;
2178    fs_reg indirect_offset = get_nir_src(offset_src);
2179    unsigned num_iterations = 1;
2180    unsigned orig_num_components = num_components;
2181
2182    if (type_sz(dst.type) == 8) {
2183       if (num_components > 2) {
2184          num_iterations = 2;
2185          num_components = 2;
2186       }
2187       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2188       tmp_dst = tmp;
2189       first_component = first_component / 2;
2190    }
2191
2192    for (unsigned iter = 0; iter < num_iterations; iter++) {
2193       if (offset_const) {
2194          /* Constant indexing - use global offset. */
2195          if (first_component != 0) {
2196             unsigned read_components = num_components + first_component;
2197             fs_reg tmp = bld.vgrf(dst.type, read_components);
2198             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2199             inst->size_written = read_components *
2200                                  tmp.component_size(inst->exec_size);
2201             for (unsigned i = 0; i < num_components; i++) {
2202                bld.MOV(offset(tmp_dst, bld, i),
2203                        offset(tmp, bld, i + first_component));
2204             }
2205          } else {
2206             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2207                             icp_handle);
2208             inst->size_written = num_components *
2209                                  tmp_dst.component_size(inst->exec_size);
2210          }
2211          inst->offset = base_offset + offset_const->u32[0];
2212          inst->mlen = 1;
2213       } else {
2214          /* Indirect indexing - use per-slot offsets as well. */
2215          const fs_reg srcs[] = { icp_handle, indirect_offset };
2216          unsigned read_components = num_components + first_component;
2217          fs_reg tmp = bld.vgrf(dst.type, read_components);
2218          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2219          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2220          if (first_component != 0) {
2221             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2222                             payload);
2223             inst->size_written = read_components *
2224                                  tmp.component_size(inst->exec_size);
2225             for (unsigned i = 0; i < num_components; i++) {
2226                bld.MOV(offset(tmp_dst, bld, i),
2227                        offset(tmp, bld, i + first_component));
2228             }
2229          } else {
2230             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2231                          payload);
2232             inst->size_written = num_components *
2233                                  tmp_dst.component_size(inst->exec_size);
2234          }
2235          inst->offset = base_offset;
2236          inst->mlen = 2;
2237       }
2238
2239       if (type_sz(dst.type) == 8) {
2240          shuffle_from_32bit_read(bld,
2241                                  offset(dst, bld, iter * 2),
2242                                  retype(tmp_dst, BRW_REGISTER_TYPE_D),
2243                                  0,
2244                                  num_components);
2245       }
2246
2247       if (num_iterations > 1) {
2248          num_components = orig_num_components - 2;
2249          if(offset_const) {
2250             base_offset++;
2251          } else {
2252             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2253             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2254             indirect_offset = new_indirect;
2255          }
2256       }
2257    }
2258 }
2259
2260 fs_reg
2261 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2262 {
2263    nir_src *offset_src = nir_get_io_offset_src(instr);
2264    nir_const_value *const_value = nir_src_as_const_value(*offset_src);
2265
2266    if (const_value) {
2267       /* The only constant offset we should find is 0.  brw_nir.c's
2268        * add_const_offset_to_base() will fold other constant offsets
2269        * into instr->const_index[0].
2270        */
2271       assert(const_value->u32[0] == 0);
2272       return fs_reg();
2273    }
2274
2275    return get_nir_src(*offset_src);
2276 }
2277
2278 static void
2279 do_untyped_vector_read(const fs_builder &bld,
2280                        const fs_reg dest,
2281                        const fs_reg surf_index,
2282                        const fs_reg offset_reg,
2283                        unsigned num_components)
2284 {
2285    if (type_sz(dest.type) <= 2) {
2286       assert(dest.stride == 1);
2287       boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
2288
2289       if (is_const_offset) {
2290          uint32_t start = offset_reg.ud & ~3;
2291          uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);
2292          end = ALIGN(end, 4);
2293          assert (end - start <= 16);
2294
2295          /* At this point we have 16-bit component/s that have constant
2296           * offset aligned to 4-bytes that can be read with untyped_reads.
2297           * untyped_read message requires 32-bit aligned offsets.
2298           */
2299          unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);
2300          unsigned num_components_32bit = (end - start) / 4;
2301
2302          fs_reg read_result =
2303             emit_untyped_read(bld, surf_index, brw_imm_ud(start),
2304                               1 /* dims */,
2305                               num_components_32bit,
2306                               BRW_PREDICATE_NONE);
2307          shuffle_from_32bit_read(bld, dest, read_result, first_component,
2308                                  num_components);
2309       } else {
2310          fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2311          for (unsigned i = 0; i < num_components; i++) {
2312             if (i == 0) {
2313                bld.MOV(read_offset, offset_reg);
2314             } else {
2315                bld.ADD(read_offset, offset_reg,
2316                        brw_imm_ud(i * type_sz(dest.type)));
2317             }
2318             /* Non constant offsets are not guaranteed to be aligned 32-bits
2319              * so they are read using one byte_scattered_read message
2320              * for each component.
2321              */
2322             fs_reg read_result =
2323                emit_byte_scattered_read(bld, surf_index, read_offset,
2324                                         1 /* dims */, 1,
2325                                         type_sz(dest.type) * 8 /* bit_size */,
2326                                         BRW_PREDICATE_NONE);
2327             bld.MOV(offset(dest, bld, i),
2328                     subscript (read_result, dest.type, 0));
2329          }
2330       }
2331    } else if (type_sz(dest.type) == 4) {
2332       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
2333                                              1 /* dims */,
2334                                              num_components,
2335                                              BRW_PREDICATE_NONE);
2336       read_result.type = dest.type;
2337       for (unsigned i = 0; i < num_components; i++)
2338          bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
2339    } else if (type_sz(dest.type) == 8) {
2340       /* Reading a dvec, so we need to:
2341        *
2342        * 1. Multiply num_components by 2, to account for the fact that we
2343        *    need to read 64-bit components.
2344        * 2. Shuffle the result of the load to form valid 64-bit elements
2345        * 3. Emit a second load (for components z/w) if needed.
2346        */
2347       fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2348       bld.MOV(read_offset, offset_reg);
2349
2350       int iters = num_components <= 2 ? 1 : 2;
2351
2352       /* Load the dvec, the first iteration loads components x/y, the second
2353        * iteration, if needed, loads components z/w
2354        */
2355       for (int it = 0; it < iters; it++) {
2356          /* Compute number of components to read in this iteration */
2357          int iter_components = MIN2(2, num_components);
2358          num_components -= iter_components;
2359
2360          /* Read. Since this message reads 32-bit components, we need to
2361           * read twice as many components.
2362           */
2363          fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
2364                                                 1 /* dims */,
2365                                                 iter_components * 2,
2366                                                 BRW_PREDICATE_NONE);
2367
2368          /* Shuffle the 32-bit load result into valid 64-bit data */
2369          shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
2370                                  read_result, 0, iter_components);
2371
2372          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
2373       }
2374    } else {
2375       unreachable("Unsupported type");
2376    }
2377 }
2378
2379 void
2380 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2381                                   nir_intrinsic_instr *instr)
2382 {
2383    assert(stage == MESA_SHADER_VERTEX);
2384
2385    fs_reg dest;
2386    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2387       dest = get_nir_dest(instr->dest);
2388
2389    switch (instr->intrinsic) {
2390    case nir_intrinsic_load_vertex_id:
2391    case nir_intrinsic_load_base_vertex:
2392       unreachable("should be lowered by nir_lower_system_values()");
2393
2394    case nir_intrinsic_load_input: {
2395       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2396       unsigned first_component = nir_intrinsic_component(instr);
2397       unsigned num_components = instr->num_components;
2398
2399       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
2400       assert(const_offset && "Indirect input loads not allowed");
2401       src = offset(src, bld, const_offset->u32[0]);
2402
2403       if (type_sz(dest.type) == 8)
2404          first_component /= 2;
2405
2406       /* For 16-bit support maybe a temporary will be needed to copy from
2407        * the ATTR file.
2408        */
2409       shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
2410                               first_component, num_components);
2411       break;
2412    }
2413
2414    case nir_intrinsic_load_vertex_id_zero_base:
2415    case nir_intrinsic_load_instance_id:
2416    case nir_intrinsic_load_base_instance:
2417    case nir_intrinsic_load_draw_id:
2418    case nir_intrinsic_load_first_vertex:
2419    case nir_intrinsic_load_is_indexed_draw:
2420       unreachable("lowered by brw_nir_lower_vs_inputs");
2421
2422    default:
2423       nir_emit_intrinsic(bld, instr);
2424       break;
2425    }
2426 }
2427
2428 void
2429 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2430                                    nir_intrinsic_instr *instr)
2431 {
2432    assert(stage == MESA_SHADER_TESS_CTRL);
2433    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2434    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2435
2436    fs_reg dst;
2437    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2438       dst = get_nir_dest(instr->dest);
2439
2440    switch (instr->intrinsic) {
2441    case nir_intrinsic_load_primitive_id:
2442       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2443       break;
2444    case nir_intrinsic_load_invocation_id:
2445       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2446       break;
2447    case nir_intrinsic_load_patch_vertices_in:
2448       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2449               brw_imm_d(tcs_key->input_vertices));
2450       break;
2451
2452    case nir_intrinsic_barrier: {
2453       if (tcs_prog_data->instances == 1)
2454          break;
2455
2456       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2457       fs_reg m0_2 = component(m0, 2);
2458
2459       const fs_builder chanbld = bld.exec_all().group(1, 0);
2460
2461       /* Zero the message header */
2462       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2463
2464       /* Copy "Barrier ID" from r0.2, bits 16:13 */
2465       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2466                   brw_imm_ud(INTEL_MASK(16, 13)));
2467
2468       /* Shift it up to bits 27:24. */
2469       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2470
2471       /* Set the Barrier Count and the enable bit */
2472       chanbld.OR(m0_2, m0_2,
2473                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2474
2475       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2476       break;
2477    }
2478
2479    case nir_intrinsic_load_input:
2480       unreachable("nir_lower_io should never give us these.");
2481       break;
2482
2483    case nir_intrinsic_load_per_vertex_input: {
2484       fs_reg indirect_offset = get_indirect_offset(instr);
2485       unsigned imm_offset = instr->const_index[0];
2486
2487       const nir_src &vertex_src = instr->src[0];
2488       nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
2489
2490       fs_inst *inst;
2491
2492       fs_reg icp_handle;
2493
2494       if (vertex_const) {
2495          /* Emit a MOV to resolve <0,1,0> regioning. */
2496          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2497          bld.MOV(icp_handle,
2498                  retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
2499                                      vertex_const->i32[0] & 7),
2500                         BRW_REGISTER_TYPE_UD));
2501       } else if (tcs_prog_data->instances == 1 &&
2502                  vertex_src.is_ssa &&
2503                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2504                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2505          /* For the common case of only 1 instance, an array index of
2506           * gl_InvocationID means reading g1.  Skip all the indirect work.
2507           */
2508          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2509       } else {
2510          /* The vertex index is non-constant.  We need to use indirect
2511           * addressing to fetch the proper URB handle.
2512           */
2513          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2514
2515          /* Each ICP handle is a single DWord (4 bytes) */
2516          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2517          bld.SHL(vertex_offset_bytes,
2518                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2519                  brw_imm_ud(2u));
2520
2521          /* Start at g1.  We might read up to 4 registers. */
2522          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2523                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2524                   brw_imm_ud(4 * REG_SIZE));
2525       }
2526
2527       /* We can only read two double components with each URB read, so
2528        * we send two read messages in that case, each one loading up to
2529        * two double components.
2530        */
2531       unsigned num_iterations = 1;
2532       unsigned num_components = instr->num_components;
2533       unsigned first_component = nir_intrinsic_component(instr);
2534       fs_reg orig_dst = dst;
2535       if (type_sz(dst.type) == 8) {
2536          first_component = first_component / 2;
2537          if (instr->num_components > 2) {
2538             num_iterations = 2;
2539             num_components = 2;
2540          }
2541
2542          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2543          dst = tmp;
2544       }
2545
2546       for (unsigned iter = 0; iter < num_iterations; iter++) {
2547          if (indirect_offset.file == BAD_FILE) {
2548             /* Constant indexing - use global offset. */
2549             if (first_component != 0) {
2550                unsigned read_components = num_components + first_component;
2551                fs_reg tmp = bld.vgrf(dst.type, read_components);
2552                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2553                for (unsigned i = 0; i < num_components; i++) {
2554                   bld.MOV(offset(dst, bld, i),
2555                           offset(tmp, bld, i + first_component));
2556                }
2557             } else {
2558                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2559             }
2560             inst->offset = imm_offset;
2561             inst->mlen = 1;
2562          } else {
2563             /* Indirect indexing - use per-slot offsets as well. */
2564             const fs_reg srcs[] = { icp_handle, indirect_offset };
2565             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2566             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2567             if (first_component != 0) {
2568                unsigned read_components = num_components + first_component;
2569                fs_reg tmp = bld.vgrf(dst.type, read_components);
2570                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2571                                payload);
2572                for (unsigned i = 0; i < num_components; i++) {
2573                   bld.MOV(offset(dst, bld, i),
2574                           offset(tmp, bld, i + first_component));
2575                }
2576             } else {
2577                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2578                                payload);
2579             }
2580             inst->offset = imm_offset;
2581             inst->mlen = 2;
2582          }
2583          inst->size_written = (num_components + first_component) *
2584                               inst->dst.component_size(inst->exec_size);
2585
2586          /* If we are reading 64-bit data using 32-bit read messages we need
2587           * build proper 64-bit data elements by shuffling the low and high
2588           * 32-bit components around like we do for other things like UBOs
2589           * or SSBOs.
2590           */
2591          if (type_sz(dst.type) == 8) {
2592             shuffle_from_32bit_read(bld,
2593                                     offset(orig_dst, bld, iter * 2),
2594                                     retype(dst, BRW_REGISTER_TYPE_D),
2595                                     0, num_components);
2596          }
2597
2598          /* Copy the temporary to the destination to deal with writemasking.
2599           *
2600           * Also attempt to deal with gl_PointSize being in the .w component.
2601           */
2602          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2603             assert(type_sz(dst.type) < 8);
2604             inst->dst = bld.vgrf(dst.type, 4);
2605             inst->size_written = 4 * REG_SIZE;
2606             bld.MOV(dst, offset(inst->dst, bld, 3));
2607          }
2608
2609          /* If we are loading double data and we need a second read message
2610           * adjust the write offset
2611           */
2612          if (num_iterations > 1) {
2613             num_components = instr->num_components - 2;
2614             imm_offset++;
2615          }
2616       }
2617       break;
2618    }
2619
2620    case nir_intrinsic_load_output:
2621    case nir_intrinsic_load_per_vertex_output: {
2622       fs_reg indirect_offset = get_indirect_offset(instr);
2623       unsigned imm_offset = instr->const_index[0];
2624       unsigned first_component = nir_intrinsic_component(instr);
2625
2626       fs_inst *inst;
2627       if (indirect_offset.file == BAD_FILE) {
2628          /* Replicate the patch handle to all enabled channels */
2629          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2630          bld.MOV(patch_handle,
2631                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2632
2633          {
2634             if (first_component != 0) {
2635                unsigned read_components =
2636                   instr->num_components + first_component;
2637                fs_reg tmp = bld.vgrf(dst.type, read_components);
2638                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2639                                patch_handle);
2640                inst->size_written = read_components * REG_SIZE;
2641                for (unsigned i = 0; i < instr->num_components; i++) {
2642                   bld.MOV(offset(dst, bld, i),
2643                           offset(tmp, bld, i + first_component));
2644                }
2645             } else {
2646                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2647                                patch_handle);
2648                inst->size_written = instr->num_components * REG_SIZE;
2649             }
2650             inst->offset = imm_offset;
2651             inst->mlen = 1;
2652          }
2653       } else {
2654          /* Indirect indexing - use per-slot offsets as well. */
2655          const fs_reg srcs[] = {
2656             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2657             indirect_offset
2658          };
2659          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2660          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2661          if (first_component != 0) {
2662             unsigned read_components =
2663                instr->num_components + first_component;
2664             fs_reg tmp = bld.vgrf(dst.type, read_components);
2665             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2666                             payload);
2667             inst->size_written = read_components * REG_SIZE;
2668             for (unsigned i = 0; i < instr->num_components; i++) {
2669                bld.MOV(offset(dst, bld, i),
2670                        offset(tmp, bld, i + first_component));
2671             }
2672          } else {
2673             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2674                             payload);
2675             inst->size_written = instr->num_components * REG_SIZE;
2676          }
2677          inst->offset = imm_offset;
2678          inst->mlen = 2;
2679       }
2680       break;
2681    }
2682
2683    case nir_intrinsic_store_output:
2684    case nir_intrinsic_store_per_vertex_output: {
2685       fs_reg value = get_nir_src(instr->src[0]);
2686       bool is_64bit = (instr->src[0].is_ssa ?
2687          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2688       fs_reg indirect_offset = get_indirect_offset(instr);
2689       unsigned imm_offset = instr->const_index[0];
2690       unsigned mask = instr->const_index[1];
2691       unsigned header_regs = 0;
2692       fs_reg srcs[7];
2693       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2694
2695       if (indirect_offset.file != BAD_FILE) {
2696          srcs[header_regs++] = indirect_offset;
2697       }
2698
2699       if (mask == 0)
2700          break;
2701
2702       unsigned num_components = util_last_bit(mask);
2703       enum opcode opcode;
2704
2705       /* We can only pack two 64-bit components in a single message, so send
2706        * 2 messages if we have more components
2707        */
2708       unsigned num_iterations = 1;
2709       unsigned iter_components = num_components;
2710       unsigned first_component = nir_intrinsic_component(instr);
2711       if (is_64bit) {
2712          first_component = first_component / 2;
2713          if (instr->num_components > 2) {
2714             num_iterations = 2;
2715             iter_components = 2;
2716          }
2717       }
2718
2719       mask = mask << first_component;
2720
2721       for (unsigned iter = 0; iter < num_iterations; iter++) {
2722          if (!is_64bit && mask != WRITEMASK_XYZW) {
2723             srcs[header_regs++] = brw_imm_ud(mask << 16);
2724             opcode = indirect_offset.file != BAD_FILE ?
2725                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2726                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2727          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2728             /* Expand the 64-bit mask to 32-bit channels. We only handle
2729              * two channels in each iteration, so we only care about X/Y.
2730              */
2731             unsigned mask32 = 0;
2732             if (mask & WRITEMASK_X)
2733                mask32 |= WRITEMASK_XY;
2734             if (mask & WRITEMASK_Y)
2735                mask32 |= WRITEMASK_ZW;
2736
2737             /* If the mask does not include any of the channels X or Y there
2738              * is nothing to do in this iteration. Move on to the next couple
2739              * of 64-bit channels.
2740              */
2741             if (!mask32) {
2742                mask >>= 2;
2743                imm_offset++;
2744                continue;
2745             }
2746
2747             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2748             opcode = indirect_offset.file != BAD_FILE ?
2749                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2750                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2751          } else {
2752             opcode = indirect_offset.file != BAD_FILE ?
2753                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2754                SHADER_OPCODE_URB_WRITE_SIMD8;
2755          }
2756
2757          for (unsigned i = 0; i < iter_components; i++) {
2758             if (!(mask & (1 << (i + first_component))))
2759                continue;
2760
2761             if (!is_64bit) {
2762                srcs[header_regs + i + first_component] = offset(value, bld, i);
2763             } else {
2764                /* We need to shuffle the 64-bit data to match the layout
2765                 * expected by our 32-bit URB write messages. We use a temporary
2766                 * for that.
2767                 */
2768                unsigned channel = iter * 2 + i;
2769                fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
2770
2771                srcs[header_regs + (i + first_component) * 2] = dest;
2772                srcs[header_regs + (i + first_component) * 2 + 1] =
2773                   offset(dest, bld, 1);
2774             }
2775          }
2776
2777          unsigned mlen =
2778             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2779             (is_64bit ? 2 * first_component : first_component);
2780          fs_reg payload =
2781             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2782          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2783
2784          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2785          inst->offset = imm_offset;
2786          inst->mlen = mlen;
2787
2788          /* If this is a 64-bit attribute, select the next two 64-bit channels
2789           * to be handled in the next iteration.
2790           */
2791          if (is_64bit) {
2792             mask >>= 2;
2793             imm_offset++;
2794          }
2795       }
2796       break;
2797    }
2798
2799    default:
2800       nir_emit_intrinsic(bld, instr);
2801       break;
2802    }
2803 }
2804
     /**
      * Translate a NIR intrinsic found in a tessellation evaluation shader.
      *
      * Handles the primitive ID and tess-coord loads and the (per-vertex)
      * input loads.  Any other intrinsic is passed on to nir_emit_intrinsic().
      *
      * \param bld    builder the generated instructions are appended to
      * \param instr  the intrinsic being translated
      */
2805 void
2806 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2807                                    nir_intrinsic_instr *instr)
2808 {
2809    assert(stage == MESA_SHADER_TESS_EVAL);
2810    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2811
        /* Only intrinsics that produce a value have a destination to look up. */
2812    fs_reg dest;
2813    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2814       dest = get_nir_dest(instr->dest);
2815
2816    switch (instr->intrinsic) {
2817    case nir_intrinsic_load_primitive_id:
           /* Read from r0.1. */
2818       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2819       break;
2820    case nir_intrinsic_load_tess_coord:
2821       /* gl_TessCoord is part of the payload in g1-3 */
2822       for (unsigned i = 0; i < 3; i++) {
2823          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2824       }
2825       break;
2826
2827    case nir_intrinsic_load_input:
2828    case nir_intrinsic_load_per_vertex_input: {
2829       fs_reg indirect_offset = get_indirect_offset(instr);
2830       unsigned imm_offset = instr->const_index[0];
2831       unsigned first_component = nir_intrinsic_component(instr);
2832
           /* For 8-byte destination types the component index is halved. */
2833       if (type_sz(dest.type) == 8) {
2834          first_component = first_component / 2;
2835       }
2836
2837       fs_inst *inst;
2838       if (indirect_offset.file == BAD_FILE) {
2839          /* Arbitrarily only push up to 32 vec4 slots worth of data,
2840           * which is 16 registers (since each holds 2 vec4 slots).
2841           */
              /* A 64-bit load of more than two components spans two slots. */
2842          unsigned slot_count = 1;
2843          if (type_sz(dest.type) == 8 && instr->num_components > 2)
2844             slot_count++;
2845
2846          const unsigned max_push_slots = 32;
2847          if (imm_offset + slot_count <= max_push_slots) {
                 /* Read straight from the ATTR file.  imm_offset / 2 selects
                  * the register and imm_offset % 2 selects which 16-byte half
                  * of it the components are taken from.
                  */
2848             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2849             for (int i = 0; i < instr->num_components; i++) {
2850                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2851                   i + first_component;
2852                bld.MOV(offset(dest, bld, i), component(src, comp));
2853             }
2854
                 /* Grow the URB read length to cover the slots just used. */
2855             tes_prog_data->base.urb_read_length =
2856                MAX2(tes_prog_data->base.urb_read_length,
2857                     DIV_ROUND_UP(imm_offset + slot_count, 2));
2858          } else {
                 /* NOTE(review): unlike the indirect path below, this path
                  * does no 64-bit shuffle and sizes the write in whole
                  * REG_SIZE units per component - confirm that 64-bit inputs
                  * cannot reach it.
                  */
2859             /* Replicate the patch handle to all enabled channels */
2860             const fs_reg srcs[] = {
2861                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2862             };
2863             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2864             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2865
2866             if (first_component != 0) {
                    /* Read from component 0 into a temporary and copy out
                     * only the requested components.
                     */
2867                unsigned read_components =
2868                   instr->num_components + first_component;
2869                fs_reg tmp = bld.vgrf(dest.type, read_components);
2870                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2871                                patch_handle);
2872                inst->size_written = read_components * REG_SIZE;
2873                for (unsigned i = 0; i < instr->num_components; i++) {
2874                   bld.MOV(offset(dest, bld, i),
2875                           offset(tmp, bld, i + first_component));
2876                }
2877             } else {
2878                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2879                                patch_handle);
2880                inst->size_written = instr->num_components * REG_SIZE;
2881             }
2882             inst->mlen = 1;
2883             inst->offset = imm_offset;
2884          }
2885       } else {
2886          /* Indirect indexing - use per-slot offsets as well. */
2887
2888          /* We can only read two double components with each URB read, so
2889           * we send two read messages in that case, each one loading up to
2890           * two double components.
2891           */
2892          unsigned num_iterations = 1;
2893          unsigned num_components = instr->num_components;
2894          fs_reg orig_dest = dest;
2895          if (type_sz(dest.type) == 8) {
2896             if (instr->num_components > 2) {
2897                num_iterations = 2;
2898                num_components = 2;
2899             }
                 /* 64-bit data is read into this temporary and shuffled into
                  * the real destination (orig_dest) afterwards.
                  */
2900             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2901             dest = tmp;
2902          }
2903
2904          for (unsigned iter = 0; iter < num_iterations; iter++) {
                 /* Payload: the handle from r0.0 followed by the per-slot
                  * offsets.
                  */
2905             const fs_reg srcs[] = {
2906                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2907                indirect_offset
2908             };
2909             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2910             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2911
2912             if (first_component != 0) {
2913                unsigned read_components =
2914                    num_components + first_component;
2915                fs_reg tmp = bld.vgrf(dest.type, read_components);
2916                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2917                                payload);
2918                for (unsigned i = 0; i < num_components; i++) {
2919                   bld.MOV(offset(dest, bld, i),
2920                           offset(tmp, bld, i + first_component));
2921                }
2922             } else {
2923                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2924                                payload);
2925             }
2926             inst->mlen = 2;
2927             inst->offset = imm_offset;
                 /* The size written includes the skipped leading components. */
2928             inst->size_written = (num_components + first_component) *
2929                                  inst->dst.component_size(inst->exec_size);
2930
2931             /* If we are reading 64-bit data using 32-bit read messages we need
2932              * build proper 64-bit data elements by shuffling the low and high
2933              * 32-bit components around like we do for other things like UBOs
2934              * or SSBOs.
2935              */
2936             if (type_sz(dest.type) == 8) {
2937                shuffle_from_32bit_read(bld,
2938                                        offset(orig_dest, bld, iter * 2),
2939                                        retype(dest, BRW_REGISTER_TYPE_D),
2940                                        0, num_components);
2941             }
2942
2943             /* If we are loading double data and we need a second read message
2944              * adjust the offset
2945              */
2946             if (num_iterations > 1) {
2947                num_components = instr->num_components - 2;
2948                imm_offset++;
2949             }
2950          }
2951       }
2952       break;
2953    }
2954    default:
2955       nir_emit_intrinsic(bld, instr);
2956       break;
2957    }
2958 }
2959
2960 void
2961 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2962                                   nir_intrinsic_instr *instr)
2963 {
2964    assert(stage == MESA_SHADER_GEOMETRY);
2965    fs_reg indirect_offset;
2966
2967    fs_reg dest;
2968    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2969       dest = get_nir_dest(instr->dest);
2970
2971    switch (instr->intrinsic) {
2972    case nir_intrinsic_load_primitive_id:
2973       assert(stage == MESA_SHADER_GEOMETRY);
2974       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
2975       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2976               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2977       break;
2978
2979    case nir_intrinsic_load_input:
2980       unreachable("load_input intrinsics are invalid for the GS stage");
2981
2982    case nir_intrinsic_load_per_vertex_input:
2983       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2984                          instr->src[1], instr->num_components,
2985                          nir_intrinsic_component(instr));
2986       break;
2987
2988    case nir_intrinsic_emit_vertex_with_counter:
2989       emit_gs_vertex(instr->src[0], instr->const_index[0]);
2990       break;
2991
2992    case nir_intrinsic_end_primitive_with_counter:
2993       emit_gs_end_primitive(instr->src[0]);
2994       break;
2995
2996    case nir_intrinsic_set_vertex_count:
2997       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2998       break;
2999
3000    case nir_intrinsic_load_invocation_id: {
3001       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3002       assert(val.file != BAD_FILE);
3003       dest.type = val.type;
3004       bld.MOV(dest, val);
3005       break;
3006    }
3007
3008    default:
3009       nir_emit_intrinsic(bld, instr);
3010       break;
3011    }
3012 }
3013
3014 /**
3015  * Fetch the current render target layer index.
3016  */
3017 static fs_reg
3018 fetch_render_target_array_index(const fs_builder &bld)
3019 {
3020    if (bld.shader->devinfo->gen >= 6) {
3021       /* The render target array index is provided in the thread payload as
3022        * bits 26:16 of r0.0.
3023        */
3024       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3025       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3026               brw_imm_uw(0x7ff));
3027       return idx;
3028    } else {
3029       /* Pre-SNB we only ever render into the first layer of the framebuffer
3030        * since layered rendering is not implemented.
3031        */
3032       return brw_imm_ud(0);
3033    }
3034 }
3035
3036 /**
3037  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3038  * framebuffer at the current fragment coordinates and sample index.
3039  */
3040 fs_inst *
3041 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3042                                       unsigned target)
3043 {
3044    const struct gen_device_info *devinfo = bld.shader->devinfo;
3045
3046    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3047    const brw_wm_prog_key *wm_key =
3048       reinterpret_cast<const brw_wm_prog_key *>(key);
3049    assert(!wm_key->coherent_fb_fetch);
3050    const struct brw_wm_prog_data *wm_prog_data =
3051       brw_wm_prog_data(stage_prog_data);
3052
3053    /* Calculate the surface index relative to the start of the texture binding
3054     * table block, since that's what the texturing messages expect.
3055     */
3056    const unsigned surface = target +
3057       wm_prog_data->binding_table.render_target_read_start -
3058       wm_prog_data->base.binding_table.texture_start;
3059
3060    brw_mark_surface_used(
3061       bld.shader->stage_prog_data,
3062       wm_prog_data->binding_table.render_target_read_start + target);
3063
3064    /* Calculate the fragment coordinates. */
3065    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3066    bld.MOV(offset(coords, bld, 0), pixel_x);
3067    bld.MOV(offset(coords, bld, 1), pixel_y);
3068    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3069
3070    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3071     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3072     * shouldn't be necessary to recompile based on whether the framebuffer is
3073     * CMS or UMS.
3074     */
3075    if (wm_key->multisample_fbo &&
3076        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3077       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3078
3079    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3080    const fs_reg mcs = wm_key->multisample_fbo ?
3081       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
3082
3083    /* Use either a normal or a CMS texel fetch message depending on whether
3084     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3085     * message just in case the framebuffer uses 16x multisampling, it should
3086     * be equivalent to the normal CMS fetch for lower multisampling modes.
3087     */
3088    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3089                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3090                      SHADER_OPCODE_TXF_CMS_LOGICAL;
3091
3092    /* Emit the instruction. */
3093    const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
3094                            sample, mcs,
3095                            brw_imm_ud(surface), brw_imm_ud(0),
3096                            fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
3097    STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
3098
3099    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3100    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3101
3102    return inst;
3103 }
3104
3105 /**
3106  * Actual coherent framebuffer read implemented using the native render target
3107  * read message.  Requires SKL+.
3108  */
3109 static fs_inst *
3110 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3111 {
3112    assert(bld.shader->devinfo->gen >= 9);
3113    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3114    inst->target = target;
3115    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3116
3117    return inst;
3118 }
3119
3120 static fs_reg
3121 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3122 {
3123    if (n && regs[0].file != BAD_FILE) {
3124       return regs[0];
3125
3126    } else {
3127       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3128
3129       for (unsigned i = 0; i < n; i++)
3130          regs[i] = tmp;
3131
3132       return tmp;
3133    }
3134 }
3135
3136 static fs_reg
3137 alloc_frag_output(fs_visitor *v, unsigned location)
3138 {
3139    assert(v->stage == MESA_SHADER_FRAGMENT);
3140    const brw_wm_prog_key *const key =
3141       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3142    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3143    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3144
3145    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3146       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3147
3148    else if (l == FRAG_RESULT_COLOR)
3149       return alloc_temporary(v->bld, 4, v->outputs,
3150                              MAX2(key->nr_color_regions, 1));
3151
3152    else if (l == FRAG_RESULT_DEPTH)
3153       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3154
3155    else if (l == FRAG_RESULT_STENCIL)
3156       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3157
3158    else if (l == FRAG_RESULT_SAMPLE_MASK)
3159       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3160
3161    else if (l >= FRAG_RESULT_DATA0 &&
3162             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3163       return alloc_temporary(v->bld, 4,
3164                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3165
3166    else
3167       unreachable("Invalid location");
3168 }
3169
/**
 * Emit code for a NIR intrinsic in a fragment shader.
 *
 * Handles the fragment-specific intrinsics listed in the switch below
 * (system value loads, output stores and loads, discards, flat and
 * interpolated input loads and the barycentric queries).  Any other
 * intrinsic is forwarded to nir_emit_intrinsic().
 *
 * \param bld    builder used to emit the generated instructions
 * \param instr  NIR intrinsic instruction to translate
 */
3170 void
3171 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3172                                   nir_intrinsic_instr *instr)
3173 {
3174    assert(stage == MESA_SHADER_FRAGMENT);
3175
        /* Only intrinsics that produce a value get a destination register. */
3176    fs_reg dest;
3177    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3178       dest = get_nir_dest(instr->dest);
3179
3180    switch (instr->intrinsic) {
3181    case nir_intrinsic_load_front_face:
3182       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3183               *emit_frontfacing_interpolation());
3184       break;
3185
3186    case nir_intrinsic_load_sample_pos: {
           /* Copy both components of the SAMPLE_POS system value, keeping
            * the type the system value was set up with.
            */
3187       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3188       assert(sample_pos.file != BAD_FILE);
3189       dest.type = sample_pos.type;
3190       bld.MOV(dest, sample_pos);
3191       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3192       break;
3193    }
3194
3195    case nir_intrinsic_load_layer_id:
3196       dest.type = BRW_REGISTER_TYPE_UD;
3197       bld.MOV(dest, fetch_render_target_array_index(bld));
3198       break;
3199
3200    case nir_intrinsic_load_helper_invocation:
3201    case nir_intrinsic_load_sample_mask_in:
3202    case nir_intrinsic_load_sample_id: {
           /* These are single-component copies of the matching system value
            * register, which must already have been set up.
            */
3203       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3204       fs_reg val = nir_system_values[sv];
3205       assert(val.file != BAD_FILE);
3206       dest.type = val.type;
3207       bld.MOV(dest, val);
3208       break;
3209    }
3210
3211    case nir_intrinsic_store_output: {
3212       const fs_reg src = get_nir_src(instr->src[0]);
3213       const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3214       assert(const_offset && "Indirect output stores not allowed");
           /* The constant offset is folded into the location field of the
            * intrinsic base before looking up the output temporary.
            */
3215       const unsigned location = nir_intrinsic_base(instr) +
3216          SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
3217       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3218                                      src.type);
3219
           /* Copy the written components, starting at the intrinsic's first
            * component within the output.
            */
3220       for (unsigned j = 0; j < instr->num_components; j++)
3221          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3222                  offset(src, bld, j));
3223
3224       break;
3225    }
3226
3227    case nir_intrinsic_load_output: {
3228       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3229                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3230       assert(l >= FRAG_RESULT_DATA0);
3231       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3232       assert(const_offset && "Indirect output loads not allowed");
3233       const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
           /* The read always fills a 4-component temporary; only the
            * requested components are copied to the destination below.
            */
3234       const fs_reg tmp = bld.vgrf(dest.type, 4);
3235
3236       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3237          emit_coherent_fb_read(bld, tmp, target);
3238       else
3239          emit_non_coherent_fb_read(bld, tmp, target);
3240
3241       for (unsigned j = 0; j < instr->num_components; j++) {
3242          bld.MOV(offset(dest, bld, j),
3243                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3244       }
3245
3246       break;
3247    }
3248
3249    case nir_intrinsic_discard:
3250    case nir_intrinsic_discard_if: {
3251       /* We track our discarded pixels in f0.1.  By predicating on it, we can
3252        * update just the flag bits that aren't yet discarded.  If there's no
3253        * condition, we emit a CMP of g0 != g0, so all currently executing
3254        * channels will get turned off.
3255        */
3256       fs_inst *cmp;
3257       if (instr->intrinsic == nir_intrinsic_discard_if) {
3258          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3259                        brw_imm_d(0), BRW_CONDITIONAL_Z);
3260       } else {
3261          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3262                                        BRW_REGISTER_TYPE_UW));
3263          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3264       }
3265       cmp->predicate = BRW_PREDICATE_NORMAL;
3266       cmp->flag_subreg = 1;
3267
3268       if (devinfo->gen >= 6) {
3269          emit_discard_jump();
3270       }
3271
3272       limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
3273       break;
3274    }
3275
3276    case nir_intrinsic_load_input: {
3277       /* load_input is only used for flat inputs */
3278       unsigned base = nir_intrinsic_base(instr);
3279       unsigned comp = nir_intrinsic_component(instr);
3280       unsigned num_components = instr->num_components;
3281       fs_reg orig_dest = dest;
3282       enum brw_reg_type type = dest.type;
3283
3284       /* Special case fields in the VUE header */
3285       if (base == VARYING_SLOT_LAYER)
3286          comp = 1;
3287       else if (base == VARYING_SLOT_VIEWPORT)
3288          comp = 2;
3289
3290       if (nir_dest_bit_size(instr->dest) == 64) {
3291          /* const_index is in 32-bit type size units that could not be aligned
3292           * with DF. We need to read the double vector as if it was a float
3293           * vector of twice the number of components to fetch the right data.
3294           */
3295          type = BRW_REGISTER_TYPE_F;
3296          num_components *= 2;
3297          dest = bld.vgrf(type, num_components);
3298       }
3299
3300       for (unsigned int i = 0; i < num_components; i++) {
3301          bld.MOV(offset(retype(dest, type), bld, i),
3302                  retype(component(interp_reg(base, comp + i), 3), type));
3303       }
3304
           /* For 64-bit destinations the data was read into a temporary as
            * 32-bit values; move it into the original destination.
            */
3305       if (nir_dest_bit_size(instr->dest) == 64) {
3306          shuffle_from_32bit_read(bld, orig_dest, dest, 0,
3307                                  instr->num_components);
3308       }
3309       break;
3310    }
3311
3312    case nir_intrinsic_load_barycentric_pixel:
3313    case nir_intrinsic_load_barycentric_centroid:
3314    case nir_intrinsic_load_barycentric_sample:
3315       /* Do nothing - load_interpolated_input handling will handle it later. */
3316       break;
3317
3318    case nir_intrinsic_load_barycentric_at_sample: {
3319       const glsl_interp_mode interpolation =
3320          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3321
3322       nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
3323
3324       if (const_sample) {
              /* The sample number is shifted left by 4 bits to build the
               * message data passed to the pixel interpolater send.
               */
3325          unsigned msg_data = const_sample->i32[0] << 4;
3326
3327          emit_pixel_interpolater_send(bld,
3328                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3329                                       dest,
3330                                       fs_reg(), /* src */
3331                                       brw_imm_ud(msg_data),
3332                                       interpolation);
3333       } else {
3334          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3335                                           BRW_REGISTER_TYPE_UD);
3336
3337          if (nir_src_is_dynamically_uniform(instr->src[0])) {
                 /* Every channel asks for the same sample, so a single
                  * message built from the uniformized sample number is
                  * enough.
                  */
3338             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3339             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3340             bld.exec_all().group(1, 0)
3341                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3342             emit_pixel_interpolater_send(bld,
3343                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3344                                          dest,
3345                                          fs_reg(), /* src */
3346                                          msg_data,
3347                                          interpolation);
3348          } else {
3349             /* Make a loop that sends a message to the pixel interpolater
3350              * for the sample number in each live channel. If there are
3351              * multiple channels with the same sample number then these
3352              * will be handled simultaneously with a single iteration of
3353              * the loop.
3354              */
3355             bld.emit(BRW_OPCODE_DO);
3356
3357             /* Get the next live sample number into sample_id */
3358             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3359
3360             /* Set the flag register so that we can perform the send
3361              * message on all channels that have the same sample number
3362              */
3363             bld.CMP(bld.null_reg_ud(),
3364                     sample_src, sample_id,
3365                     BRW_CONDITIONAL_EQ);
3366             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3367             bld.exec_all().group(1, 0)
3368                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3369             fs_inst *inst =
3370                emit_pixel_interpolater_send(bld,
3371                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3372                                             dest,
3373                                             fs_reg(), /* src */
3374                                             component(msg_data, 0),
3375                                             interpolation);
3376             set_predicate(BRW_PREDICATE_NORMAL, inst);
3377
3378             /* Continue the loop if there are any live channels left */
3379             set_predicate_inv(BRW_PREDICATE_NORMAL,
3380                               true, /* inverse */
3381                               bld.emit(BRW_OPCODE_WHILE));
3382          }
3383       }
3384       break;
3385    }
3386
3387    case nir_intrinsic_load_barycentric_at_offset: {
3388       const glsl_interp_mode interpolation =
3389          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3390
3391       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3392
3393       if (const_offset) {
              /* Scale the constant offsets by 16, clamp the upper end to 7
               * (the same clamp as the non-constant path below) and keep
               * the low 4 bits of each; x goes in bits 0-3 and y in bits
               * 4-7 of the message data.
               */
3394          unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3395          unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3396
3397          emit_pixel_interpolater_send(bld,
3398                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3399                                       dest,
3400                                       fs_reg(), /* src */
3401                                       brw_imm_ud(off_x | (off_y << 4)),
3402                                       interpolation);
3403       } else {
3404          fs_reg src = vgrf(glsl_type::ivec2_type);
3405          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3406                                     BRW_REGISTER_TYPE_F);
3407          for (int i = 0; i < 2; i++) {
3408             fs_reg temp = vgrf(glsl_type::float_type);
3409             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3410             fs_reg itemp = vgrf(glsl_type::int_type);
3411             /* float to int */
3412             bld.MOV(itemp, temp);
3413
3414             /* Clamp the upper end of the range to +7/16.
3415              * ARB_gpu_shader5 requires that we support a maximum offset
3416              * of +0.5, which isn't representable in a S0.4 value -- if
3417              * we didn't clamp it, we'd end up with -8/16, which is the
3418              * opposite of what the shader author wanted.
3419              *
3420              * This is legal due to ARB_gpu_shader5's quantization
3421              * rules:
3422              *
3423              * "Not all values of <offset> may be supported; x and y
3424              * offsets may be rounded to fixed-point values with the
3425              * number of fraction bits given by the
3426              * implementation-dependent constant
3427              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3428              */
3429             set_condmod(BRW_CONDITIONAL_L,
3430                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3431          }
3432
3433          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3434          emit_pixel_interpolater_send(bld,
3435                                       opcode,
3436                                       dest,
3437                                       src,
3438                                       brw_imm_ud(0u),
3439                                       interpolation);
3440       }
3441       break;
3442    }
3443
3444    case nir_intrinsic_load_interpolated_input: {
           /* The fragment position has its own interpolation helper. */
3445       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3446          emit_fragcoord_interpolation(dest);
3447          break;
3448       }
3449
           /* The barycentric source must be produced by an intrinsic; its
            * opcode and interpolation mode select the coordinates used.
            */
3450       assert(instr->src[0].ssa &&
3451              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3452       nir_intrinsic_instr *bary_intrinsic =
3453          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3454       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3455       enum glsl_interp_mode interp_mode =
3456          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3457       fs_reg dst_xy;
3458
3459       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3460           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3461          /* Use the result of the PI message */
3462          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3463       } else {
3464          /* Use the delta_xy values computed from the payload */
3465          enum brw_barycentric_mode bary =
3466             brw_barycentric_mode(interp_mode, bary_intrin);
3467
3468          dst_xy = this->delta_xy[bary];
3469       }
3470
3471       for (unsigned int i = 0; i < instr->num_components; i++) {
3472          fs_reg interp =
3473             component(interp_reg(nir_intrinsic_base(instr),
3474                                  nir_intrinsic_component(instr) + i), 0);
3475          interp.type = BRW_REGISTER_TYPE_F;
3476          dest.type = BRW_REGISTER_TYPE_F;
3477
              /* Before gen6, smooth interpolation takes an extra multiply of
               * the LINTERP result by pixel_w.
               */
3478          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3479             fs_reg tmp = vgrf(glsl_type::float_type);
3480             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3481             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3482          } else {
3483             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3484          }
3485       }
3486       break;
3487    }
3488
3489    default:
           /* Not fragment-specific: use the stage-independent handler. */
3490       nir_emit_intrinsic(bld, instr);
3491       break;
3492    }
3493 }
3494
/**
 * Choose the atomic opcode used to implement an atomic add.
 *
 * When the addend is a compile-time constant equal to 1 or -1 the add is
 * expressed as BRW_AOP_INC or BRW_AOP_DEC respectively; every other addend
 * (constant or not) uses BRW_AOP_ADD.
 *
 * \param instr  atomic intrinsic being translated
 * \param src    index of the intrinsic source that holds the addend
 */
3495 static int
3496 get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
3497 {
3498    const nir_const_value *const imm = nir_src_as_const_value(instr->src[src]);
3499
3500    if (imm == NULL)
3501       return BRW_AOP_ADD;
3502
3503    switch (imm->i32[0]) {
3504    case 1:  return BRW_AOP_INC;
3505    case -1: return BRW_AOP_DEC;
3506    default: return BRW_AOP_ADD;
3507    }
3508 }
3509
/**
 * Emit code for a NIR intrinsic in a compute shader.
 *
 * Handles barriers, the compute system values and the shared-memory loads,
 * stores and atomics listed in the switch below.  Any other intrinsic is
 * forwarded to nir_emit_intrinsic().
 *
 * \param bld    builder used to emit the generated instructions
 * \param instr  NIR intrinsic instruction to translate
 */
3510 void
3511 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3512                                   nir_intrinsic_instr *instr)
3513 {
3514    assert(stage == MESA_SHADER_COMPUTE);
3515    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3516
        /* Only intrinsics that produce a value get a destination register. */
3517    fs_reg dest;
3518    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3519       dest = get_nir_dest(instr->dest);
3520
3521    switch (instr->intrinsic) {
3522    case nir_intrinsic_barrier:
           /* Emit the barrier and record its use in the program data. */
3523       emit_barrier();
3524       cs_prog_data->uses_barrier = true;
3525       break;
3526
3527    case nir_intrinsic_load_subgroup_id:
3528       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3529       break;
3530
3531    case nir_intrinsic_load_local_invocation_id:
3532    case nir_intrinsic_load_work_group_id: {
           /* Three-component copies of the matching system value register,
            * which must already have been set up.
            */
3533       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3534       fs_reg val = nir_system_values[sv];
3535       assert(val.file != BAD_FILE);
3536       dest.type = val.type;
3537       for (unsigned i = 0; i < 3; i++)
3538          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3539       break;
3540    }
3541
3542    case nir_intrinsic_load_num_work_groups: {
           /* The counts are fetched with untyped reads from the surface at
            * binding table entry work_groups_start, which is marked as used.
            */
3543       const unsigned surface =
3544          cs_prog_data->binding_table.work_groups_start;
3545
3546       cs_prog_data->uses_num_work_groups = true;
3547
3548       fs_reg surf_index = brw_imm_ud(surface);
3549       brw_mark_surface_used(prog_data, surface);
3550
3551       /* Read the 3 GLuint components of gl_NumWorkGroups */
3552       for (unsigned i = 0; i < 3; i++) {
3553          fs_reg read_result =
3554             emit_untyped_read(bld, surf_index,
3555                               brw_imm_ud(i << 2),
3556                               1 /* dims */, 1 /* size */,
3557                               BRW_PREDICATE_NONE);
3558          read_result.type = dest.type;
3559          bld.MOV(dest, read_result);
3560          dest = offset(dest, bld, 1);
3561       }
3562       break;
3563    }
3564
        /* Shared-memory atomics.  For the add, get_op_for_atomic_add() picks
         * INC or DEC instead of ADD when source 1 is a constant 1 or -1.
         */
3565    case nir_intrinsic_shared_atomic_add:
3566       nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
3567       break;
3568    case nir_intrinsic_shared_atomic_imin:
3569       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3570       break;
3571    case nir_intrinsic_shared_atomic_umin:
3572       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3573       break;
3574    case nir_intrinsic_shared_atomic_imax:
3575       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3576       break;
3577    case nir_intrinsic_shared_atomic_umax:
3578       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3579       break;
3580    case nir_intrinsic_shared_atomic_and:
3581       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3582       break;
3583    case nir_intrinsic_shared_atomic_or:
3584       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3585       break;
3586    case nir_intrinsic_shared_atomic_xor:
3587       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3588       break;
3589    case nir_intrinsic_shared_atomic_exchange:
3590       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3591       break;
3592    case nir_intrinsic_shared_atomic_comp_swap:
3593       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3594       break;
3595    case nir_intrinsic_shared_atomic_fmin:
3596       nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
3597       break;
3598    case nir_intrinsic_shared_atomic_fmax:
3599       nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
3600       break;
3601    case nir_intrinsic_shared_atomic_fcomp_swap:
3602       nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
3603       break;
3604
3605    case nir_intrinsic_load_shared: {
3606       assert(devinfo->gen >= 7);
3607
3608       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3609
3610       /* Get the offset to read from */
3611       fs_reg offset_reg;
3612       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3613       if (const_offset) {
              /* Constant offset: fold it with const_index[0] into a single
               * immediate.
               */
3614          offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
3615       } else {
3616          offset_reg = vgrf(glsl_type::uint_type);
3617          bld.ADD(offset_reg,
3618                  retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
3619                  brw_imm_ud(instr->const_index[0]));
3620       }
3621
3622       /* Read the vector */
3623       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3624                              instr->num_components);
3625       break;
3626    }
3627
3628    case nir_intrinsic_store_shared: {
3629       assert(devinfo->gen >= 7);
3630
3631       /* Block index */
3632       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3633
3634       /* Value */
3635       fs_reg val_reg = get_nir_src(instr->src[0]);
3636
3637       /* Writemask */
3638       unsigned writemask = instr->const_index[1];
3639
3640       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
3641        * since the untyped writes below operate in units of 32-bits, which
3642        * means that we need to write twice as many components each time.
3643        * Also, we have to shuffle 64-bit data to be in the appropriate layout
3644        * expected by our 32-bit write messages.
3645        */
3646       unsigned type_size = 4;
3647       if (nir_src_bit_size(instr->src[0]) == 64) {
3648          type_size = 8;
3649          val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
3650                                            instr->num_components);
3651       }
3652
           /* Number of 32-bit slots taken by one component of the value. */
3653       unsigned type_slots = type_size / 4;
3654
3655       /* Combine groups of consecutive enabled channels in one write
3656        * message. We use ffs to find the first enabled channel and then ffs on
3657        * the bit-inverse, down-shifted writemask to determine the length of
3658        * the block of enabled bits.
3659        */
3660       while (writemask) {
3661          unsigned first_component = ffs(writemask) - 1;
3662          unsigned length = ffs(~(writemask >> first_component)) - 1;
3663
3664          /* We can't write more than 2 64-bit components at once. Limit the
3665           * length of the write to what we can do and let the next iteration
3666           * handle the rest
3667           */
3668          if (type_size > 4)
3669             length = MIN2(2, length);
3670
              /* The byte offset of this run is const_index[0] plus the
               * source offset plus the size of the components skipped.
               */
3671          fs_reg offset_reg;
3672          nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3673          if (const_offset) {
3674             offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
3675                                     type_size * first_component);
3676          } else {
3677             offset_reg = vgrf(glsl_type::uint_type);
3678             bld.ADD(offset_reg,
3679                     retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
3680                     brw_imm_ud(instr->const_index[0] + type_size * first_component));
3681          }
3682
3683          emit_untyped_write(bld, surf_index, offset_reg,
3684                             offset(val_reg, bld, first_component * type_slots),
3685                             1 /* dims */, length * type_slots,
3686                             BRW_PREDICATE_NONE);
3687
3688          /* Clear the bits in the writemask that we just wrote, then try
3689           * again to see if more channels are left.
3690           *
3691           * NOTE(review): the 15 << n mask keeps only bits n..n+3, which
3692           * presumably relies on the writemask having at most four bits.
3693           */
3691          writemask &= (15 << (first_component + length));
3692       }
3693
3694       break;
3695    }
3696
3697    default:
           /* Not compute-specific: use the stage-independent handler. */
3698       nir_emit_intrinsic(bld, instr);
3699       break;
3700    }
3701 }
3702
3703 static fs_reg
3704 brw_nir_reduction_op_identity(const fs_builder &bld,
3705                               nir_op op, brw_reg_type type)
3706 {
3707    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3708    switch (type_sz(type)) {
3709    case 2:
3710       assert(type != BRW_REGISTER_TYPE_HF);
3711       return retype(brw_imm_uw(value.u16[0]), type);
3712    case 4:
3713       return retype(brw_imm_ud(value.u32[0]), type);
3714    case 8:
3715       if (type == BRW_REGISTER_TYPE_DF)
3716          return setup_imm_df(bld, value.f64[0]);
3717       else
3718          return retype(brw_imm_u64(value.u64[0]), type);
3719    default:
3720       unreachable("Invalid type size");
3721    }
3722 }
3723
3724 static opcode
3725 brw_op_for_nir_reduction_op(nir_op op)
3726 {
3727    switch (op) {
3728    case nir_op_iadd: return BRW_OPCODE_ADD;
3729    case nir_op_fadd: return BRW_OPCODE_ADD;
3730    case nir_op_imul: return BRW_OPCODE_MUL;
3731    case nir_op_fmul: return BRW_OPCODE_MUL;
3732    case nir_op_imin: return BRW_OPCODE_SEL;
3733    case nir_op_umin: return BRW_OPCODE_SEL;
3734    case nir_op_fmin: return BRW_OPCODE_SEL;
3735    case nir_op_imax: return BRW_OPCODE_SEL;
3736    case nir_op_umax: return BRW_OPCODE_SEL;
3737    case nir_op_fmax: return BRW_OPCODE_SEL;
3738    case nir_op_iand: return BRW_OPCODE_AND;
3739    case nir_op_ior:  return BRW_OPCODE_OR;
3740    case nir_op_ixor: return BRW_OPCODE_XOR;
3741    default:
3742       unreachable("Invalid reduction operation");
3743    }
3744 }
3745
3746 static brw_conditional_mod
3747 brw_cond_mod_for_nir_reduction_op(nir_op op)
3748 {
3749    switch (op) {
3750    case nir_op_iadd: return BRW_CONDITIONAL_NONE;
3751    case nir_op_fadd: return BRW_CONDITIONAL_NONE;
3752    case nir_op_imul: return BRW_CONDITIONAL_NONE;
3753    case nir_op_fmul: return BRW_CONDITIONAL_NONE;
3754    case nir_op_imin: return BRW_CONDITIONAL_L;
3755    case nir_op_umin: return BRW_CONDITIONAL_L;
3756    case nir_op_fmin: return BRW_CONDITIONAL_L;
3757    case nir_op_imax: return BRW_CONDITIONAL_GE;
3758    case nir_op_umax: return BRW_CONDITIONAL_GE;
3759    case nir_op_fmax: return BRW_CONDITIONAL_GE;
3760    case nir_op_iand: return BRW_CONDITIONAL_NONE;
3761    case nir_op_ior:  return BRW_CONDITIONAL_NONE;
3762    case nir_op_ixor: return BRW_CONDITIONAL_NONE;
3763    default:
3764       unreachable("Invalid reduction operation");
3765    }
3766 }
3767
3768 fs_reg
3769 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3770                                           nir_intrinsic_instr *instr)
3771 {
3772    fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3773
3774    if (stage_prog_data->binding_table.image_start > 0) {
3775       if (image.file == BRW_IMMEDIATE_VALUE) {
3776          image.d += stage_prog_data->binding_table.image_start;
3777       } else {
3778          bld.ADD(image, image,
3779                  brw_imm_d(stage_prog_data->binding_table.image_start));
3780       }
3781    }
3782
3783    return bld.emit_uniformize(image);
3784 }
3785
3786 static unsigned
3787 image_intrinsic_coord_components(nir_intrinsic_instr *instr)
3788 {
3789    switch (nir_intrinsic_image_dim(instr)) {
3790    case GLSL_SAMPLER_DIM_1D:
3791       return 1 + nir_intrinsic_image_array(instr);
3792    case GLSL_SAMPLER_DIM_2D:
3793    case GLSL_SAMPLER_DIM_RECT:
3794       return 2 + nir_intrinsic_image_array(instr);
3795    case GLSL_SAMPLER_DIM_3D:
3796    case GLSL_SAMPLER_DIM_CUBE:
3797       return 3;
3798    case GLSL_SAMPLER_DIM_BUF:
3799       return 1;
3800    case GLSL_SAMPLER_DIM_MS:
3801       return 2 + nir_intrinsic_image_array(instr);
3802    default:
3803       unreachable("Invalid image dimension");
3804    }
3805 }
3806
3807 void
3808 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3809 {
3810    fs_reg dest;
3811    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3812       dest = get_nir_dest(instr->dest);
3813
3814    switch (instr->intrinsic) {
3815    case nir_intrinsic_image_load:
3816    case nir_intrinsic_image_store:
3817    case nir_intrinsic_image_atomic_add:
3818    case nir_intrinsic_image_atomic_min:
3819    case nir_intrinsic_image_atomic_max:
3820    case nir_intrinsic_image_atomic_and:
3821    case nir_intrinsic_image_atomic_or:
3822    case nir_intrinsic_image_atomic_xor:
3823    case nir_intrinsic_image_atomic_exchange:
3824    case nir_intrinsic_image_atomic_comp_swap: {
3825       if (stage == MESA_SHADER_FRAGMENT &&
3826           instr->intrinsic != nir_intrinsic_image_load)
3827          brw_wm_prog_data(prog_data)->has_side_effects = true;
3828
3829       /* Get some metadata from the image intrinsic. */
3830       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3831       const unsigned dims = image_intrinsic_coord_components(instr);
3832       const GLenum format = nir_intrinsic_format(instr);
3833       const unsigned dest_components = nir_intrinsic_dest_components(instr);
3834
3835       /* Get the arguments of the image intrinsic. */
3836       const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3837       const fs_reg coords = retype(get_nir_src(instr->src[1]),
3838                                    BRW_REGISTER_TYPE_UD);
3839       fs_reg tmp;
3840
3841       /* Emit an image load, store or atomic op. */
3842       if (instr->intrinsic == nir_intrinsic_image_load) {
3843          tmp = emit_typed_read(bld, image, coords, dims,
3844                                instr->num_components);
3845       } else if (instr->intrinsic == nir_intrinsic_image_store) {
3846          const fs_reg src0 = get_nir_src(instr->src[3]);
3847          emit_typed_write(bld, image, coords, src0, dims,
3848                           instr->num_components);
3849       } else {
3850          int op;
3851          unsigned num_srcs = info->num_srcs;
3852
3853          switch (instr->intrinsic) {
3854          case nir_intrinsic_image_atomic_add:
3855             assert(num_srcs == 4);
3856
3857             op = get_op_for_atomic_add(instr, 3);
3858
3859             if (op != BRW_AOP_ADD)
3860                num_srcs = 3;
3861             break;
3862          case nir_intrinsic_image_atomic_min:
3863             assert(format == GL_R32UI || format == GL_R32I);
3864             op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
3865             break;
3866          case nir_intrinsic_image_atomic_max:
3867             assert(format == GL_R32UI || format == GL_R32I);
3868             op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
3869             break;
3870          case nir_intrinsic_image_atomic_and:
3871             op = BRW_AOP_AND;
3872             break;
3873          case nir_intrinsic_image_atomic_or:
3874             op = BRW_AOP_OR;
3875             break;
3876          case nir_intrinsic_image_atomic_xor:
3877             op = BRW_AOP_XOR;
3878             break;
3879          case nir_intrinsic_image_atomic_exchange:
3880             op = BRW_AOP_MOV;
3881             break;
3882          case nir_intrinsic_image_atomic_comp_swap:
3883             op = BRW_AOP_CMPWR;
3884             break;
3885          default:
3886             unreachable("Not reachable.");
3887          }
3888
3889          const fs_reg src0 = (num_srcs >= 4 ?
3890                               get_nir_src(instr->src[3]) : fs_reg());
3891          const fs_reg src1 = (num_srcs >= 5 ?
3892                               get_nir_src(instr->src[4]) : fs_reg());
3893
3894          tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op);
3895       }
3896
3897       /* Assign the result. */
3898       for (unsigned c = 0; c < dest_components; ++c) {
3899          bld.MOV(offset(retype(dest, tmp.type), bld, c),
3900                  offset(tmp, bld, c));
3901       }
3902       break;
3903    }
3904
3905    case nir_intrinsic_image_size: {
3906       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
3907        * into will handle the binding table index for us in the generator.
3908        */
3909       fs_reg image = retype(get_nir_src_imm(instr->src[0]),
3910                             BRW_REGISTER_TYPE_UD);
3911       image = bld.emit_uniformize(image);
3912
3913       /* Since the image size is always uniform, we can just emit a SIMD8
3914        * query instruction and splat the result out.
3915        */
3916       const fs_builder ubld = bld.exec_all().group(8, 0);
3917
3918       /* The LOD also serves as the message payload */
3919       fs_reg lod = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3920       ubld.MOV(lod, brw_imm_ud(0));
3921
3922       fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
3923       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE, tmp, lod, image);
3924       inst->mlen = 1;
3925       inst->size_written = 4 * REG_SIZE;
3926
3927       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
3928          if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
3929             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3930                      offset(retype(dest, tmp.type), bld, c),
3931                      component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
3932          } else {
3933             bld.MOV(offset(retype(dest, tmp.type), bld, c),
3934                     component(offset(tmp, ubld, c), 0));
3935          }
3936       }
3937       break;
3938    }
3939
3940    case nir_intrinsic_image_load_raw_intel: {
3941       const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3942       const fs_reg addr = retype(get_nir_src(instr->src[1]),
3943                                  BRW_REGISTER_TYPE_UD);
3944
3945       fs_reg tmp = emit_untyped_read(bld, image, addr, 1,
3946                                      instr->num_components);
3947
3948       for (unsigned c = 0; c < instr->num_components; ++c) {
3949          bld.MOV(offset(retype(dest, tmp.type), bld, c),
3950                  offset(tmp, bld, c));
3951       }
3952       break;
3953    }
3954
3955    case nir_intrinsic_image_store_raw_intel: {
3956       const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3957       const fs_reg addr = retype(get_nir_src(instr->src[1]),
3958                                  BRW_REGISTER_TYPE_UD);
3959       const fs_reg data = retype(get_nir_src(instr->src[2]),
3960                                  BRW_REGISTER_TYPE_UD);
3961
3962       brw_wm_prog_data(prog_data)->has_side_effects = true;
3963
3964       emit_untyped_write(bld, image, addr, data, 1,
3965                          instr->num_components);
3966       break;
3967    }
3968
3969    case nir_intrinsic_group_memory_barrier:
3970    case nir_intrinsic_memory_barrier_shared:
3971    case nir_intrinsic_memory_barrier_atomic_counter:
3972    case nir_intrinsic_memory_barrier_buffer:
3973    case nir_intrinsic_memory_barrier_image:
3974    case nir_intrinsic_memory_barrier: {
3975       const fs_builder ubld = bld.group(8, 0);
3976       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3977       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3978          ->size_written = 2 * REG_SIZE;
3979       break;
3980    }
3981
3982    case nir_intrinsic_shader_clock: {
3983       /* We cannot do anything if there is an event, so ignore it for now */
3984       const fs_reg shader_clock = get_timestamp(bld);
3985       const fs_reg srcs[] = { component(shader_clock, 0),
3986                               component(shader_clock, 1) };
3987       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3988       break;
3989    }
3990
3991    case nir_intrinsic_image_samples:
3992       /* The driver does not support multi-sampled images. */
3993       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3994       break;
3995
3996    case nir_intrinsic_load_uniform: {
3997       /* Offsets are in bytes but they should always be aligned to
3998        * the type size
3999        */
4000       assert(instr->const_index[0] % 4 == 0 ||
4001              instr->const_index[0] % type_sz(dest.type) == 0);
4002
4003       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4004
4005       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4006       if (const_offset) {
4007          assert(const_offset->u32[0] % type_sz(dest.type) == 0);
4008          /* For 16-bit types we add const_index[0] modulo 4 to the offset
4009           * so that elements which are not 32-bit aligned can be accessed.
4010           */
4011          src.offset = const_offset->u32[0] + instr->const_index[0] % 4;
4012
4013          for (unsigned j = 0; j < instr->num_components; j++) {
4014             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4015          }
4016       } else {
4017          fs_reg indirect = retype(get_nir_src(instr->src[0]),
4018                                   BRW_REGISTER_TYPE_UD);
4019
4020          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4021           * go past the end of the uniform.  In order to keep the n'th
4022           * component from running past, we subtract off the size of all but
4023           * one component of the vector.
4024           */
4025          assert(instr->const_index[1] >=
4026                 instr->num_components * (int) type_sz(dest.type));
4027          unsigned read_size = instr->const_index[1] -
4028             (instr->num_components - 1) * type_sz(dest.type);
4029
4030          bool supports_64bit_indirects =
4031             !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
4032
4033          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4034             for (unsigned j = 0; j < instr->num_components; j++) {
4035                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4036                         offset(dest, bld, j), offset(src, bld, j),
4037                         indirect, brw_imm_ud(read_size));
4038             }
4039          } else {
4040             const unsigned num_mov_indirects =
4041                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4042             /* We read a little bit less per MOV INDIRECT, as they are now
4043              * 32-bits ones instead of 64-bit. Fix read_size then.
4044              */
4045             const unsigned read_size_32bit = read_size -
4046                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4047             for (unsigned j = 0; j < instr->num_components; j++) {
4048                for (unsigned i = 0; i < num_mov_indirects; i++) {
4049                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4050                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4051                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4052                            indirect, brw_imm_ud(read_size_32bit));
4053                }
4054             }
4055          }
4056       }
4057       break;
4058    }
4059
4060    case nir_intrinsic_load_ubo: {
4061       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
4062       fs_reg surf_index;
4063
4064       if (const_index) {
4065          const unsigned index = stage_prog_data->binding_table.ubo_start +
4066                                 const_index->u32[0];
4067          surf_index = brw_imm_ud(index);
4068          brw_mark_surface_used(prog_data, index);
4069       } else {
4070          /* The block index is not a constant. Evaluate the index expression
4071           * per-channel and add the base UBO index; we have to select a value
4072           * from any live channel.
4073           */
4074          surf_index = vgrf(glsl_type::uint_type);
4075          bld.ADD(surf_index, get_nir_src(instr->src[0]),
4076                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4077          surf_index = bld.emit_uniformize(surf_index);
4078
4079          /* Assume this may touch any UBO. It would be nice to provide
4080           * a tighter bound, but the array information is already lowered away.
4081           */
4082          brw_mark_surface_used(prog_data,
4083                                stage_prog_data->binding_table.ubo_start +
4084                                nir->info.num_ubos - 1);
4085       }
4086
4087       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4088       if (const_offset == NULL) {
4089          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4090                                      BRW_REGISTER_TYPE_UD);
4091
4092          for (int i = 0; i < instr->num_components; i++)
4093             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4094                                        base_offset, i * type_sz(dest.type));
4095       } else {
4096          /* Even if we are loading doubles, a pull constant load will load
4097           * a 32-bit vec4, so should only reserve vgrf space for that. If we
4098           * need to load a full dvec4 we will have to emit 2 loads. This is
4099           * similar to demote_pull_constants(), except that in that case we
4100           * see individual accesses to each component of the vector and then
4101           * we let CSE deal with duplicate loads. Here we see a vector access
4102           * and we have to split it if necessary.
4103           */
4104          const unsigned type_size = type_sz(dest.type);
4105
4106          /* See if we've selected this as a push constant candidate */
4107          if (const_index) {
4108             const unsigned ubo_block = const_index->u32[0];
4109             const unsigned offset_256b = const_offset->u32[0] / 32;
4110
4111             fs_reg push_reg;
4112             for (int i = 0; i < 4; i++) {
4113                const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4114                if (range->block == ubo_block &&
4115                    offset_256b >= range->start &&
4116                    offset_256b < range->start + range->length) {
4117
4118                   push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4119                   push_reg.offset = const_offset->u32[0] - 32 * range->start;
4120                   break;
4121                }
4122             }
4123
4124             if (push_reg.file != BAD_FILE) {
4125                for (unsigned i = 0; i < instr->num_components; i++) {
4126                   bld.MOV(offset(dest, bld, i),
4127                           byte_offset(push_reg, i * type_size));
4128                }
4129                break;
4130             }
4131          }
4132
4133          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4134          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4135          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4136
4137          for (unsigned c = 0; c < instr->num_components;) {
4138             const unsigned base = const_offset->u32[0] + c * type_size;
4139             /* Number of usable components in the next block-aligned load. */
4140             const unsigned count = MIN2(instr->num_components - c,
4141                                         (block_sz - base % block_sz) / type_size);
4142
4143             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4144                       packed_consts, surf_index,
4145                       brw_imm_ud(base & ~(block_sz - 1)));
4146
4147             const fs_reg consts =
4148                retype(byte_offset(packed_consts, base & (block_sz - 1)),
4149                       dest.type);
4150
4151             for (unsigned d = 0; d < count; d++)
4152                bld.MOV(offset(dest, bld, c + d), component(consts, d));
4153
4154             c += count;
4155          }
4156       }
4157       break;
4158    }
4159
4160    case nir_intrinsic_load_ssbo: {
4161       assert(devinfo->gen >= 7);
4162
4163       nir_const_value *const_uniform_block =
4164          nir_src_as_const_value(instr->src[0]);
4165
4166       fs_reg surf_index;
4167       if (const_uniform_block) {
4168          unsigned index = stage_prog_data->binding_table.ssbo_start +
4169                           const_uniform_block->u32[0];
4170          surf_index = brw_imm_ud(index);
4171          brw_mark_surface_used(prog_data, index);
4172       } else {
4173          surf_index = vgrf(glsl_type::uint_type);
4174          bld.ADD(surf_index, get_nir_src(instr->src[0]),
4175                  brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4176
4177          /* Assume this may touch any SSBO. It would be nice to provide
4178           * a tighter bound, but the array information is already lowered away.
4179           */
4180          brw_mark_surface_used(prog_data,
4181                                stage_prog_data->binding_table.ssbo_start +
4182                                nir->info.num_ssbos - 1);
4183       }
4184
4185       fs_reg offset_reg;
4186       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4187       if (const_offset) {
4188          offset_reg = brw_imm_ud(const_offset->u32[0]);
4189       } else {
4190          offset_reg = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD);
4191       }
4192
4193       /* Read the vector */
4194       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
4195                              instr->num_components);
4196
4197       break;
4198    }
4199
4200    case nir_intrinsic_store_ssbo: {
4201       assert(devinfo->gen >= 7);
4202
4203       if (stage == MESA_SHADER_FRAGMENT)
4204          brw_wm_prog_data(prog_data)->has_side_effects = true;
4205
4206       /* Block index */
4207       fs_reg surf_index;
4208       nir_const_value *const_uniform_block =
4209          nir_src_as_const_value(instr->src[1]);
4210       if (const_uniform_block) {
4211          unsigned index = stage_prog_data->binding_table.ssbo_start +
4212                           const_uniform_block->u32[0];
4213          surf_index = brw_imm_ud(index);
4214          brw_mark_surface_used(prog_data, index);
4215       } else {
4216          surf_index = vgrf(glsl_type::uint_type);
4217          bld.ADD(surf_index, get_nir_src(instr->src[1]),
4218                   brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4219
4220          brw_mark_surface_used(prog_data,
4221                                stage_prog_data->binding_table.ssbo_start +
4222                                nir->info.num_ssbos - 1);
4223       }
4224
4225       /* Value */
4226       fs_reg val_reg = get_nir_src(instr->src[0]);
4227
4228       /* Writemask */
4229       unsigned writemask = instr->const_index[0];
4230
4231       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
4232        * since the untyped writes below operate in units of 32-bits, which
4233        * means that we need to write twice as many components each time.
4234        * Also, we have to shuffle 64-bit data to be in the appropriate layout
4235        * expected by our 32-bit write messages.
4236        */
4237       unsigned bit_size = nir_src_bit_size(instr->src[0]);
4238       unsigned type_size = bit_size / 8;
4239
4240       /* Combine groups of consecutive enabled channels in one write
4241        * message. We use ffs to find the first enabled channel and then ffs on
4242        * the bit-inverse, down-shifted writemask to determine the num_components
4243        * of the block of enabled bits.
4244        */
4245       while (writemask) {
4246          unsigned first_component = ffs(writemask) - 1;
4247          unsigned num_components = ffs(~(writemask >> first_component)) - 1;
4248          fs_reg write_src = offset(val_reg, bld, first_component);
4249
4250          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
4251
4252          if (type_size > 4) {
4253             /* We can't write more than 2 64-bit components at once. Limit
4254              * the num_components of the write to what we can do and let the next
4255              * iteration handle the rest.
4256              */
4257             num_components = MIN2(2, num_components);
4258             write_src = shuffle_for_32bit_write(bld, write_src, 0,
4259                                                 num_components);
4260          } else if (type_size < 4) {
4261             /* For 16-bit types we pack two consecutive values into a 32-bit
4262              * word and use an untyped write message. For single values, or for
4263              * offsets that are not 32-bit aligned, we need byte-scattered
4264              * writes because untyped writes work on 32-bit components with
4265              * 32-bit alignment. byte_scattered_write messages only support one
4266              * 16-bit component at a time. As VK_KHR_relaxed_block_layout could
4267              * be enabled, we cannot guarantee that non-constant offsets are
4268              * 32-bit aligned for 16-bit types, e.g. an array of 16-bit vec3
4269              * with an array element stride of 6.
4270              *
4271              * In the case of 32-bit aligned constant offsets if there is
4272              * a 3-components vector we submit one untyped-write message
4273              * of 32-bit (first two components), and one byte-scattered
4274              * write message (the last component).
4275              */
4276
4277             if ( !const_offset || ((const_offset->u32[0] +
4278                                    type_size * first_component) % 4)) {
4279                /* If we use a .yz writemask we also need to emit 2
4280                 * byte-scattered write messages because of y-component not
4281                 * being aligned to 32-bit.
4282                 */
4283                num_components = 1;
4284             } else if (num_components * type_size > 4 &&
4285                        (num_components * type_size % 4)) {
4286                /* If the size of the pending components is not a multiple of
4287                 * 4 bytes, we leave the unaligned components for later emits
4288                 * of length == 1 with byte_scattered_write.
4289                 */
4290                num_components -= (num_components * type_size % 4) / type_size;
4291             } else if (num_components * type_size < 4) {
4292                num_components = 1;
4293             }
4294             /* For num_components == 1 we are also shuffling the component
4295              * because byte scattered writes of 16-bit need values to be dword
4296              * aligned. Shuffling only one component would be the same as
4297              * striding it.
4298              */
4299             write_src = shuffle_for_32bit_write(bld, write_src, 0,
4300                                                 num_components);
4301          }
4302
4303          fs_reg offset_reg;
4304
4305          if (const_offset) {
4306             offset_reg = brw_imm_ud(const_offset->u32[0] +
4307                                     type_size * first_component);
4308          } else {
4309             offset_reg = vgrf(glsl_type::uint_type);
4310             bld.ADD(offset_reg,
4311                     retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
4312                     brw_imm_ud(type_size * first_component));
4313          }
4314
4315          if (type_size < 4 && num_components == 1) {
4316             /* Untyped Surface messages have a fixed 32-bit size, so we need
4317              * to rely on byte scattered in order to write 16-bit elements.
4318              * The byte_scattered_write message needs every written 16-bit
4319              * value to be 32-bit aligned (stride=2).
4320              */
4321             emit_byte_scattered_write(bld, surf_index, offset_reg,
4322                                       write_src,
4323                                       1 /* dims */,
4324                                       bit_size,
4325                                       BRW_PREDICATE_NONE);
4326          } else {
4327             assert(num_components * type_size <= 16);
4328             assert((num_components * type_size) % 4 == 0);
4329             assert(offset_reg.file != BRW_IMMEDIATE_VALUE ||
4330                    offset_reg.ud % 4 == 0);
4331             unsigned num_slots = (num_components * type_size) / 4;
4332
4333             emit_untyped_write(bld, surf_index, offset_reg,
4334                                write_src,
4335                                1 /* dims */, num_slots,
4336                                BRW_PREDICATE_NONE);
4337          }
4338
4339          /* Clear the bits in the writemask that we just wrote, then try
4340           * again to see if more channels are left.
4341           */
4342          writemask &= (15 << (first_component + num_components));
4343       }
4344       break;
4345    }
4346
4347    case nir_intrinsic_store_output: {
4348       fs_reg src = get_nir_src(instr->src[0]);
4349
4350       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4351       assert(const_offset && "Indirect output stores not allowed");
4352
4353       unsigned num_components = instr->num_components;
4354       unsigned first_component = nir_intrinsic_component(instr);
4355       if (nir_src_bit_size(instr->src[0]) == 64) {
4356          src = shuffle_for_32bit_write(bld, src, 0, num_components);
4357          num_components *= 2;
4358       }
4359
4360       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4361                                       4 * const_offset->u32[0]), src.type);
4362       for (unsigned j = 0; j < num_components; j++) {
4363          bld.MOV(offset(new_dest, bld, j + first_component),
4364                  offset(src, bld, j));
4365       }
4366       break;
4367    }
4368
4369    case nir_intrinsic_ssbo_atomic_add:
4370       nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
4371       break;
4372    case nir_intrinsic_ssbo_atomic_imin:
4373       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4374       break;
4375    case nir_intrinsic_ssbo_atomic_umin:
4376       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4377       break;
4378    case nir_intrinsic_ssbo_atomic_imax:
4379       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4380       break;
4381    case nir_intrinsic_ssbo_atomic_umax:
4382       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4383       break;
4384    case nir_intrinsic_ssbo_atomic_and:
4385       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4386       break;
4387    case nir_intrinsic_ssbo_atomic_or:
4388       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4389       break;
4390    case nir_intrinsic_ssbo_atomic_xor:
4391       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4392       break;
4393    case nir_intrinsic_ssbo_atomic_exchange:
4394       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4395       break;
4396    case nir_intrinsic_ssbo_atomic_comp_swap:
4397       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4398       break;
4399    case nir_intrinsic_ssbo_atomic_fmin:
4400       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
4401       break;
4402    case nir_intrinsic_ssbo_atomic_fmax:
4403       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
4404       break;
4405    case nir_intrinsic_ssbo_atomic_fcomp_swap:
4406       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4407       break;
4408
4409    case nir_intrinsic_get_buffer_size: {
4410       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
4411       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
4412
4413       /* A resinfo's sampler message is used to get the buffer size.  The
4414        * SIMD8's writeback message consists of four registers and SIMD16's
4415        * writeback message consists of 8 destination registers (two per each
4416        * component).  Because we are only interested in the first channel of
4417        * the first returned component, where resinfo returns the buffer size
4418        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4419        * the dispatch width.
4420        */
4421       const fs_builder ubld = bld.exec_all().group(8, 0);
4422       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4423       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4424
4425       /* Set LOD = 0 */
4426       ubld.MOV(src_payload, brw_imm_d(0));
4427
4428       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4429       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4430                                 src_payload, brw_imm_ud(index));
4431       inst->header_size = 0;
4432       inst->mlen = 1;
4433       inst->size_written = 4 * REG_SIZE;
4434
4435       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4436        *
4437        * "Out-of-bounds checking is always performed at a DWord granularity. If
4438        * any part of the DWord is out-of-bounds then the whole DWord is
4439        * considered out-of-bounds."
4440        *
4441        * This implies that types with size smaller than 4-bytes need to be
4442        * padded if they don't complete the last dword of the buffer. But as we
4443        * need to maintain the original size we need to reverse the padding
4444        * calculation to return the correct size to know the number of elements
4445        * of an unsized array. As we stored in the last two bits of the surface
4446        * size the needed padding for the buffer, we calculate here the
4447        * original buffer_size reversing the surface_size calculation:
4448        *
4449        * surface_size = isl_align(buffer_size, 4) +
4450        *                (isl_align(buffer_size, 4) - buffer_size)
4451        *
4452        * buffer_size = (surface_size & ~3) - (surface_size & 3)
4453        */
4454
4455       fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4456       fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4457       fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4458
4459       ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
4460       ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
4461       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
4462
4463       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
4464
4465       brw_mark_surface_used(prog_data, index);
4466       break;
4467    }
4468
4469    case nir_intrinsic_load_subgroup_invocation:
4470       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4471               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4472       break;
4473
4474    case nir_intrinsic_load_subgroup_eq_mask:
4475    case nir_intrinsic_load_subgroup_ge_mask:
4476    case nir_intrinsic_load_subgroup_gt_mask:
4477    case nir_intrinsic_load_subgroup_le_mask:
4478    case nir_intrinsic_load_subgroup_lt_mask:
4479       unreachable("not reached");
4480
4481    case nir_intrinsic_vote_any: {
4482       const fs_builder ubld = bld.exec_all().group(1, 0);
4483
4484       /* The any/all predicates do not consider channel enables. To prevent
4485        * dead channels from affecting the result, we initialize the flag with
4486        * the identity value for the logical operation.
4487        */
4488       if (dispatch_width == 32) {
4489          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4490          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4491                          brw_imm_ud(0));
4492       } else {
4493          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
4494       }
4495       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4496
4497       /* For some reason, the any/all predicates don't work properly with
4498        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4499        * doesn't read the correct subset of the flag register and you end up
4500        * getting garbage in the second half.  Work around this by using a pair
4501        * of 1-wide MOVs and scattering the result.
4502        */
4503       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4504       ubld.MOV(res1, brw_imm_d(0));
4505       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
4506                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4507                                            BRW_PREDICATE_ALIGN1_ANY32H,
4508                     ubld.MOV(res1, brw_imm_d(-1)));
4509
4510       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4511       break;
4512    }
4513    case nir_intrinsic_vote_all: {
4514       const fs_builder ubld = bld.exec_all().group(1, 0);
4515
4516       /* The any/all predicates do not consider channel enables. To prevent
4517        * dead channels from affecting the result, we initialize the flag with
4518        * the identity value for the logical operation.
4519        */
4520       if (dispatch_width == 32) {
4521          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4522          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4523                          brw_imm_ud(0xffffffff));
4524       } else {
4525          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4526       }
4527       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4528
4529       /* For some reason, the any/all predicates don't work properly with
4530        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4531        * doesn't read the correct subset of the flag register and you end up
4532        * getting garbage in the second half.  Work around this by using a pair
4533        * of 1-wide MOVs and scattering the result.
4534        */
4535       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4536       ubld.MOV(res1, brw_imm_d(0));
4537       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4538                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4539                                            BRW_PREDICATE_ALIGN1_ALL32H,
4540                     ubld.MOV(res1, brw_imm_d(-1)));
4541
4542       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4543       break;
4544    }
4545    case nir_intrinsic_vote_feq:
4546    case nir_intrinsic_vote_ieq: {
4547       fs_reg value = get_nir_src(instr->src[0]);
4548       if (instr->intrinsic == nir_intrinsic_vote_feq) {
4549          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4550          value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
4551       }
4552
4553       fs_reg uniformized = bld.emit_uniformize(value);
4554       const fs_builder ubld = bld.exec_all().group(1, 0);
4555
4556       /* The any/all predicates do not consider channel enables. To prevent
4557        * dead channels from affecting the result, we initialize the flag with
4558        * the identity value for the logical operation.
4559        */
4560       if (dispatch_width == 32) {
4561          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4562          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4563                          brw_imm_ud(0xffffffff));
4564       } else {
4565          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4566       }
4567       bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
4568
4569       /* For some reason, the any/all predicates don't work properly with
4570        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4571        * doesn't read the correct subset of the flag register and you end up
4572        * getting garbage in the second half.  Work around this by using a pair
4573        * of 1-wide MOVs and scattering the result.
4574        */
4575       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4576       ubld.MOV(res1, brw_imm_d(0));
4577       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4578                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4579                                            BRW_PREDICATE_ALIGN1_ALL32H,
4580                     ubld.MOV(res1, brw_imm_d(-1)));
4581
4582       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4583       break;
4584    }
4585
4586    case nir_intrinsic_ballot: {
4587       const fs_reg value = retype(get_nir_src(instr->src[0]),
4588                                   BRW_REGISTER_TYPE_UD);
4589       struct brw_reg flag = brw_flag_reg(0, 0);
4590       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
4591        * as f0.0.  This is a problem for fragment programs as we currently use
4592        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
4593        * programs yet so this isn't a problem.  When we do, something will
4594        * have to change.
4595        */
4596       if (dispatch_width == 32)
4597          flag.type = BRW_REGISTER_TYPE_UD;
4598
4599       bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
4600       bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
4601
4602       if (instr->dest.ssa.bit_size > 32) {
4603          dest.type = BRW_REGISTER_TYPE_UQ;
4604       } else {
4605          dest.type = BRW_REGISTER_TYPE_UD;
4606       }
4607       bld.MOV(dest, flag);
4608       break;
4609    }
4610
4611    case nir_intrinsic_read_invocation: {
4612       const fs_reg value = get_nir_src(instr->src[0]);
4613       const fs_reg invocation = get_nir_src(instr->src[1]);
4614       fs_reg tmp = bld.vgrf(value.type);
4615
4616       bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
4617                           bld.emit_uniformize(invocation));
4618
4619       bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
4620       break;
4621    }
4622
4623    case nir_intrinsic_read_first_invocation: {
4624       const fs_reg value = get_nir_src(instr->src[0]);
4625       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
4626       break;
4627    }
4628
4629    case nir_intrinsic_shuffle: {
4630       const fs_reg value = get_nir_src(instr->src[0]);
4631       const fs_reg index = get_nir_src(instr->src[1]);
4632
4633       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
4634       break;
4635    }
4636
4637    case nir_intrinsic_first_invocation: {
4638       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4639       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
4640       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
4641               fs_reg(component(tmp, 0)));
4642       break;
4643    }
4644
4645    case nir_intrinsic_quad_broadcast: {
4646       const fs_reg value = get_nir_src(instr->src[0]);
4647       nir_const_value *index = nir_src_as_const_value(instr->src[1]);
4648       assert(nir_src_bit_size(instr->src[1]) == 32);
4649
4650       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
4651                value, brw_imm_ud(index->u32[0]), brw_imm_ud(4));
4652       break;
4653    }
4654
4655    case nir_intrinsic_quad_swap_horizontal: {
4656       const fs_reg value = get_nir_src(instr->src[0]);
4657       const fs_reg tmp = bld.vgrf(value.type);
4658       const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
4659
4660       const fs_reg src_left = horiz_stride(value, 2);
4661       const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
4662       const fs_reg tmp_left = horiz_stride(tmp, 2);
4663       const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
4664
4665       /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
4666        *
4667        *    "When source or destination datatype is 64b or operation is
4668        *    integer DWord multiply, regioning in Align1 must follow
4669        *    these rules:
4670        *
4671        *    [...]
4672        *
4673        *    3. Source and Destination offset must be the same, except
4674        *       the case of scalar source."
4675        *
4676        * In order to work around this, we have to emit two 32-bit MOVs instead
4677        * of a single 64-bit MOV to do the shuffle.
4678        */
4679       if (type_sz(value.type) > 4 &&
4680           (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
4681          ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0),
4682                   subscript(src_right, BRW_REGISTER_TYPE_D, 0));
4683          ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1),
4684                   subscript(src_right, BRW_REGISTER_TYPE_D, 1));
4685          ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0),
4686                   subscript(src_left, BRW_REGISTER_TYPE_D, 0));
4687          ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1),
4688                   subscript(src_left, BRW_REGISTER_TYPE_D, 1));
4689       } else {
4690          ubld.MOV(tmp_left, src_right);
4691          ubld.MOV(tmp_right, src_left);
4692       }
4693       bld.MOV(retype(dest, value.type), tmp);
4694       break;
4695    }
4696
4697    case nir_intrinsic_quad_swap_vertical: {
4698       const fs_reg value = get_nir_src(instr->src[0]);
4699       if (nir_src_bit_size(instr->src[0]) == 32) {
4700          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4701          const fs_reg tmp = bld.vgrf(value.type);
4702          const fs_builder ubld = bld.exec_all();
4703          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4704                    brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
4705          bld.MOV(retype(dest, value.type), tmp);
4706       } else {
4707          /* For larger data types, we have to either emit dispatch_width many
4708           * MOVs or else fall back to doing indirects.
4709           */
4710          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4711          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4712                       brw_imm_w(0x2));
4713          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4714       }
4715       break;
4716    }
4717
4718    case nir_intrinsic_quad_swap_diagonal: {
4719       const fs_reg value = get_nir_src(instr->src[0]);
4720       if (nir_src_bit_size(instr->src[0]) == 32) {
4721          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4722          const fs_reg tmp = bld.vgrf(value.type);
4723          const fs_builder ubld = bld.exec_all();
4724          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4725                    brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
4726          bld.MOV(retype(dest, value.type), tmp);
4727       } else {
4728          /* For larger data types, we have to either emit dispatch_width many
4729           * MOVs or else fall back to doing indirects.
4730           */
4731          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4732          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4733                       brw_imm_w(0x3));
4734          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4735       }
4736       break;
4737    }
4738
4739    case nir_intrinsic_reduce: {
4740       fs_reg src = get_nir_src(instr->src[0]);
4741       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4742       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
4743       if (cluster_size == 0 || cluster_size > dispatch_width)
4744          cluster_size = dispatch_width;
4745
4746       /* Figure out the source type */
4747       src.type = brw_type_for_nir_type(devinfo,
4748          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4749                         nir_src_bit_size(instr->src[0])));
4750
4751       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4752       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4753       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4754
4755       /* Set up a register for all of our scratching around and initialize it
4756        * to reduction operation's identity value.
4757        */
4758       fs_reg scan = bld.vgrf(src.type);
4759       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4760
4761       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
4762
4763       dest.type = src.type;
4764       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
4765          /* In this case, CLUSTER_BROADCAST instruction isn't needed because
4766           * the distance between clusters is at least 2 GRFs.  In this case,
4767           * we don't need the weird striding of the CLUSTER_BROADCAST
4768           * instruction and can just do regular MOVs.
4769           */
4770          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
4771          const unsigned groups =
4772             (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
4773          const unsigned group_size = dispatch_width / groups;
4774          for (unsigned i = 0; i < groups; i++) {
4775             const unsigned cluster = (i * group_size) / cluster_size;
4776             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
4777             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
4778                                          component(scan, comp));
4779          }
4780       } else {
4781          bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
4782                   brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
4783       }
4784       break;
4785    }
4786
4787    case nir_intrinsic_inclusive_scan:
4788    case nir_intrinsic_exclusive_scan: {
4789       fs_reg src = get_nir_src(instr->src[0]);
4790       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4791
4792       /* Figure out the source type */
4793       src.type = brw_type_for_nir_type(devinfo,
4794          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4795                         nir_src_bit_size(instr->src[0])));
4796
4797       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4798       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4799       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4800
4801       /* Set up a register for all of our scratching around and initialize it
4802        * to reduction operation's identity value.
4803        */
4804       fs_reg scan = bld.vgrf(src.type);
4805       const fs_builder allbld = bld.exec_all();
4806       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4807
4808       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
4809          /* Exclusive scan is a bit harder because we have to do an annoying
4810           * shift of the contents before we can begin.  To make things worse,
4811           * we can't do this with a normal stride; we have to use indirects.
4812           */
4813          fs_reg shifted = bld.vgrf(src.type);
4814          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4815          allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4816                          brw_imm_w(-1));
4817          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
4818          allbld.group(1, 0).MOV(component(shifted, 0), identity);
4819          scan = shifted;
4820       }
4821
4822       bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
4823
4824       bld.MOV(retype(dest, src.type), scan);
4825       break;
4826    }
4827
4828    case nir_intrinsic_begin_fragment_shader_ordering:
4829    case nir_intrinsic_begin_invocation_interlock: {
4830       const fs_builder ubld = bld.group(8, 0);
4831       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4832
4833       ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
4834          REG_SIZE;
4835
4836       break;
4837    }
4838
4839    case nir_intrinsic_end_invocation_interlock: {
4840       /* We don't need to do anything here */
4841       break;
4842    }
4843
4844    default:
4845       unreachable("unknown intrinsic");
4846    }
4847 }
4848
4849 void
4850 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4851                                  int op, nir_intrinsic_instr *instr)
4852 {
4853    if (stage == MESA_SHADER_FRAGMENT)
4854       brw_wm_prog_data(prog_data)->has_side_effects = true;
4855
4856    fs_reg dest;
4857    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4858       dest = get_nir_dest(instr->dest);
4859
4860    fs_reg surface;
4861    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
4862    if (const_surface) {
4863       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
4864                             const_surface->u32[0];
4865       surface = brw_imm_ud(surf_index);
4866       brw_mark_surface_used(prog_data, surf_index);
4867    } else {
4868       surface = vgrf(glsl_type::uint_type);
4869       bld.ADD(surface, get_nir_src(instr->src[0]),
4870               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4871
4872       /* Assume this may touch any SSBO. This is the same we do for other
4873        * UBO/SSBO accesses with non-constant surface.
4874        */
4875       brw_mark_surface_used(prog_data,
4876                             stage_prog_data->binding_table.ssbo_start +
4877                             nir->info.num_ssbos - 1);
4878    }
4879
4880    fs_reg offset = get_nir_src(instr->src[1]);
4881    fs_reg data1;
4882    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4883       data1 = get_nir_src(instr->src[2]);
4884    fs_reg data2;
4885    if (op == BRW_AOP_CMPWR)
4886       data2 = get_nir_src(instr->src[3]);
4887
4888    /* Emit the actual atomic operation */
4889
4890    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4891                                               data1, data2,
4892                                               1 /* dims */, 1 /* rsize */,
4893                                               op,
4894                                               BRW_PREDICATE_NONE);
4895    dest.type = atomic_result.type;
4896    bld.MOV(dest, atomic_result);
4897 }
4898
4899 void
4900 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
4901                                        int op, nir_intrinsic_instr *instr)
4902 {
4903    if (stage == MESA_SHADER_FRAGMENT)
4904       brw_wm_prog_data(prog_data)->has_side_effects = true;
4905
4906    fs_reg dest;
4907    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4908       dest = get_nir_dest(instr->dest);
4909
4910    fs_reg surface;
4911    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
4912    if (const_surface) {
4913       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
4914                             const_surface->u32[0];
4915       surface = brw_imm_ud(surf_index);
4916       brw_mark_surface_used(prog_data, surf_index);
4917    } else {
4918       surface = vgrf(glsl_type::uint_type);
4919       bld.ADD(surface, get_nir_src(instr->src[0]),
4920               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4921
4922       /* Assume this may touch any SSBO. This is the same we do for other
4923        * UBO/SSBO accesses with non-constant surface.
4924        */
4925       brw_mark_surface_used(prog_data,
4926                             stage_prog_data->binding_table.ssbo_start +
4927                             nir->info.num_ssbos - 1);
4928    }
4929
4930    fs_reg offset = get_nir_src(instr->src[1]);
4931    fs_reg data1 = get_nir_src(instr->src[2]);
4932    fs_reg data2;
4933    if (op == BRW_AOP_FCMPWR)
4934       data2 = get_nir_src(instr->src[3]);
4935
4936    /* Emit the actual atomic operation */
4937
4938    fs_reg atomic_result = emit_untyped_atomic_float(bld, surface, offset,
4939                                                     data1, data2,
4940                                                     1 /* dims */, 1 /* rsize */,
4941                                                     op,
4942                                                     BRW_PREDICATE_NONE);
4943    dest.type = atomic_result.type;
4944    bld.MOV(dest, atomic_result);
4945 }
4946
4947 void
4948 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4949                                    int op, nir_intrinsic_instr *instr)
4950 {
4951    fs_reg dest;
4952    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4953       dest = get_nir_dest(instr->dest);
4954
4955    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4956    fs_reg offset;
4957    fs_reg data1;
4958    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4959       data1 = get_nir_src(instr->src[1]);
4960    fs_reg data2;
4961    if (op == BRW_AOP_CMPWR)
4962       data2 = get_nir_src(instr->src[2]);
4963
4964    /* Get the offset */
4965    nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4966    if (const_offset) {
4967       offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
4968    } else {
4969       offset = vgrf(glsl_type::uint_type);
4970       bld.ADD(offset,
4971               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4972               brw_imm_ud(instr->const_index[0]));
4973    }
4974
4975    /* Emit the actual atomic operation operation */
4976
4977    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4978                                               data1, data2,
4979                                               1 /* dims */, 1 /* rsize */,
4980                                               op,
4981                                               BRW_PREDICATE_NONE);
4982    dest.type = atomic_result.type;
4983    bld.MOV(dest, atomic_result);
4984 }
4985
4986 void
4987 fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
4988                                          int op, nir_intrinsic_instr *instr)
4989 {
4990    fs_reg dest;
4991    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4992       dest = get_nir_dest(instr->dest);
4993
4994    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4995    fs_reg offset;
4996    fs_reg data1 = get_nir_src(instr->src[1]);
4997    fs_reg data2;
4998    if (op == BRW_AOP_FCMPWR)
4999       data2 = get_nir_src(instr->src[2]);
5000
5001    /* Get the offset */
5002    nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
5003    if (const_offset) {
5004       offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
5005    } else {
5006       offset = vgrf(glsl_type::uint_type);
5007       bld.ADD(offset,
5008               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5009               brw_imm_ud(instr->const_index[0]));
5010    }
5011
5012    /* Emit the actual atomic operation operation */
5013
5014    fs_reg atomic_result = emit_untyped_atomic_float(bld, surface, offset,
5015                                                     data1, data2,
5016                                                     1 /* dims */, 1 /* rsize */,
5017                                                     op,
5018                                                     BRW_PREDICATE_NONE);
5019    dest.type = atomic_result.type;
5020    bld.MOV(dest, atomic_result);
5021 }
5022
5023 void
5024 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
5025 {
5026    unsigned texture = instr->texture_index;
5027    unsigned sampler = instr->sampler_index;
5028
5029    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
5030
5031    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
5032    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
5033
5034    int lod_components = 0;
5035
5036    /* The hardware requires a LOD for buffer textures */
5037    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
5038       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
5039
5040    uint32_t header_bits = 0;
5041    for (unsigned i = 0; i < instr->num_srcs; i++) {
5042       fs_reg src = get_nir_src(instr->src[i].src);
5043       switch (instr->src[i].src_type) {
5044       case nir_tex_src_bias:
5045          srcs[TEX_LOGICAL_SRC_LOD] =
5046             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5047          break;
5048       case nir_tex_src_comparator:
5049          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
5050          break;
5051       case nir_tex_src_coord:
5052          switch (instr->op) {
5053          case nir_texop_txf:
5054          case nir_texop_txf_ms:
5055          case nir_texop_txf_ms_mcs:
5056          case nir_texop_samples_identical:
5057             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
5058             break;
5059          default:
5060             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
5061             break;
5062          }
5063          break;
5064       case nir_tex_src_ddx:
5065          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
5066          lod_components = nir_tex_instr_src_size(instr, i);
5067          break;
5068       case nir_tex_src_ddy:
5069          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
5070          break;
5071       case nir_tex_src_lod:
5072          switch (instr->op) {
5073          case nir_texop_txs:
5074             srcs[TEX_LOGICAL_SRC_LOD] =
5075                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
5076             break;
5077          case nir_texop_txf:
5078             srcs[TEX_LOGICAL_SRC_LOD] =
5079                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
5080             break;
5081          default:
5082             srcs[TEX_LOGICAL_SRC_LOD] =
5083                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5084             break;
5085          }
5086          break;
5087       case nir_tex_src_ms_index:
5088          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
5089          break;
5090
5091       case nir_tex_src_offset: {
5092          nir_const_value *const_offset =
5093             nir_src_as_const_value(instr->src[i].src);
5094          unsigned offset_bits = 0;
5095          if (const_offset &&
5096              brw_texture_offset(const_offset->i32,
5097                                 nir_tex_instr_src_size(instr, i),
5098                                 &offset_bits)) {
5099             header_bits |= offset_bits;
5100          } else {
5101             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
5102                retype(src, BRW_REGISTER_TYPE_D);
5103          }
5104          break;
5105       }
5106
5107       case nir_tex_src_projector:
5108          unreachable("should be lowered");
5109
5110       case nir_tex_src_texture_offset: {
5111          /* Figure out the highest possible texture index and mark it as used */
5112          uint32_t max_used = texture + instr->texture_array_size - 1;
5113          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
5114             max_used += stage_prog_data->binding_table.gather_texture_start;
5115          } else {
5116             max_used += stage_prog_data->binding_table.texture_start;
5117          }
5118          brw_mark_surface_used(prog_data, max_used);
5119
5120          /* Emit code to evaluate the actual indexing expression */
5121          fs_reg tmp = vgrf(glsl_type::uint_type);
5122          bld.ADD(tmp, src, brw_imm_ud(texture));
5123          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
5124          break;
5125       }
5126
5127       case nir_tex_src_sampler_offset: {
5128          /* Emit code to evaluate the actual indexing expression */
5129          fs_reg tmp = vgrf(glsl_type::uint_type);
5130          bld.ADD(tmp, src, brw_imm_ud(sampler));
5131          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
5132          break;
5133       }
5134
5135       case nir_tex_src_ms_mcs:
5136          assert(instr->op == nir_texop_txf_ms);
5137          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
5138          break;
5139
5140       case nir_tex_src_plane: {
5141          nir_const_value *const_plane =
5142             nir_src_as_const_value(instr->src[i].src);
5143          const uint32_t plane = const_plane->u32[0];
5144          const uint32_t texture_index =
5145             instr->texture_index +
5146             stage_prog_data->binding_table.plane_start[plane] -
5147             stage_prog_data->binding_table.texture_start;
5148
5149          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
5150          break;
5151       }
5152
5153       default:
5154          unreachable("unknown texture source");
5155       }
5156    }
5157
5158    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
5159        (instr->op == nir_texop_txf_ms ||
5160         instr->op == nir_texop_samples_identical)) {
5161       if (devinfo->gen >= 7 &&
5162           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
5163          srcs[TEX_LOGICAL_SRC_MCS] =
5164             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
5165                            instr->coord_components,
5166                            srcs[TEX_LOGICAL_SRC_SURFACE]);
5167       } else {
5168          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
5169       }
5170    }
5171
5172    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
5173    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
5174
5175    enum opcode opcode;
5176    switch (instr->op) {
5177    case nir_texop_tex:
5178       opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
5179                 SHADER_OPCODE_TXL_LOGICAL);
5180       break;
5181    case nir_texop_txb:
5182       opcode = FS_OPCODE_TXB_LOGICAL;
5183       break;
5184    case nir_texop_txl:
5185       opcode = SHADER_OPCODE_TXL_LOGICAL;
5186       break;
5187    case nir_texop_txd:
5188       opcode = SHADER_OPCODE_TXD_LOGICAL;
5189       break;
5190    case nir_texop_txf:
5191       opcode = SHADER_OPCODE_TXF_LOGICAL;
5192       break;
5193    case nir_texop_txf_ms:
5194       if ((key_tex->msaa_16 & (1 << sampler)))
5195          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
5196       else
5197          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
5198       break;
5199    case nir_texop_txf_ms_mcs:
5200       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
5201       break;
5202    case nir_texop_query_levels:
5203    case nir_texop_txs:
5204       opcode = SHADER_OPCODE_TXS_LOGICAL;
5205       break;
5206    case nir_texop_lod:
5207       opcode = SHADER_OPCODE_LOD_LOGICAL;
5208       break;
5209    case nir_texop_tg4:
5210       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
5211          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
5212       else
5213          opcode = SHADER_OPCODE_TG4_LOGICAL;
5214       break;
5215    case nir_texop_texture_samples:
5216       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
5217       break;
5218    case nir_texop_samples_identical: {
5219       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
5220
5221       /* If mcs is an immediate value, it means there is no MCS.  In that case
5222        * just return false.
5223        */
5224       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
5225          bld.MOV(dst, brw_imm_ud(0u));
5226       } else if ((key_tex->msaa_16 & (1 << sampler))) {
5227          fs_reg tmp = vgrf(glsl_type::uint_type);
5228          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
5229                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
5230          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
5231       } else {
5232          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
5233                  BRW_CONDITIONAL_EQ);
5234       }
5235       return;
5236    }
5237    default:
5238       unreachable("unknown texture opcode");
5239    }
5240
5241    if (instr->op == nir_texop_tg4) {
5242       if (instr->component == 1 &&
5243           key_tex->gather_channel_quirk_mask & (1 << texture)) {
5244          /* gather4 sampler is broken for green channel on RG32F --
5245           * we must ask for blue instead.
5246           */
5247          header_bits |= 2 << 16;
5248       } else {
5249          header_bits |= instr->component << 16;
5250       }
5251    }
5252
5253    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
5254    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
5255    inst->offset = header_bits;
5256
5257    const unsigned dest_size = nir_tex_instr_dest_size(instr);
5258    if (devinfo->gen >= 9 &&
5259        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
5260       unsigned write_mask = instr->dest.is_ssa ?
5261                             nir_ssa_def_components_read(&instr->dest.ssa):
5262                             (1 << dest_size) - 1;
5263       assert(write_mask != 0); /* dead code should have been eliminated */
5264       inst->size_written = util_last_bit(write_mask) *
5265                            inst->dst.component_size(inst->exec_size);
5266    } else {
5267       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
5268    }
5269
5270    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
5271       inst->shadow_compare = true;
5272
5273    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
5274       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
5275
5276    fs_reg nir_dest[4];
5277    for (unsigned i = 0; i < dest_size; i++)
5278       nir_dest[i] = offset(dst, bld, i);
5279
5280    if (instr->op == nir_texop_query_levels) {
5281       /* # levels is in .w */
5282       nir_dest[0] = offset(dst, bld, 3);
5283    } else if (instr->op == nir_texop_txs &&
5284               dest_size >= 3 && devinfo->gen < 7) {
5285       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
5286       fs_reg depth = offset(dst, bld, 2);
5287       nir_dest[2] = vgrf(glsl_type::int_type);
5288       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
5289    }
5290
5291    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
5292 }
5293
5294 void
5295 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
5296 {
5297    switch (instr->type) {
5298    case nir_jump_break:
5299       bld.emit(BRW_OPCODE_BREAK);
5300       break;
5301    case nir_jump_continue:
5302       bld.emit(BRW_OPCODE_CONTINUE);
5303       break;
5304    case nir_jump_return:
5305    default:
5306       unreachable("unknown jump");
5307    }
5308 }
5309
5310 /*
5311  * This helper takes a source register and un/shuffles it into the destination
5312  * register.
5313  *
5314  * If source type size is smaller than destination type size the operation
5315  * needed is a component shuffle. The opposite case would be an unshuffle. If
5316  * source/destination type size is equal a shuffle is done that would be
5317  * equivalent to a simple MOV.
5318  *
5319  * For example, if source is a 16-bit type and destination is 32-bit. A 3
5320  * components .xyz 16-bit vector on SIMD8 would be.
5321  *
5322  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5323  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5324  *
5325  * This helper will return the following 2 32-bit components with the 16-bit
5326  * values shuffled:
5327  *
5328  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5329  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5330  *
5331  * For unshuffle, the example would be the opposite, a 64-bit type source
5332  * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
5333  * would be:
5334  *
5335  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5336  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5337  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5338  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5339  *
5340  * The returned result would be the following 4 32-bit components unshuffled:
5341  *
5342  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5343  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5344  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5345  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5346  *
5347  * - Source and destination register must not be overlapped.
5348  * - components units are measured in terms of the smaller type between
5349  *   source and destination because we are un/shuffling the smaller
5350  *   components from/into the bigger ones.
5351  * - first_component parameter allows skipping source components.
5352  */
5353 void
5354 shuffle_src_to_dst(const fs_builder &bld,
5355                    const fs_reg &dst,
5356                    const fs_reg &src,
5357                    uint32_t first_component,
5358                    uint32_t components)
5359 {
5360    if (type_sz(src.type) == type_sz(dst.type)) {
5361       assert(!regions_overlap(dst,
5362          type_sz(dst.type) * bld.dispatch_width() * components,
5363          offset(src, bld, first_component),
5364          type_sz(src.type) * bld.dispatch_width() * components));
5365       for (unsigned i = 0; i < components; i++) {
5366          bld.MOV(retype(offset(dst, bld, i), src.type),
5367                  offset(src, bld, i + first_component));
5368       }
5369    } else if (type_sz(src.type) < type_sz(dst.type)) {
5370       /* Source is shuffled into destination */
5371       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
5372       assert(!regions_overlap(dst,
5373          type_sz(dst.type) * bld.dispatch_width() *
5374          DIV_ROUND_UP(components, size_ratio),
5375          offset(src, bld, first_component),
5376          type_sz(src.type) * bld.dispatch_width() * components));
5377
5378       brw_reg_type shuffle_type =
5379          brw_reg_type_from_bit_size(8 * type_sz(src.type),
5380                                     BRW_REGISTER_TYPE_D);
5381       for (unsigned i = 0; i < components; i++) {
5382          fs_reg shuffle_component_i =
5383             subscript(offset(dst, bld, i / size_ratio),
5384                       shuffle_type, i % size_ratio);
5385          bld.MOV(shuffle_component_i,
5386                  retype(offset(src, bld, i + first_component), shuffle_type));
5387       }
5388    } else {
5389       /* Source is unshuffled into destination */
5390       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
5391       assert(!regions_overlap(dst,
5392          type_sz(dst.type) * bld.dispatch_width() * components,
5393          offset(src, bld, first_component / size_ratio),
5394          type_sz(src.type) * bld.dispatch_width() *
5395          DIV_ROUND_UP(components + (first_component % size_ratio),
5396                       size_ratio)));
5397
5398       brw_reg_type shuffle_type =
5399          brw_reg_type_from_bit_size(8 * type_sz(dst.type),
5400                                     BRW_REGISTER_TYPE_D);
5401       for (unsigned i = 0; i < components; i++) {
5402          fs_reg shuffle_component_i =
5403             subscript(offset(src, bld, (first_component + i) / size_ratio),
5404                       shuffle_type, (first_component + i) % size_ratio);
5405          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
5406                  shuffle_component_i);
5407       }
5408    }
5409 }
5410
5411 void
5412 shuffle_from_32bit_read(const fs_builder &bld,
5413                         const fs_reg &dst,
5414                         const fs_reg &src,
5415                         uint32_t first_component,
5416                         uint32_t components)
5417 {
5418    assert(type_sz(src.type) == 4);
5419
5420    /* This function takes components in units of the destination type while
5421     * shuffle_src_to_dst takes components in units of the smallest type
5422     */
5423    if (type_sz(dst.type) > 4) {
5424       assert(type_sz(dst.type) == 8);
5425       first_component *= 2;
5426       components *= 2;
5427    }
5428
5429    shuffle_src_to_dst(bld, dst, src, first_component, components);
5430 }
5431
5432 fs_reg
5433 shuffle_for_32bit_write(const fs_builder &bld,
5434                         const fs_reg &src,
5435                         uint32_t first_component,
5436                         uint32_t components)
5437 {
5438    fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
5439                          DIV_ROUND_UP (components * type_sz(src.type), 4));
5440    /* This function takes components in units of the source type while
5441     * shuffle_src_to_dst takes components in units of the smallest type
5442     */
5443    if (type_sz(src.type) > 4) {
5444       assert(type_sz(src.type) == 8);
5445       first_component *= 2;
5446       components *= 2;
5447    }
5448
5449    shuffle_src_to_dst(bld, dst, src, first_component, components);
5450
5451    return dst;
5452 }
5453
5454 fs_reg
5455 setup_imm_df(const fs_builder &bld, double v)
5456 {
5457    const struct gen_device_info *devinfo = bld.shader->devinfo;
5458    assert(devinfo->gen >= 7);
5459
5460    if (devinfo->gen >= 8)
5461       return brw_imm_df(v);
5462
5463    /* gen7.5 does not support DF immediates straighforward but the DIM
5464     * instruction allows to set the 64-bit immediate value.
5465     */
5466    if (devinfo->is_haswell) {
5467       const fs_builder ubld = bld.exec_all().group(1, 0);
5468       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5469       ubld.DIM(dst, brw_imm_df(v));
5470       return component(dst, 0);
5471    }
5472
5473    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5474     * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
5475     * the high 32-bit to suboffset 4 and then applying a stride of 0.
5476     *
5477     * Alternatively, we could also produce a normal VGRF (without stride 0)
5478     * by writing to all the channels in the VGRF, however, that would hit the
5479     * gen7 bug where we have to split writes that span more than 1 register
5480     * into instructions with a width of 4 (otherwise the write to the second
5481     * register written runs into an execmask hardware bug) which isn't very
5482     * nice.
5483     */
5484    union {
5485       double d;
5486       struct {
5487          uint32_t i1;
5488          uint32_t i2;
5489       };
5490    } di;
5491
5492    di.d = v;
5493
5494    const fs_builder ubld = bld.exec_all().group(1, 0);
5495    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5496    ubld.MOV(tmp, brw_imm_ud(di.i1));
5497    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5498
5499    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5500 }
5501
5502 fs_reg
5503 setup_imm_b(const fs_builder &bld, int8_t v)
5504 {
5505    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5506    bld.MOV(tmp, brw_imm_w(v));
5507    return tmp;
5508 }
5509
5510 fs_reg
5511 setup_imm_ub(const fs_builder &bld, uint8_t v)
5512 {
5513    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5514    bld.MOV(tmp, brw_imm_uw(v));
5515    return tmp;
5516 }