1 /* Copyright © 2011 Intel Corporation
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice (including the next
11 * paragraph) shall be included in all copies or substantial portions of the
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29 #include "main/macros.h"
30 #include "program/prog_print.h"
31 #include "program/prog_parameter.h"
/* Convert this instruction's IR destination (dst_reg) into a hardware
 * brw_reg for code generation.  The branches below appear to correspond
 * to the destination register file (GRF / MRF / fixed HW reg / null);
 * the switch statement, case labels, and break/return lines are not
 * visible in this fragment — TODO confirm against the full file.
 */
vec4_instruction::get_dst(void)
   struct brw_reg brw_reg;

   /* GRF destination: one vec8 register, retyped to the IR type, with
    * the IR writemask copied into the hardware register descriptor. */
   brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
   brw_reg = retype(brw_reg, dst.type);
   brw_reg.dw1.bits.writemask = dst.writemask;

   /* Message-register (MRF) destination: same retype + writemask
    * treatment, but built as a message register. */
   brw_reg = brw_message_reg(dst.reg + dst.reg_offset);
   brw_reg = retype(brw_reg, dst.type);
   brw_reg.dw1.bits.writemask = dst.writemask;

   /* Fixed hardware register: must already carry the correct type. */
   assert(dst.type == dst.fixed_hw_reg.type);
   brw_reg = dst.fixed_hw_reg;

   /* Null destination. */
   brw_reg = brw_null_reg();

   unreachable("not reached");
/* Convert source operand i of this instruction into a hardware brw_reg.
 * Branches are annotated by their apparent register file; several case
 * labels and break statements are not visible in this fragment — TODO
 * confirm against the full file.
 */
vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i)
   struct brw_reg brw_reg;

   switch (src[i].file) {
      /* GRF source: vec8 register, retyped, with the IR swizzle and the
       * abs/negate source modifiers applied. */
      brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
      brw_reg = retype(brw_reg, src[i].type);
      brw_reg.dw1.bits.swizzle = src[i].swizzle;
      brw_reg = brw_abs(brw_reg);
      brw_reg = negate(brw_reg);

      /* Immediate source: select the immediate constructor by type. */
      switch (src[i].type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(src[i].fixed_hw_reg.dw1.f);
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(src[i].fixed_hw_reg.dw1.d);
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(src[i].fixed_hw_reg.dw1.ud);
      case BRW_REGISTER_TYPE_VF:
         brw_reg = brw_imm_vf(src[i].fixed_hw_reg.dw1.ud);
         unreachable("not reached");

      /* Uniform (push constant) source: the /2 and %2*4 arithmetic packs
       * two vec4 uniforms per GRF starting at dispatch_grf_start_reg. */
      brw_reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
                                    (src[i].reg + src[i].reg_offset) / 2,
                                    ((src[i].reg + src[i].reg_offset) % 2) * 4),
      brw_reg = retype(brw_reg, src[i].type);
      brw_reg.dw1.bits.swizzle = src[i].swizzle;
      brw_reg = brw_abs(brw_reg);
      brw_reg = negate(brw_reg);
      /* This should have been moved to pull constants. */
      assert(!src[i].reladdr);

      /* Fixed hardware register source: type must already match. */
      assert(src[i].type == src[i].fixed_hw_reg.type);
      brw_reg = src[i].fixed_hw_reg;

      /* Probably unused. */
      brw_reg = brw_null_reg();

      unreachable("not reached");
/* vec4_generator constructor: stashes the compile context and allocates
 * the brw_codegen instruction-emission state on mem_ctx.  Two parameter
 * lines (presumably mem_ctx and debug_flag, which the initializer list
 * references) are not visible in this fragment — TODO confirm.
 */
vec4_generator::vec4_generator(struct brw_context *brw,
                               struct gl_shader_program *shader_prog,
                               struct gl_program *prog,
                               struct brw_vue_prog_data *prog_data,
                               const char *stage_name,
                               const char *stage_abbrev)
   : brw(brw), devinfo(brw->intelScreen->devinfo),
     shader_prog(shader_prog), prog(prog), prog_data(prog_data),
     mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev),
     debug_flag(debug_flag)
   /* Zero-initialized codegen state, ralloc'd so it is freed with mem_ctx. */
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(brw->intelScreen->devinfo, p, mem_ctx);
/* Destructor; its body is not visible in this fragment. */
vec4_generator::~vec4_generator()
/* Emit a Gen4 single-operand math operation.  Only the signature line
 * and two arguments (presumably of a brw_math() call whose opening is
 * not visible) remain in this fragment.
 */
vec4_generator::generate_math1_gen4(vec4_instruction *inst,
            brw_math_function(inst->opcode),
            BRW_MATH_PRECISION_FULL);
/* Sanity-check a source operand for Gen6 math: the hardware ignores
 * source swizzles, so the IR must not be relying on one.
 */
check_gen6_math_src_arg(struct brw_reg src)
   /* Source swizzles are ignored. */
   assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
/* Emit a Gen6 math instruction.  Gen6 math cannot run in Align16 mode,
 * so writemask/swizzle use is asserted away, the access mode is dropped
 * to Align1 around the emission, then restored to Align16.  The dst,
 * src0 and src1 parameter lines are not visible in this fragment.
 */
vec4_generator::generate_math_gen6(vec4_instruction *inst,
   /* Can't do writemask because math can't be align16. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gen6_math_src_arg(src0);
   if (src1.file == BRW_GENERAL_REGISTER_FILE)
      check_gen6_math_src_arg(src1);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
   brw_set_default_access_mode(p, BRW_ALIGN_16);
/* Emit a Gen4 two-operand math operation.  For everything except POW
 * the operand roles are reversed (see the PRM excerpt below), and the
 * second operand must first be copied into the MRF at base_mrf + 1.
 * Several lines (the PRM quote continuation, the brw_math() call
 * opening) are not visible in this fragment.
 */
vec4_generator::generate_math2_gen4(vec4_instruction *inst,
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    *
    * "Operand0[7]. For the INT DIV functions, this operand is the
    *
    * "Operand1[7]. For the INT DIV functions, this operand is the
    */
   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   struct brw_reg &op0 = is_int_div ? src1 : src0;
   struct brw_reg &op1 = is_int_div ? src0 : src1;

   brw_push_insn_state(p);
   /* The MRF-setup MOV must not inherit the instruction's saturate or
    * predication state. */
   brw_set_default_saturate(p, false);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
   brw_pop_insn_state(p);

            brw_math_function(inst->opcode),
            BRW_MATH_PRECISION_FULL);
/* Emit a vec4 sampler message for the texturing opcodes (sample, lod,
 * derivatives, ld, resinfo, gather...).  Many interior lines (break
 * statements, some closing braces, the brw_SAMPLE /
 * brw_set_sampler_message call openings, msg_type/dw2 declarations)
 * are not visible in this fragment — annotations below are hedged
 * accordingly.
 */
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg sampler_index)
   /* Gen5+ uses the unified sampler message encoding. */
   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         unreachable("should not get here: invalid vec4 texture opcode");
      /* Pre-Gen5 path: SIMD4x2 message encoding; each opcode asserts its
       * expected message length. */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         unreachable("should not get here: invalid vec4 texture opcode");

   assert(msg_type != -1);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present. If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield. Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         /* Set the texel offset bits in DWord 2. */
         if (devinfo->gen >= 9)
            /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
             * based on bit 22 in the header.
             */
            dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;

         brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));

         brw_adjust_sampler_state_pointer(p, header, sampler_index);
         brw_pop_insn_state(p);

   uint32_t return_format;

   /* Pick the sampler return format; presumably switched on the
    * destination type (the switch line itself is not visible here). */
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;

   /* Gather opcodes index the gather section of the binding table;
    * everything else uses the regular texture section. */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
                                        inst->opcode == SHADER_OPCODE_TG4_OFFSET)
                                       ? prog_data->base.binding_table.gather_texture_start
                                       : prog_data->base.binding_table.texture_start;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant sampler index: emit an ordinary sampler message
       * (the call opening is not visible in this fragment). */
      uint32_t sampler = sampler_index.dw1.ud;
                 sampler + base_binding_table_index,
                 1, /* response length */
                 inst->header_size != 0,
                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,

      brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);

      /* Non-constant sampler index. */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
      brw_MUL(p, addr, sampler_reg, brw_imm_ud(0x101));
      brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
/* Emit the URB write that outputs a vertex shader's results.  The
 * opening of the emitting call (presumably brw_urb_WRITE) and its
 * message-length argument are not visible in this fragment; the visible
 * arguments carry their original inline annotations.
 */
vec4_generator::generate_vs_urb_write(vec4_instruction *inst)
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 brw_vec8_grf(0, 0), /* src */
                 inst->urb_write_flags,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
/* Emit a geometry shader URB write.  The payload starts at base_mrf;
 * the opening of the emitting call and its message-length argument are
 * not visible in this fragment.
 */
vec4_generator::generate_gs_urb_write(vec4_instruction *inst)
   struct brw_reg src = brw_message_reg(inst->base_mrf);
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 inst->urb_write_flags,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
/* Emit a GS URB write that also allocates a new URB handle.  The write
 * uses src0's register as the writeback destination (response length 1),
 * then copies the returned handle into element 0 of the instruction's
 * destination in Align1/NoMask mode.  The opening of the URB-write call
 * is not visible in this fragment.
 */
vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst)
   struct brw_reg src = brw_message_reg(inst->base_mrf);

   /* We pass the temporary passed in src0 as the writeback register */
                 inst->get_src(this->prog_data, 0), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
                 1, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Now put allocated urb handle in dst.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(inst->get_dst(), 0),
           get_element_ud(inst->get_src(this->prog_data, 0), 0));
   brw_set_default_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
/* Emit the final, EOT-flagged URB write that terminates a geometry
 * shader thread.  Gen8+ uses a 2-register message; earlier gens use 1.
 * The opening of the emitting call is not visible in this fragment.
 */
vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
   struct brw_reg src = brw_message_reg(inst->base_mrf);
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
                 devinfo->gen >= 8 ? 2 : 1,/* message len */
                 0, /* response len */
                 0, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
/* Compute the per-slot URB write offsets for the two interleaved GS
 * invocations, as described by the PRM excerpt below.  The src0/src1
 * parameter lines are not visible in this fragment.
 */
vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   /* The UW retype below requires the immediate to fit in 16 bits. */
   assert(devinfo->gen >= 7 &&
          src1.file == BRW_IMMEDIATE_VALUE &&
          src1.type == BRW_REGISTER_TYPE_UD &&
          src1.dw1.ud <= USHRT_MAX);
   brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
           retype(src1, BRW_REGISTER_TYPE_UW));
   brw_set_default_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
/* Store the GS vertex count where the thread-end message expects it.
 * Gen8+: move it into the second MRF for the EOT write.  Earlier gens:
 * pack the two invocations' counts as words into DWORD 2 of dst, per
 * the comment below.  The src parameter line and part of the Gen8 MOV
 * are not visible in this fragment.
 */
vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   if (devinfo->gen >= 8) {
      /* Move the vertex count into the second MRF for the EOT write. */
      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
      /* If we think of the src and dst registers as composed of 8 DWORDs each,
       * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
       * them to WORDs, and then pack them into DWORD 2 of dst.
       *
       * It's easier to get the EU to do this if we think of the src and dst
       * registers as composed of 16 WORDS each; then, we want to pick up the
       * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
       *
       * We can do that by the following EU instruction:
       *
       *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
              suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
              stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
      brw_set_default_access_mode(p, BRW_ALIGN_16);

   brw_pop_insn_state(p);
/* Emit a streamed-vertex-buffer (transform feedback) write for Gen6 GS.
 * Copies the vertex data into M0, sends the SVB write (the call opening
 * is not visible in this fragment), and on the final write waits for
 * the write commit as explained by the PRM excerpt below.
 */
vec4_generator::generate_gs_svb_write(vec4_instruction *inst,
   int binding = inst->sol_binding;
   bool final_write = inst->sol_final_write;

   brw_push_insn_state(p);
   /* Copy Vertex data into M0.x */
   brw_MOV(p, stride(dst, 4, 4, 1),
           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));

              final_write ? src1 : brw_null_reg(), /* dest == src1 */
              dst, /* src0 == previous dst */
              SURF_INDEX_GEN6_SOL_BINDING(binding), /* binding_table_index */
              final_write); /* send_commit_msg */

   /* Finally, wait for the write commit to occur so that we can proceed to
    * other things safely.
    *
    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    *
    *   The write commit does not modify the destination register, but
    *   merely clears the dependency associated with the destination
    *   register. Thus, a simple “mov” instruction using the register as a
    *   source is sufficient to wait for the write commit to occur.
    */
   brw_MOV(p, src1, src1);

   brw_pop_insn_state(p);
/* Put the destination index for an SVB write into DWORD 5 of the
 * message header, taken from the element of src selected by the
 * instruction's sol_vertex.  The dst/src parameter lines are not
 * visible in this fragment.
 */
vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst,
   int vertex = inst->sol_vertex;
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
   brw_pop_insn_state(p);
/* Copy element 0 of src into DWORD 2 of dst, in Align1/NoMask mode so
 * exactly one dword is written regardless of channel enables.
 */
vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src)
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
   brw_pop_insn_state(p);
/* Pre-shift the second invocation's channel mask so the two masks can
 * later be OR'd together by generate_gs_set_channel_masks().
 */
vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
   /* We want to left shift just DWORD 4 (the x component belonging to the
    * second geometry shader invocation) by 4 bits. So generate the
    *
    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
    */
   dst = suboffset(vec1(dst), 4);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_SHL(p, dst, dst, brw_imm_ud(4));
   brw_pop_insn_state(p);
/* Combine the channel-mask nibbles for both GS invocations into the URB
 * write header's channel-mask byte, per the PRM excerpt below.  The src
 * parameter line is not visible in this fragment.
 */
vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    *
    *   15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *      When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *      DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *      Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    *      channel enable to determine the final channel enable.  For the
    *      URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *      enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *      in the writeback message.  For the URB_WRITE_OWORD &
    *      URB_WRITE_HWORD messages, when final channel enable is 1 it
    *      indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *      0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *      1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
    *
    *   14 Vertex 1 DATA [2] Channel Mask
    *   13 Vertex 1 DATA [1] Channel Mask
    *   12 Vertex 1 DATA [0] Channel Mask
    *   11 Vertex 0 DATA [3] Channel Mask
    *   10 Vertex 0 DATA [2] Channel Mask
    *    9 Vertex 0 DATA [1] Channel Mask
    *    8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:0 of
    * DWORD 4, we just need to OR them together and store the result in bits
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
/* Extract the GS invocation (instance) ID from the thread payload in r0
 * by shifting it down into dst.
 */
vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
   /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    * and store into dst.0 & dst.4. So generate the instruction:
    *
    *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_SHR(p, dst, stride(r0, 1, 4, 0),
           brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
   brw_pop_insn_state(p);
/* Pack the FF_SYNC message fields: src0's low 16 bits go to bits 16:31
 * of dst.0, src1's low 16 bits to bits 0:15, using src2 as scratch.
 * The src0/src1/src2 parameter lines are not visible in this fragment.
 */
vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   /* Save src0 data in 16:31 bits of dst.0 */
   brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
           brw_imm_ud(0xffffu));
   brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
   /* Save src1 data in 0:15 bits of dst.0 */
   brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
           brw_imm_ud(0xffffu));
   brw_OR(p, suboffset(vec1(dst), 0),
          suboffset(vec1(dst), 0),
          suboffset(vec1(src2), 0));
   brw_pop_insn_state(p);
/* Emit the Gen4/5 GS FF_SYNC message: fill the r0-based header with the
 * SO-vertex and primitive counts, allocate the URB handle (the emitting
 * call's opening is not visible in this fragment), then copy the handle
 * back into the header and, for transform feedback, into src1's
 * register.
 */
vec4_generator::generate_gs_ff_sync(vec4_instruction *inst,
   /* This opcode uses an implied MRF register for:
    *  - the header of the ff_sync message. And as such it is expected to be
    *    initialized to r0 before calling here.
    *  - the destination where we will write the allocated URB handle.
    */
   struct brw_reg header =
      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

   /* Overwrite dword 0 of the header (SO vertices to write) and
    * dword 1 (number of primitives written).
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
   brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
   brw_pop_insn_state(p);

   /* Allocate URB handle in dst */
                 1, /* response length */

   /* Now put allocated urb handle in header.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));

   /* src1 is not an immediate when we use transform feedback */
   if (src1.file != BRW_IMMEDIATE_VALUE)
      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));

   brw_pop_insn_state(p);
/* Copy the primitive ID out of the thread payload into element 0 of dst. */
vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst)
   /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
   struct brw_reg src = brw_vec8_grf(0, 0);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
   brw_pop_insn_state(p);
/* Fill in the two OWord-dual-block offsets (M1.0 and M1.4) used by the
 * scratch read/write messages.  The second vertex's offset differs from
 * the first by 1 on Gen6+ and 16 on earlier gens — presumably because
 * the offset units changed between gens; TODO confirm against the PRM.
 */
vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                  struct brw_reg index)
   int second_vertex_offset;

   if (devinfo->gen >= 6)
      second_vertex_offset = 1;
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload). Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Immediate index: fold the second offset in at compile time. */
      index_4.dw1.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
      /* Register index: add the second offset at run time. */
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));

   brw_pop_insn_state(p);
/* Unpack flag register f0.0 into dst: the low nibble goes to dst.0 and
 * the high nibble (shifted down by 4) to dst.4, one nibble per GS
 * invocation.
 */
vec4_generator::generate_unpack_flags(struct brw_reg dst)
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   struct brw_reg flags = brw_flag_reg(0, 0);
   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   struct brw_reg dst_4 = suboffset(vec1(dst), 4);

   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));

   brw_pop_insn_state(p);
/* Emit an OWord-dual-block read from the scratch buffer (binding table
 * index 255, stateless).  Builds the r0-based header plus per-vertex
 * offsets, then emits the SEND by hand.  The dst parameter line, the
 * msg_type declaration and the end of the generate_oword_dual_block_offsets
 * call are not visible in this fragment.
 */
vec4_generator::generate_scratch_read(vec4_instruction *inst,
                                      struct brw_reg index)
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      /* Pre-Gen6 the cond-modifier field carries the message register
       * number for SENDs — TODO confirm against the EU documentation. */
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           255, /* binding table index: stateless access */
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                           true, /* header_present */
/* Emit an OWord-dual-block write to the scratch buffer (binding table
 * index 255, stateless).  Builds the header and offsets with predication
 * disabled, copies the data into the MRF, then emits a SEND that carries
 * the instruction's own predication.  The dst/src parameter lines, the
 * msg_type declaration and the write-data MOV opening are not visible in
 * this fragment.
 */
vec4_generator::generate_scratch_write(vec4_instruction *inst,
                                       struct brw_reg index)
   struct brw_reg header = brw_vec8_grf(0, 0);

   /* If the instruction is predicated, we'll predicate the send, not
    */
   brw_set_default_predicate_control(p, false);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),

          retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
          retype(src, BRW_REGISTER_TYPE_D));

   if (devinfo->gen >= 7)
      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   /* Restore the instruction's own predication for the SEND itself. */
   brw_set_default_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    */
   if (devinfo->gen >= 6) {
      write_commit = false;
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */

   /* Each of the 8 channel enables is considered for whether each
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_write_message(p, send,
                            255, /* binding table index: stateless access */
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            true, /* header present */
                            false, /* not a render target write */
                            write_commit, /* rlen */
/* Emit a pre-Gen7 pull-constant load: an OWord-dual-block dataport read
 * from the surface named by the immediate `index`, at `offset`.  The
 * dst parameter line, the msg_type declaration and the end of the
 * offset-setup MOV are not visible in this fragment.
 */
vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
                                            struct brw_reg index,
                                            struct brw_reg offset)
   /* The surface index must be an immediate here; the non-immediate case
    * is handled by generate_pull_constant_load_gen7(). */
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           true, /* header_present */

   brw_mark_surface_used(&prog_data->base, surf_index);
/* Emit a Gen7+ pull-constant load through the sampler's LD message.
 * Immediate surface index: a direct sampler message.  Non-immediate:
 * compute the descriptor's surface index into a0.0 and use an indirect
 * send.  The dst parameter line and parts of the two
 * brw_set_sampler_message argument lists are not visible in this
 * fragment.
 */
vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
                                                 struct brw_reg surf_index,
                                                 struct brw_reg offset)
   assert(surf_index.type == BRW_REGISTER_TYPE_UD);

   if (surf_index.file == BRW_IMMEDIATE_VALUE) {
      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, dst);
      brw_set_src0(p, insn, offset);
      brw_set_sampler_message(p, insn,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,

      brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, offset, addr);
      brw_set_sampler_message(p, insn,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
/* Build a SKL+ (Gen9) message header in dst: copy g0, then set the
 * SIMD4x2 mode-extension bit in DWORD 2 so the SIMD4x2-overloaded
 * messages behave correctly.  The dst parameter line is not visible in
 * this fragment.
 */
vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst,
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 2),
           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));

   brw_pop_insn_state(p);
/* Walks the CFG and lowers each vec4 IR instruction to native Gen EU
 * instructions via the brw_* emitter helpers, then compacts the resulting
 * instruction stream and (when debug_flag is set) dumps annotated assembly.
 *
 * NOTE(review): this listing's embedded line numbers are non-contiguous —
 * lines (e.g. `break;` statements, some `case` labels, closing braces)
 * appear to have been dropped during extraction.  Confirm against the
 * original file before relying on the literal text below.
 */
1105 vec4_generator::generate_code(const cfg_t *cfg)
/* Annotation bookkeeping used only for the debug disassembly dump. */
1107 struct annotation_info annotation;
1108 memset(&annotation, 0, sizeof(annotation));
1111 foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
1112 struct brw_reg src[3], dst;
1114 if (unlikely(debug_flag))
1115 annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
/* Lower the IR operands into hardware register descriptions. */
1117 for (unsigned int i = 0; i < 3; i++) {
1118 src[i] = inst->get_src(this->prog_data, i);
1120 dst = inst->get_dst();
/* Copy this IR instruction's execution state into the emitter's default
 * state so all brw_* calls below inherit it.
 */
1122 brw_set_default_predicate_control(p, inst->predicate);
1123 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1124 brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1125 brw_set_default_saturate(p, inst->saturate);
1126 brw_set_default_mask_control(p, inst->force_writemask_all);
1127 brw_set_default_acc_write_control(p, inst->writes_accumulator);
/* Remember where this IR instruction's native code begins so the
 * post-switch fixup below can locate (and patch) the single emitted
 * hardware instruction.
 */
1129 unsigned pre_emit_nr_insn = p->nr_insn;
1131 if (dst.width == BRW_WIDTH_4) {
1132 /* This happens in attribute fixups for "dual instanced" geometry
1133 * shaders, since they use attributes that are vec4's. Since the exec
1134 * width is only 4, it's essential that the caller set
1135 * force_writemask_all in order to make sure the instruction is executed
1136 * regardless of which channels are enabled.
1138 assert(inst->force_writemask_all);
1140 /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
1141 * the following register region restrictions (from Graphics BSpec:
1142 * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
1143 * > Register Region Restrictions)
1145 * 1. ExecSize must be greater than or equal to Width.
1147 * 2. If ExecSize = Width and HorzStride != 0, VertStride must be set
1148 * to Width * HorzStride."
1150 for (int i = 0; i < 3; i++) {
1151 if (src[i].file == BRW_GENERAL_REGISTER_FILE)
1152 src[i] = stride(src[i], 4, 4, 1);
/* Dispatch on IR opcode.  Simple ALU opcodes map 1:1 onto brw_* emitters;
 * the more involved opcodes delegate to generate_* helper methods.
 */
1156 switch (inst->opcode) {
1157 case VEC4_OPCODE_UNPACK_UNIFORM:
1158 case BRW_OPCODE_MOV:
1159 brw_MOV(p, dst, src[0]);
1161 case BRW_OPCODE_ADD:
1162 brw_ADD(p, dst, src[0], src[1]);
1164 case BRW_OPCODE_MUL:
1165 brw_MUL(p, dst, src[0], src[1]);
1167 case BRW_OPCODE_MACH:
1168 brw_MACH(p, dst, src[0], src[1]);
1171 case BRW_OPCODE_MAD:
1172 assert(devinfo->gen >= 6);
1173 brw_MAD(p, dst, src[0], src[1], src[2]);
1176 case BRW_OPCODE_FRC:
1177 brw_FRC(p, dst, src[0]);
1179 case BRW_OPCODE_RNDD:
1180 brw_RNDD(p, dst, src[0]);
1182 case BRW_OPCODE_RNDE:
1183 brw_RNDE(p, dst, src[0]);
1185 case BRW_OPCODE_RNDZ:
1186 brw_RNDZ(p, dst, src[0]);
1189 case BRW_OPCODE_AND:
1190 brw_AND(p, dst, src[0], src[1]);
1193 brw_OR(p, dst, src[0], src[1]);
1195 case BRW_OPCODE_XOR:
1196 brw_XOR(p, dst, src[0], src[1]);
1198 case BRW_OPCODE_NOT:
1199 brw_NOT(p, dst, src[0]);
1201 case BRW_OPCODE_ASR:
1202 brw_ASR(p, dst, src[0], src[1]);
1204 case BRW_OPCODE_SHR:
1205 brw_SHR(p, dst, src[0], src[1]);
1207 case BRW_OPCODE_SHL:
1208 brw_SHL(p, dst, src[0], src[1]);
1211 case BRW_OPCODE_CMP:
1212 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1214 case BRW_OPCODE_SEL:
1215 brw_SEL(p, dst, src[0], src[1]);
1218 case BRW_OPCODE_DPH:
1219 brw_DPH(p, dst, src[0], src[1]);
1222 case BRW_OPCODE_DP4:
1223 brw_DP4(p, dst, src[0], src[1]);
1226 case BRW_OPCODE_DP3:
1227 brw_DP3(p, dst, src[0], src[1]);
1230 case BRW_OPCODE_DP2:
1231 brw_DP2(p, dst, src[0], src[1]);
1234 case BRW_OPCODE_F32TO16:
1235 assert(devinfo->gen >= 7);
1236 brw_F32TO16(p, dst, src[0]);
1239 case BRW_OPCODE_F16TO32:
1240 assert(devinfo->gen >= 7);
1241 brw_F16TO32(p, dst, src[0]);
1244 case BRW_OPCODE_LRP:
1245 assert(devinfo->gen >= 6);
1246 brw_LRP(p, dst, src[0], src[1], src[2]);
1249 case BRW_OPCODE_BFREV:
1250 assert(devinfo->gen >= 7);
1251 /* BFREV only supports UD type for src and dst. */
1252 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1253 retype(src[0], BRW_REGISTER_TYPE_UD));
1255 case BRW_OPCODE_FBH:
1256 assert(devinfo->gen >= 7);
1257 /* FBH only supports UD type for dst. */
1258 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1260 case BRW_OPCODE_FBL:
1261 assert(devinfo->gen >= 7);
1262 /* FBL only supports UD type for dst. */
1263 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1265 case BRW_OPCODE_CBIT:
1266 assert(devinfo->gen >= 7);
1267 /* CBIT only supports UD type for dst. */
1268 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1270 case BRW_OPCODE_ADDC:
1271 assert(devinfo->gen >= 7);
1272 brw_ADDC(p, dst, src[0], src[1]);
1274 case BRW_OPCODE_SUBB:
1275 assert(devinfo->gen >= 7);
1276 brw_SUBB(p, dst, src[0], src[1]);
1278 case BRW_OPCODE_MAC:
1279 brw_MAC(p, dst, src[0], src[1]);
1282 case BRW_OPCODE_BFE:
1283 assert(devinfo->gen >= 7);
1284 brw_BFE(p, dst, src[0], src[1], src[2]);
1287 case BRW_OPCODE_BFI1:
1288 assert(devinfo->gen >= 7);
1289 brw_BFI1(p, dst, src[0], src[1]);
1291 case BRW_OPCODE_BFI2:
1292 assert(devinfo->gen >= 7);
1293 brw_BFI2(p, dst, src[0], src[1], src[2]);
/* Flow control.  NOTE(review): the IF case label itself is not visible in
 * this listing; the embedded-compare branch below belongs to it.
 */
1297 if (inst->src[0].file != BAD_FILE) {
1298 /* The instruction has an embedded compare (only allowed on gen6) */
1299 assert(devinfo->gen == 6);
1300 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1302 brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
1303 brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
1307 case BRW_OPCODE_ELSE:
1310 case BRW_OPCODE_ENDIF:
1315 brw_DO(p, BRW_EXECUTE_8);
1318 case BRW_OPCODE_BREAK:
1320 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1322 case BRW_OPCODE_CONTINUE:
1324 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1327 case BRW_OPCODE_WHILE:
/* Single-source math: gen7+ emits the math instruction directly, gen6 uses
 * the send-style gen6 path, and gen4/5 use the message-based gen4 path.
 */
1332 case SHADER_OPCODE_RCP:
1333 case SHADER_OPCODE_RSQ:
1334 case SHADER_OPCODE_SQRT:
1335 case SHADER_OPCODE_EXP2:
1336 case SHADER_OPCODE_LOG2:
1337 case SHADER_OPCODE_SIN:
1338 case SHADER_OPCODE_COS:
1339 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1340 if (devinfo->gen >= 7) {
1341 gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
1343 } else if (devinfo->gen == 6) {
1344 generate_math_gen6(inst, dst, src[0], brw_null_reg());
1346 generate_math1_gen4(inst, dst, src[0]);
/* Two-source math, same per-generation split as above. */
1350 case SHADER_OPCODE_POW:
1351 case SHADER_OPCODE_INT_QUOTIENT:
1352 case SHADER_OPCODE_INT_REMAINDER:
1353 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1354 if (devinfo->gen >= 7) {
1355 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1356 } else if (devinfo->gen == 6) {
1357 generate_math_gen6(inst, dst, src[0], src[1]);
1359 generate_math2_gen4(inst, dst, src[0], src[1]);
/* All texturing opcodes funnel into one helper. */
1363 case SHADER_OPCODE_TEX:
1364 case SHADER_OPCODE_TXD:
1365 case SHADER_OPCODE_TXF:
1366 case SHADER_OPCODE_TXF_CMS:
1367 case SHADER_OPCODE_TXF_MCS:
1368 case SHADER_OPCODE_TXL:
1369 case SHADER_OPCODE_TXS:
1370 case SHADER_OPCODE_TG4:
1371 case SHADER_OPCODE_TG4_OFFSET:
1372 generate_tex(inst, dst, src[0], src[1]);
/* URB / scratch / pull-constant message opcodes. */
1375 case VS_OPCODE_URB_WRITE:
1376 generate_vs_urb_write(inst);
1379 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1380 generate_scratch_read(inst, dst, src[0]);
1383 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1384 generate_scratch_write(inst, dst, src[0], src[1]);
1387 case VS_OPCODE_PULL_CONSTANT_LOAD:
1388 generate_pull_constant_load(inst, dst, src[0], src[1]);
1391 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
1392 generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1395 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
1396 generate_set_simd4x2_header_gen9(inst, dst);
/* Geometry-shader-specific opcodes. */
1399 case GS_OPCODE_URB_WRITE:
1400 generate_gs_urb_write(inst);
1403 case GS_OPCODE_URB_WRITE_ALLOCATE:
1404 generate_gs_urb_write_allocate(inst);
1407 case GS_OPCODE_SVB_WRITE:
1408 generate_gs_svb_write(inst, dst, src[0], src[1]);
1411 case GS_OPCODE_SVB_SET_DST_INDEX:
1412 generate_gs_svb_set_destination_index(inst, dst, src[0]);
1415 case GS_OPCODE_THREAD_END:
1416 generate_gs_thread_end(inst);
1419 case GS_OPCODE_SET_WRITE_OFFSET:
1420 generate_gs_set_write_offset(dst, src[0], src[1]);
1423 case GS_OPCODE_SET_VERTEX_COUNT:
1424 generate_gs_set_vertex_count(dst, src[0]);
1427 case GS_OPCODE_FF_SYNC:
1428 generate_gs_ff_sync(inst, dst, src[0], src[1]);
1431 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
1432 generate_gs_ff_sync_set_primitives(dst, src[0], src[1], src[2]);
1435 case GS_OPCODE_SET_PRIMITIVE_ID:
1436 generate_gs_set_primitive_id(dst);
1439 case GS_OPCODE_SET_DWORD_2:
1440 generate_gs_set_dword_2(dst, src[0]);
1443 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
1444 generate_gs_prepare_channel_masks(dst);
1447 case GS_OPCODE_SET_CHANNEL_MASKS:
1448 generate_gs_set_channel_masks(dst, src[0]);
1451 case GS_OPCODE_GET_INSTANCE_ID:
1452 generate_gs_get_instance_id(dst);
1455 case SHADER_OPCODE_SHADER_TIME_ADD:
1456 brw_shader_time_add(p, src[0],
1457 prog_data->base.binding_table.shader_time_start);
/* Record the binding-table entry so state setup knows it is used. */
1458 brw_mark_surface_used(&prog_data->base,
1459 prog_data->base.binding_table.shader_time_start);
/* Untyped/typed surface messages.  src[1] carries the surface index and
 * src[2] the operand count/atomic op as immediates (see the asserts).
 */
1462 case SHADER_OPCODE_UNTYPED_ATOMIC:
1463 assert(src[1].file == BRW_IMMEDIATE_VALUE &&
1464 src[2].file == BRW_IMMEDIATE_VALUE);
1465 brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen,
1466 !inst->dst.is_null());
1467 brw_mark_surface_used(&prog_data->base, src[1].dw1.ud);
1470 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1471 assert(src[1].file == BRW_IMMEDIATE_VALUE &&
1472 src[2].file == BRW_IMMEDIATE_VALUE);
1473 brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
1475 brw_mark_surface_used(&prog_data->base, src[1].dw1.ud);
1478 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1479 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1480 brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
1484 case SHADER_OPCODE_TYPED_ATOMIC:
1485 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1486 brw_typed_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen,
1487 !inst->dst.is_null());
1490 case SHADER_OPCODE_TYPED_SURFACE_READ:
1491 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1492 brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
1496 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1497 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1498 brw_typed_surface_write(p, src[0], src[1], inst->mlen,
1502 case SHADER_OPCODE_MEMORY_FENCE:
1503 brw_memory_fence(p, dst);
1506 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1507 brw_find_live_channel(p, dst);
1510 case SHADER_OPCODE_BROADCAST:
1511 brw_broadcast(p, dst, src[0], src[1]);
1514 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
1515 generate_unpack_flags(dst);
1518 case VEC4_OPCODE_MOV_BYTES: {
1519 /* Moves the low byte from each channel, using an Align1 access mode
1520 * and a <4,1,0> source region.
1522 assert(src[0].type == BRW_REGISTER_TYPE_UB ||
1523 src[0].type == BRW_REGISTER_TYPE_B);
1525 brw_set_default_access_mode(p, BRW_ALIGN_1);
1526 src[0].vstride = BRW_VERTICAL_STRIDE_4;
1527 src[0].width = BRW_WIDTH_1;
1528 src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
1529 brw_MOV(p, dst, src[0]);
/* Restore the default vec4 access mode for subsequent instructions. */
1530 brw_set_default_access_mode(p, BRW_ALIGN_16);
1534 case VEC4_OPCODE_PACK_BYTES: {
1537 * mov(8) dst<16,4,1>:UB src<4,1,0>:UB
1539 * but destinations' only regioning is horizontal stride, so instead we
1540 * have to use two instructions:
1542 * mov(4) dst<1>:UB src<4,1,0>:UB
1543 * mov(4) dst.16<1>:UB src.16<4,1,0>:UB
1545 * where they pack the four bytes from the low and high four DW.
/* Exactly one writemask channel must be set; its index chooses the
 * destination byte offset within the DWord.
 */
1547 assert(is_power_of_two(dst.dw1.bits.writemask) &&
1548 dst.dw1.bits.writemask != 0);
1549 unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
1551 dst.type = BRW_REGISTER_TYPE_UB;
1553 brw_set_default_access_mode(p, BRW_ALIGN_1);
1555 src[0].type = BRW_REGISTER_TYPE_UB;
1556 src[0].vstride = BRW_VERTICAL_STRIDE_4;
1557 src[0].width = BRW_WIDTH_1;
1558 src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
/* First MOV packs bytes from the low four DWords... */
1559 dst.subnr = offset * 4;
1560 struct brw_inst *insn = brw_MOV(p, dst, src[0]);
1561 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
1562 brw_inst_set_no_dd_clear(p->devinfo, insn, true);
1563 brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
/* ...second MOV packs bytes from the high four DWords.  The no_dd_*
 * hints chain the pair's scoreboard dependencies by hand.
 */
1566 dst.subnr = 16 + offset * 4;
1567 insn = brw_MOV(p, dst, src[0]);
1568 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
1569 brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
1570 brw_inst_set_no_dd_check(p->devinfo, insn, true);
1572 brw_set_default_access_mode(p, BRW_ALIGN_16);
1577 unreachable("Unsupported opcode");
/* Post-emit fixup: conditional_mod and the no-dd scoreboard hints are
 * patched onto the single instruction emitted for this IR instruction.
 * PACK_BYTES already applied its hints itself above, so it is exempt.
 */
1580 if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
1581 /* Handled dependency hints in the generator. */
1583 assert(!inst->conditional_mod);
1584 } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
1585 assert(p->nr_insn == pre_emit_nr_insn + 1 ||
1586 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
1587 "emitting more than 1 instruction");
1589 brw_inst *last = &p->store[pre_emit_nr_insn];
1591 if (inst->conditional_mod)
1592 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
1593 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
1594 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
1599 annotation_finalize(&annotation, p->next_insn_offset);
/* Compact the whole instruction stream (offsets in the annotations are
 * rewritten by the compactor), then report the size delta.
 */
1601 int before_size = p->next_insn_offset;
1602 brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
1603 int after_size = p->next_insn_offset;
/* Debug path: print a header, stats, and the annotated disassembly. */
1605 if (unlikely(debug_flag)) {
1607 fprintf(stderr, "Native code for %s %s shader %d:\n",
1608 shader_prog->Label ? shader_prog->Label : "unnamed",
1609 stage_name, shader_prog->Name);
1611 fprintf(stderr, "Native code for %s program %d:\n", stage_name,
1614 fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d"
1615 " bytes (%.0f%%)\n",
1617 before_size / 16, loop_count, before_size, after_size,
1618 100.0f * (before_size - after_size) / before_size);
1620 dump_assembly(p->store, annotation.ann_count, annotation.ann,
1622 ralloc_free(annotation.ann);
/* Also surface the stats through the GL debug-output mechanism. */
1625 static GLuint msg_id = 0;
1626 _mesa_gl_debug(&brw->ctx, &msg_id,
1627 MESA_DEBUG_SOURCE_SHADER_COMPILER,
1628 MESA_DEBUG_TYPE_OTHER,
1629 MESA_DEBUG_SEVERITY_NOTIFICATION,
1630 "%s vec4 shader: %d inst, %d loops, "
1631 "compacted %d to %d bytes.\n",
1633 before_size / 16, loop_count,
1634 before_size, after_size);
/* Top-level entry point for code generation: sets up the emitter's default
 * access mode and returns the assembled program, storing its size in
 * *assembly_size.
 *
 * NOTE(review): the embedded line numbers jump 1641 -> 1644, so at least one
 * statement (presumably the generate_code(cfg) call) is missing from this
 * listing — confirm against the original file.
 */
1638 vec4_generator::generate_assembly(const cfg_t *cfg,
1639 unsigned *assembly_size)
/* vec4 instructions are emitted in Align16 access mode by default. */
1641 brw_set_default_access_mode(p, BRW_ALIGN_16);
/* Hand back the final binary; brw_get_program also reports its size. */
1644 return brw_get_program(p, assembly_size);
1647 } /* namespace brw */