From 9eb568d7531eb4715be24d5076353ea6c10c8ceb Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Thu, 6 Dec 2012 22:37:34 -0800 Subject: [PATCH] i965: Create a new vec4 backend for Broadwell. This replaces the old vec4_generator backend. v2: Port to use the C-based instruction representation. Also, remove Geometry Shader offset hacks - the visitor will handle those instead of this code. v3: Texturing fixes (including adding textureGather support). v4: Pass brw_context to gen8_instruction functions as required. v5: Add SHADER_OPCODE_TXF_MCS support; port DUAL_INSTANCED gs fixes (caught by Eric). Simplify ADDC/SUBB handling; add comments to gen8_set_dp_message calls (suggested by Matt). Signed-off-by: Kenneth Graunke Reviewed-by: Eric Anholt Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/Makefile.sources | 1 + src/mesa/drivers/dri/i965/brw_vec4.cpp | 16 +- src/mesa/drivers/dri/i965/brw_vec4.h | 61 ++ src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 36 +- src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp | 879 ++++++++++++++++++++++ 5 files changed, 976 insertions(+), 17 deletions(-) create mode 100644 src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index ade40eded5a..f38d06edbf2 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -142,4 +142,5 @@ i965_FILES = \ gen8_disasm.c \ gen8_generator.cpp \ gen8_instruction.c \ + gen8_vec4_generator.cpp \ $() diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 9d3735a1504..d4ed820fe3b 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1676,10 +1676,16 @@ brw_vs_emit(struct brw_context *brw, return NULL; } - vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, mem_ctx, - INTEL_DEBUG & DEBUG_VS); - const unsigned *generated 
=g.generate_assembly(&v.instructions, - final_assembly_size); + const unsigned *assembly = NULL; + if (brw->gen >= 8) { + gen8_vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, + mem_ctx, INTEL_DEBUG & DEBUG_VS); + assembly = g.generate_assembly(&v.instructions, final_assembly_size); + } else { + vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, + mem_ctx, INTEL_DEBUG & DEBUG_VS); + assembly = g.generate_assembly(&v.instructions, final_assembly_size); + } if (unlikely(brw->perf_debug) && shader) { if (shader->compiled_once) { @@ -1692,7 +1698,7 @@ brw_vs_emit(struct brw_context *brw, shader->compiled_once = true; } - return generated; + return assembly; } diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index d4029d8aa37..71aaf1adfac 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -39,6 +39,7 @@ extern "C" { #ifdef __cplusplus }; /* extern "C" */ +#include "gen8_generator.h" #endif #include "glsl/ir.h" @@ -653,6 +654,66 @@ private: const bool debug_flag; }; +/** + * The Gen8+ (Broadwell) vec4 code generator. + * + * Translates vec4 IR (used for both VS and GS) to actual Gen8 assembly code.
+ */ +class gen8_vec4_generator : public gen8_generator +{ +public: + gen8_vec4_generator(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_vec4_prog_data *prog_data, + void *mem_ctx, + bool debug_flag); + ~gen8_vec4_generator(); + + const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size); + +private: + void generate_code(exec_list *instructions); + void generate_vec4_instruction(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg *src); + + void generate_tex(vec4_instruction *inst, + struct brw_reg dst); + + void generate_urb_write(vec4_instruction *ir, bool copy_g0); + void generate_gs_thread_end(vec4_instruction *ir); + void generate_gs_set_write_offset(struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1); + void generate_gs_set_vertex_count(struct brw_reg dst, + struct brw_reg src); + void generate_gs_set_dword_2_immed(struct brw_reg dst, struct brw_reg src); + void generate_gs_prepare_channel_masks(struct brw_reg dst); + void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src); + + void generate_oword_dual_block_offsets(struct brw_reg m1, + struct brw_reg index); + void generate_scratch_write(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg index); + void generate_scratch_read(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index); + void generate_pull_constant_load(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset); + + void mark_surface_used(unsigned surf_index); + + struct brw_vec4_prog_data *prog_data; + + const bool debug_flag; +}; + + } /* namespace brw */ #endif /* __cplusplus */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 5d5b1690477..f33c80dae3d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ 
-533,6 +533,25 @@ vec4_gs_visitor::visit(ir_end_primitive *) emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); } +static const unsigned * +generate_assembly(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_vec4_prog_data *prog_data, + void *mem_ctx, + exec_list *instructions, + unsigned *final_assembly_size) +{ + if (brw->gen >= 8) { + gen8_vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx, + INTEL_DEBUG & DEBUG_GS); + return g.generate_assembly(instructions, final_assembly_size); + } else { + vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx, + INTEL_DEBUG & DEBUG_GS); + return g.generate_assembly(instructions, final_assembly_size); + } +} extern "C" const unsigned * brw_gs_emit(struct brw_context *brw, @@ -558,12 +577,9 @@ brw_gs_emit(struct brw_context *brw, vec4_gs_visitor v(brw, c, prog, shader, mem_ctx, true /* no_spills */); if (v.run()) { - vec4_generator g(brw, prog, &c->gp->program.Base, &c->prog_data.base, - mem_ctx, INTEL_DEBUG & DEBUG_GS); - const unsigned *generated = - g.generate_assembly(&v.instructions, final_assembly_size); - - return generated; + return generate_assembly(brw, prog, &c->gp->program.Base, + &c->prog_data.base, mem_ctx, &v.instructions, + final_assembly_size); } } @@ -586,12 +602,8 @@ brw_gs_emit(struct brw_context *brw, return NULL; } - vec4_generator g(brw, prog, &c->gp->program.Base, &c->prog_data.base, - mem_ctx, INTEL_DEBUG & DEBUG_GS); - const unsigned *generated = - g.generate_assembly(&v.instructions, final_assembly_size); - - return generated; + return generate_assembly(brw, prog, &c->gp->program.Base, &c->prog_data.base, + mem_ctx, &v.instructions, final_assembly_size); } diff --git a/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp b/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp new file mode 100644 index 00000000000..ee86dbbccfa --- /dev/null +++ b/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp @@ -0,0 +1,879 @@ 
+/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4.h" + +extern "C" { +#include "brw_eu.h" +#include "main/macros.h" +#include "program/prog_print.h" +#include "program/prog_parameter.h" +}; + +namespace brw { + +gen8_vec4_generator::gen8_vec4_generator(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_vec4_prog_data *prog_data, + void *mem_ctx, + bool debug_flag) + : gen8_generator(brw, shader_prog, prog, mem_ctx), + prog_data(prog_data), + debug_flag(debug_flag) +{ + shader = shader_prog ? 
shader_prog->_LinkedShaders[MESA_SHADER_VERTEX] : NULL; +} + +gen8_vec4_generator::~gen8_vec4_generator() +{ +} + +void +gen8_vec4_generator::mark_surface_used(unsigned surf_index) +{ + assert(surf_index < BRW_MAX_SURFACES); + + prog_data->base.binding_table.size_bytes = + MAX2(prog_data->base.binding_table.size_bytes, (surf_index + 1) * 4); +} + +void +gen8_vec4_generator::generate_tex(vec4_instruction *ir, struct brw_reg dst) +{ + int msg_type = 0; + + switch (ir->opcode) { + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + if (ir->shadow_compare) { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; + } + break; + case SHADER_OPCODE_TXD: + if (ir->shadow_compare) { + msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; + } + break; + case SHADER_OPCODE_TXF: + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; + break; + case SHADER_OPCODE_TXF_MS: + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; + break; + case SHADER_OPCODE_TXF_MCS: + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; + break; + case SHADER_OPCODE_TXS: + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; + break; + case SHADER_OPCODE_TG4: + if (ir->shadow_compare) { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; + } else { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; + } + break; + case SHADER_OPCODE_TG4_OFFSET: + if (ir->shadow_compare) { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; + } else { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; + } + break; + default: + assert(!"should not get here: invalid VS texture opcode"); + break; + } + + if (ir->header_present) { + MOV_RAW(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD), + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + if (ir->texture_offset) { + /* Set the offset bits in DWord 2. 
*/ + default_state.access_mode = BRW_ALIGN_1; + MOV_RAW(retype(brw_vec1_reg(MRF, ir->base_mrf, 2), + BRW_REGISTER_TYPE_UD), + brw_imm_ud(ir->texture_offset)); + default_state.access_mode = BRW_ALIGN_16; + } + } + + uint32_t surf_index = + prog_data->base.binding_table.texture_start + ir->sampler; + + gen8_instruction *inst = next_inst(BRW_OPCODE_SEND); + gen8_set_dst(brw, inst, dst); + gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf)); + gen8_set_sampler_message(brw, inst, + surf_index, + ir->sampler, + msg_type, + 1, + ir->mlen, + ir->header_present, + BRW_SAMPLER_SIMD_MODE_SIMD4X2); + + mark_surface_used(surf_index); +} + +void +gen8_vec4_generator::generate_urb_write(vec4_instruction *ir, bool vs) +{ + struct brw_reg header = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0); + + /* Copy g0. */ + if (vs) + MOV_RAW(header, brw_vec8_grf(0, 0)); + + gen8_instruction *inst; + if (!(ir->urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) { + /* Enable Channel Masks in the URB_WRITE_OWORD message header */ + default_state.access_mode = BRW_ALIGN_1; + inst = OR(retype(brw_vec1_grf(GEN7_MRF_HACK_START + ir->base_mrf, 5), + BRW_REGISTER_TYPE_UD), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xff00)); + gen8_set_mask_control(inst, BRW_MASK_DISABLE); + default_state.access_mode = BRW_ALIGN_16; + } + + inst = next_inst(BRW_OPCODE_SEND); + gen8_set_urb_message(brw, inst, ir->urb_write_flags, ir->mlen, 0, ir->offset, + true); + gen8_set_dst(brw, inst, brw_null_reg()); + gen8_set_src0(brw, inst, header); +} + +void +gen8_vec4_generator::generate_gs_set_vertex_count(struct brw_reg eot_mrf_header, + struct brw_reg src) +{ + /* Move the vertex count into the second MRF for the EOT write. 
*/ + assert(eot_mrf_header.file == BRW_MESSAGE_REGISTER_FILE); + int dst_nr = GEN7_MRF_HACK_START + eot_mrf_header.nr + 1; + MOV(retype(brw_vec8_grf(dst_nr, 0), BRW_REGISTER_TYPE_UD), src); +} + +void +gen8_vec4_generator::generate_gs_thread_end(vec4_instruction *ir) +{ + struct brw_reg src = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0); + gen8_instruction *inst; + + /* Enable Channel Masks in the URB_WRITE_HWORD message header */ + default_state.access_mode = BRW_ALIGN_1; + inst = OR(retype(brw_vec1_grf(GEN7_MRF_HACK_START + ir->base_mrf, 5), + BRW_REGISTER_TYPE_UD), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xff00)); /* could be 0x1100 but shouldn't matter */ + gen8_set_mask_control(inst, BRW_MASK_DISABLE); + default_state.access_mode = BRW_ALIGN_16; + + /* mlen = 2: g0 header + vertex count */ + inst = next_inst(BRW_OPCODE_SEND); + gen8_set_urb_message(brw, inst, BRW_URB_WRITE_EOT, 2, 0, 0, true); + gen8_set_dst(brw, inst, brw_null_reg()); + gen8_set_src0(brw, inst, src); +} + +void +gen8_vec4_generator::generate_gs_set_write_offset(struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) +{ + /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message + * Header: M0.3): + * + * Slot 0 Offset. This field, after adding to the Global Offset field + * in the message descriptor, specifies the offset (in 256-bit units) + * from the start of the URB entry, as referenced by URB Handle 0, at + * which the data will be accessed. + * + * Similar text describes DWORD M0.4, which is slot 1 offset. + * + * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components + * of the register for geometry shader invocations 0 and 1) by the + * immediate value in src1, and store the result in DWORDs 3 and 4 of dst. 
+ * + * We can do this with the following EU instruction: + * + * mul(2) dst.3<1>UD src0<8;2,4>UD src1 { Align1 WE_all } + */ + default_state.access_mode = BRW_ALIGN_1; + gen8_instruction *inst = + MUL(suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), src1); + gen8_set_mask_control(inst, BRW_MASK_DISABLE); + default_state.access_mode = BRW_ALIGN_16; +} + +void +gen8_vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst, + struct brw_reg src) +{ + assert(src.file == BRW_IMMEDIATE_VALUE); + + default_state.access_mode = BRW_ALIGN_1; + + gen8_instruction *inst = MOV(suboffset(vec1(dst), 2), src); + gen8_set_mask_control(inst, BRW_MASK_DISABLE); + + default_state.access_mode = BRW_ALIGN_16; +} + +void +gen8_vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst) +{ + /* We want to left shift just DWORD 4 (the x component belonging to the + * second geometry shader invocation) by 4 bits. So generate the + * instruction: + * + * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all } + */ + dst = suboffset(vec1(dst), 4); + default_state.access_mode = BRW_ALIGN_1; + gen8_instruction *inst = SHL(dst, dst, brw_imm_ud(4)); + gen8_set_mask_control(inst, BRW_MASK_DISABLE); + default_state.access_mode = BRW_ALIGN_16; +} + +void +gen8_vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst, + struct brw_reg src) +{ + /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message + * Header: M0.5): + * + * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask + * + * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1 + * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls + * Vertex 0 DATA[7]. This bit is ANDed with the corresponding + * channel enable to determine the final channel enable. For the + * URB_READ_OWORD & URB_READ_HWORD messages, when final channel + * enable is 1 it indicates that Vertex 1 DATA [3] will be included + * in the writeback message. 
For the URB_WRITE_OWORD & + * URB_WRITE_HWORD messages, when final channel enable is 1 it + * indicates that Vertex 1 DATA [3] will be written to the surface. + * + * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included + * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included + * + * 14 Vertex 1 DATA [2] Channel Mask + * 13 Vertex 1 DATA [1] Channel Mask + * 12 Vertex 1 DATA [0] Channel Mask + * 11 Vertex 0 DATA [3] Channel Mask + * 10 Vertex 0 DATA [2] Channel Mask + * 9 Vertex 0 DATA [1] Channel Mask + * 8 Vertex 0 DATA [0] Channel Mask + * + * (This is from a section of the PRM that is agnostic to the particular + * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to + * geometry shader invocations 0 and 1, respectively). Since we have the + * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0, + * and the enable flags for geometry shader invocation 1 in bits 7:0 of + * DWORD 4, we just need to OR them together and store the result in bits + * 15:8 of DWORD 5. + * + * It's easier to get the EU to do this if we think of the src and dst + * registers as composed of 32 bytes each; then, we want to pick up the + * contents of bytes 0 and 16 from src, OR them together, and store them in + * byte 21. + * + * We can do that by the following EU instruction: + * + * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all } + * + * Note: this relies on the source register having zeros in (a) bits 7:4 of + * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the + * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which + * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to + * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to + * contain valid channel mask values (which are in the range 0x0-0xf). 
+ */ + dst = retype(dst, BRW_REGISTER_TYPE_UB); + src = retype(src, BRW_REGISTER_TYPE_UB); + + default_state.access_mode = BRW_ALIGN_1; + + gen8_instruction *inst = + OR(suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16)); + gen8_set_mask_control(inst, BRW_MASK_DISABLE); + + default_state.access_mode = BRW_ALIGN_16; +} + +void +gen8_vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1, + struct brw_reg index) +{ + int second_vertex_offset = 1; + + m1 = retype(m1, BRW_REGISTER_TYPE_D); + + /* Set up M1 (message payload). Only the block offsets in M1.0 and + * M1.4 are used, and the rest are ignored. + */ + struct brw_reg m1_0 = suboffset(vec1(m1), 0); + struct brw_reg m1_4 = suboffset(vec1(m1), 4); + struct brw_reg index_0 = suboffset(vec1(index), 0); + struct brw_reg index_4 = suboffset(vec1(index), 4); + + default_state.mask_control = BRW_MASK_DISABLE; + default_state.access_mode = BRW_ALIGN_1; + + MOV(m1_0, index_0); + + if (index.file == BRW_IMMEDIATE_VALUE) { + index_4.dw1.ud += second_vertex_offset; + MOV(m1_4, index_4); + } else { + ADD(m1_4, index_4, brw_imm_d(second_vertex_offset)); + } + + default_state.mask_control = BRW_MASK_ENABLE; + default_state.access_mode = BRW_ALIGN_16; +} + +void +gen8_vec4_generator::generate_scratch_read(vec4_instruction *ir, + struct brw_reg dst, + struct brw_reg index) +{ + struct brw_reg header = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0); + + MOV_RAW(header, brw_vec8_grf(0, 0)); + + generate_oword_dual_block_offsets(brw_message_reg(ir->base_mrf + 1), index); + + /* Each of the 8 channel enables is considered for whether each + * dword is written. 
+ */ + gen8_instruction *send = next_inst(BRW_OPCODE_SEND); + gen8_set_dst(brw, send, dst); + gen8_set_src0(brw, send, header); + gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE, + 255, /* binding table index: stateless access */ + GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + 2, /* mlen */ + 1, /* rlen */ + true, /* header present */ + false); /* EOT */ +} + +void +gen8_vec4_generator::generate_scratch_write(vec4_instruction *ir, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg index) +{ + struct brw_reg header = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0); + + MOV_RAW(header, brw_vec8_grf(0, 0)); + + generate_oword_dual_block_offsets(brw_message_reg(ir->base_mrf + 1), index); + + MOV(retype(brw_message_reg(ir->base_mrf + 2), BRW_REGISTER_TYPE_D), + retype(src, BRW_REGISTER_TYPE_D)); + + /* Each of the 8 channel enables is considered for whether each + * dword is written. + */ + gen8_instruction *send = next_inst(BRW_OPCODE_SEND); + gen8_set_dst(brw, send, dst); + gen8_set_src0(brw, send, header); + gen8_set_pred_control(send, ir->predicate); + gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE, + 255, /* binding table index: stateless access */ + GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + 3, /* mlen */ + 0, /* rlen */ + true, /* header present */ + false); /* EOT */ +} + +void +gen8_vec4_generator::generate_pull_constant_load(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset) +{ + assert(index.file == BRW_IMMEDIATE_VALUE && + index.type == BRW_REGISTER_TYPE_UD); + uint32_t surf_index = index.dw1.ud; + + assert(offset.file == BRW_GENERAL_REGISTER_FILE); + + /* Each of the 8 channel enables is considered for whether each + * dword is written. 
+ */ + gen8_instruction *send = next_inst(BRW_OPCODE_SEND); + gen8_set_dst(brw, send, dst); + gen8_set_src0(brw, send, offset); + gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE, + surf_index, + GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ, + 0, /* message control */ + 1, /* mlen */ + 1, /* rlen */ + false, /* no header */ + false); /* EOT */ + + mark_surface_used(surf_index); +} + +void +gen8_vec4_generator::generate_vec4_instruction(vec4_instruction *instruction, + struct brw_reg dst, + struct brw_reg *src) +{ + vec4_instruction *ir = (vec4_instruction *) instruction; + + if (dst.width == BRW_WIDTH_4) { + /* This happens in attribute fixups for "dual instanced" geometry + * shaders, since they use attributes that are vec4's. Since the exec + * width is only 4, it's essential that the caller set + * force_writemask_all in order to make sure the instruction is executed + * regardless of which channels are enabled. + */ + assert(ir->force_writemask_all); + + /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy + * the following register region restrictions (from Graphics BSpec: + * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions + * > Register Region Restrictions) + * + * 1. ExecSize must be greater than or equal to Width. + * + * 2. If ExecSize = Width and HorzStride != 0, VertStride must be set + * to Width * HorzStride.
+ */ + for (int i = 0; i < 3; i++) { + if (src[i].file == BRW_GENERAL_REGISTER_FILE) + src[i] = stride(src[i], 4, 4, 1); + } + } + + switch (ir->opcode) { + case BRW_OPCODE_MOV: + MOV(dst, src[0]); + break; + + case BRW_OPCODE_ADD: + ADD(dst, src[0], src[1]); + break; + + case BRW_OPCODE_MUL: + MUL(dst, src[0], src[1]); + break; + + case BRW_OPCODE_MACH: + MACH(dst, src[0], src[1]); + break; + + case BRW_OPCODE_MAD: + MAD(dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_FRC: + FRC(dst, src[0]); + break; + + case BRW_OPCODE_RNDD: + RNDD(dst, src[0]); + break; + + case BRW_OPCODE_RNDE: + RNDE(dst, src[0]); + break; + + case BRW_OPCODE_RNDZ: + RNDZ(dst, src[0]); + break; + + case BRW_OPCODE_AND: + AND(dst, src[0], src[1]); + break; + + case BRW_OPCODE_OR: + OR(dst, src[0], src[1]); + break; + + case BRW_OPCODE_XOR: + XOR(dst, src[0], src[1]); + break; + + case BRW_OPCODE_NOT: + NOT(dst, src[0]); + break; + + case BRW_OPCODE_ASR: + ASR(dst, src[0], src[1]); + break; + + case BRW_OPCODE_SHR: + SHR(dst, src[0], src[1]); + break; + + case BRW_OPCODE_SHL: + SHL(dst, src[0], src[1]); + break; + + case BRW_OPCODE_CMP: + CMP(dst, ir->conditional_mod, src[0], src[1]); + break; + + case BRW_OPCODE_SEL: + SEL(dst, src[0], src[1]); + break; + + case BRW_OPCODE_DPH: + DPH(dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP4: + DP4(dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP3: + DP3(dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP2: + DP2(dst, src[0], src[1]); + break; + + case BRW_OPCODE_F32TO16: + F32TO16(dst, src[0]); + break; + + case BRW_OPCODE_F16TO32: + F16TO32(dst, src[0]); + break; + + case BRW_OPCODE_LRP: + LRP(dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_BFREV: + /* BFREV only supports UD type for src and dst. */ + BFREV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + + case BRW_OPCODE_FBH: + /* FBH only supports UD type for dst. 
*/ + FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]); + break; + + case BRW_OPCODE_FBL: + /* FBL only supports UD type for dst. */ + FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]); + break; + + case BRW_OPCODE_CBIT: + /* CBIT only supports UD type for dst. */ + CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]); + break; + + case BRW_OPCODE_ADDC: + ADDC(dst, src[0], src[1]); + break; + + case BRW_OPCODE_SUBB: + SUBB(dst, src[0], src[1]); + break; + + case BRW_OPCODE_BFE: + BFE(dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_BFI1: + BFI1(dst, src[0], src[1]); + break; + + case BRW_OPCODE_BFI2: + BFI2(dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_IF: + IF(ir->predicate); + break; + + case BRW_OPCODE_ELSE: + ELSE(); + break; + + case BRW_OPCODE_ENDIF: + ENDIF(); + break; + + case BRW_OPCODE_DO: + DO(); + break; + + case BRW_OPCODE_BREAK: + BREAK(); + break; + + case BRW_OPCODE_CONTINUE: + CONTINUE(); + break; + + case BRW_OPCODE_WHILE: + WHILE(); + break; + + case SHADER_OPCODE_RCP: + MATH(BRW_MATH_FUNCTION_INV, dst, src[0]); + break; + + case SHADER_OPCODE_RSQ: + MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]); + break; + + case SHADER_OPCODE_SQRT: + MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]); + break; + + case SHADER_OPCODE_EXP2: + MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]); + break; + + case SHADER_OPCODE_LOG2: + MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]); + break; + + case SHADER_OPCODE_SIN: + MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]); + break; + + case SHADER_OPCODE_COS: + MATH(BRW_MATH_FUNCTION_COS, dst, src[0]); + break; + + case SHADER_OPCODE_POW: + MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_INT_QUOTIENT: + MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_INT_REMAINDER: + MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_MS: + case 
SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + generate_tex(ir, dst); + break; + + case VS_OPCODE_URB_WRITE: + generate_urb_write(ir, true); + break; + + case SHADER_OPCODE_GEN4_SCRATCH_READ: + generate_scratch_read(ir, dst, src[0]); + break; + + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + generate_scratch_write(ir, dst, src[0], src[1]); + break; + + case VS_OPCODE_PULL_CONSTANT_LOAD: + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + generate_pull_constant_load(ir, dst, src[0], src[1]); + break; + + case GS_OPCODE_URB_WRITE: + generate_urb_write(ir, false); + break; + + case GS_OPCODE_THREAD_END: + generate_gs_thread_end(ir); + break; + + case GS_OPCODE_SET_WRITE_OFFSET: + generate_gs_set_write_offset(dst, src[0], src[1]); + break; + + case GS_OPCODE_SET_VERTEX_COUNT: + generate_gs_set_vertex_count(dst, src[0]); + break; + + case GS_OPCODE_SET_DWORD_2_IMMED: + generate_gs_set_dword_2_immed(dst, src[0]); + break; + + case GS_OPCODE_PREPARE_CHANNEL_MASKS: + generate_gs_prepare_channel_masks(dst); + break; + + case GS_OPCODE_SET_CHANNEL_MASKS: + generate_gs_set_channel_masks(dst, src[0]); + break; + + case SHADER_OPCODE_SHADER_TIME_ADD: + assert(!"XXX: Missing Gen8 vec4 support for INTEL_DEBUG=shader_time"); + break; + + case SHADER_OPCODE_UNTYPED_ATOMIC: + assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_ATOMIC"); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_SURFACE_READ"); + break; + + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: + assert(!"VS_OPCODE_UNPACK_FLAGS_SIMD4X2 should not be used on Gen8+."); + break; + + default: + if (ir->opcode < (int) ARRAY_SIZE(opcode_descs)) { + _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n", + opcode_descs[ir->opcode].name); + } else { + _mesa_problem(ctx, "Unsupported opcode %d in VS", ir->opcode); + } + abort(); + } +} + +void +gen8_vec4_generator::generate_code(exec_list *instructions) 
+{ + int last_native_inst_offset = 0; + const char *last_annotation_string = NULL; + const void *last_annotation_ir = NULL; + + if (unlikely(debug_flag)) { + if (shader) { + printf("Native code for vertex shader %d:\n", shader_prog->Name); + } else { + printf("Native code for vertex program %d:\n", prog->Id); + } + } + + foreach_list(node, instructions) { + vec4_instruction *ir = (vec4_instruction *) node; + struct brw_reg src[3], dst; + + if (unlikely(debug_flag)) { + if (last_annotation_ir != ir->ir) { + last_annotation_ir = ir->ir; + if (last_annotation_ir) { + printf(" "); + if (shader) { + ((ir_instruction *) last_annotation_ir)->print(); + } else { + const prog_instruction *vpi; + vpi = (const prog_instruction *) ir->ir; + printf("%d: ", (int)(vpi - prog->Instructions)); + _mesa_fprint_instruction_opt(stdout, vpi, 0, + PROG_PRINT_DEBUG, NULL); + } + printf("\n"); + } + } + if (last_annotation_string != ir->annotation) { + last_annotation_string = ir->annotation; + if (last_annotation_string) + printf(" %s\n", last_annotation_string); + } + } + + for (unsigned int i = 0; i < 3; i++) { + src[i] = ir->get_src(prog_data, i); + } + dst = ir->get_dst(); + + default_state.conditional_mod = ir->conditional_mod; + default_state.predicate = ir->predicate; + default_state.predicate_inverse = ir->predicate_inverse; + default_state.saturate = ir->saturate; + + const unsigned pre_emit_nr_inst = nr_inst; + + generate_vec4_instruction(ir, dst, src); + + if (ir->no_dd_clear || ir->no_dd_check) { + assert(nr_inst == pre_emit_nr_inst + 1 || + !"no_dd_check or no_dd_clear set for IR emitting more " + "than 1 instruction"); + + gen8_instruction *last = &store[pre_emit_nr_inst]; + gen8_set_no_dd_clear(last, ir->no_dd_clear); + gen8_set_no_dd_check(last, ir->no_dd_check); + } + + if (unlikely(debug_flag)) { + disassemble(stdout, last_native_inst_offset, next_inst_offset); + } + + last_native_inst_offset = next_inst_offset; + } + + if (unlikely(debug_flag)) { + printf("\n"); + } + + 
patch_jump_targets(); + + /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS + * emit issues, it doesn't get the jump distances into the output, + * which is often something we want to debug. So this is here in + * case you're doing that. + */ + if (0 && unlikely(debug_flag)) { + disassemble(stdout, 0, next_inst_offset); + } +} + +const unsigned * +gen8_vec4_generator::generate_assembly(exec_list *instructions, + unsigned *assembly_size) +{ + default_state.access_mode = BRW_ALIGN_16; + default_state.exec_size = BRW_EXECUTE_8; + generate_code(instructions); + *assembly_size = next_inst_offset; + return (const unsigned *) store; +} + +} /* namespace brw */ -- 2.11.0