From 8c6d147373cbdefef5945b00626bb62bb03198ca Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Tue, 26 Jan 2016 10:30:39 +0100 Subject: [PATCH] i965/fs: support doubles with shared variable stores This is pretty much the same we do with SSBOs. v2: do not shuffle in-place, it is not safe since the original 64-bit data could be used after the write, instead use a temporary like we do for SSBO stores (Iago) Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 40 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 78e4f8415f1..d0ffff233a7 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -3125,6 +3125,29 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, /* Writemask */ unsigned writemask = instr->const_index[1]; + /* get_nir_src() retypes to integer. Be wary of 64-bit types though + * since the untyped writes below operate in units of 32-bits, which + * means that we need to write twice as many components each time. + * Also, we have to suffle 64-bit data to be in the appropriate layout + * expected by our 32-bit write messages. + */ + unsigned type_size = 4; + unsigned bit_size = instr->src[0].is_ssa ? + instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size; + if (bit_size == 64) { + type_size = 8; + fs_reg tmp = + fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type); + shuffle_64bit_data_for_32bit_write( + bld, + retype(tmp, BRW_REGISTER_TYPE_F), + retype(val_reg, BRW_REGISTER_TYPE_DF), + instr->num_components); + val_reg = tmp; + } + + unsigned type_slots = type_size / 4; + /* Combine groups of consecutive enabled channels in one write * message. We use ffs to find the first enabled channel and then ffs on * the bit-inverse, down-shifted writemask to determine the length of @@ -3133,22 +3156,29 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, while (writemask) { unsigned first_component = ffs(writemask) - 1; unsigned length = ffs(~(writemask >> first_component)) - 1; - fs_reg offset_reg; + /* We can't write more than 2 64-bit components at once. Limit the + * length of the write to what we can do and let the next iteration + * handle the rest + */ + if (type_size > 4) + length = MIN2(2, length); + + fs_reg offset_reg; nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); if (const_offset) { offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] + - 4 * first_component); + type_size * first_component); } else { offset_reg = vgrf(glsl_type::uint_type); bld.ADD(offset_reg, retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD), - brw_imm_ud(instr->const_index[0] + 4 * first_component)); + brw_imm_ud(instr->const_index[0] + type_size * first_component)); } emit_untyped_write(bld, surf_index, offset_reg, - offset(val_reg, bld, first_component), - 1 /* dims */, length, + offset(val_reg, bld, first_component * type_slots), + 1 /* dims */, length * type_slots, BRW_PREDICATE_NONE); /* Clear the bits in the writemask that we just wrote, then try -- 2.11.0