src/mesa/drivers/dri/i965/gen7_wm_state.c

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <stdbool.h>
  25 #include "brw_context.h"
  26 #include "brw_state.h"
  27 #include "brw_defines.h"
  28 #include "brw_util.h"
  29 #include "brw_wm.h"
  30 #include "program/program.h"
  31 #include "program/prog_parameter.h"
  32 #include "program/prog_statevars.h"
  33 #include "main/framebuffer.h"
  34 #include "intel_batchbuffer.h"
  35
  36 static void
  37 upload_wm_state(struct brw_context *brw)
  38 {
  39    struct gl_context *ctx = &brw->ctx;
  40    /* BRW_NEW_FRAGMENT_PROGRAM */
  41    const struct brw_fragment_program *fp =
  42       brw_fragment_program_const(brw->fragment_program);
  43    /* BRW_NEW_FS_PROG_DATA */
  44    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
  45    bool writes_depth = prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
  46    uint32_t dw1, dw2;
  47
  48    /* _NEW_BUFFERS */
  49    const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
  50
  51    dw1 = dw2 = 0;
  52    dw1 |= GEN7_WM_STATISTICS_ENABLE;
  53    dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
  54    dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
  55
  56    /* _NEW_LINE */
  57    if (ctx->Line.StippleFlag)
  58       dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE;
  59
  60    /* _NEW_POLYGON */
  61    if (ctx->Polygon.StippleFlag)
  62       dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE;
  63
  64    if (fp->program.Base.InputsRead & VARYING_BIT_POS)
  65       dw1 |= GEN7_WM_USES_SOURCE_DEPTH | GEN7_WM_USES_SOURCE_W;
  66
  67    dw1 |= prog_data->computed_depth_mode << GEN7_WM_COMPUTED_DEPTH_MODE_SHIFT;
  68    dw1 |= prog_data->barycentric_interp_modes <<
  69       GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
  70
  71    /* _NEW_COLOR, _NEW_MULTISAMPLE */
  72    /* Enable if the pixel shader kernel generates and outputs oMask.
  73     */
  74    if (prog_data->uses_kill || ctx->Color.AlphaEnabled ||
  75        ctx->Multisample.SampleAlphaToCoverage ||
  76        prog_data->uses_omask) {
  77       dw1 |= GEN7_WM_KILL_ENABLE;
  78    }
  79
  80    /* _NEW_BUFFERS | _NEW_COLOR */
  81    const bool active_fs_has_side_effects =
  82       _mesa_active_fragment_shader_has_side_effects(&brw->ctx);
  83    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
  84        active_fs_has_side_effects || dw1 & GEN7_WM_KILL_ENABLE) {
  85       dw1 |= GEN7_WM_DISPATCH_ENABLE;
  86    }
  87    if (multisampled_fbo) {
  88       /* _NEW_MULTISAMPLE */
  89       if (ctx->Multisample.Enabled)
  90          dw1 |= GEN7_WM_MSRAST_ON_PATTERN;
  91       else
  92          dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
  93
  94       if (_mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false) > 1)
  95          dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
  96       else
  97          dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
  98    } else {
  99       dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
 100       dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
 101    }
 102
 103    if (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
 104       dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK;
 105    }
 106
 107    /* BRW_NEW_FS_PROG_DATA */
 108    if (prog_data->early_fragment_tests)
 109       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
 110    else if (active_fs_has_side_effects)
 111       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 112
 113    /* The "UAV access enable" bits are unnecessary on HSW because they only
 114     * seem to have an effect on the HW-assisted coherency mechanism which we
 115     * don't need, and the rasterization-related UAV_ONLY flag and the
 116     * DISPATCH_ENABLE bit can be set independently from it.
 117     * C.f. gen8_upload_ps_extra().
 118     *
 119     * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | _NEW_COLOR
 120     */
 121    if (brw->is_haswell &&
 122        !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
 123        active_fs_has_side_effects)
 124       dw2 |= HSW_WM_UAV_ONLY;
 125
 126    BEGIN_BATCH(3);
 127    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
 128    OUT_BATCH(dw1);
 129    OUT_BATCH(dw2);
 130    ADVANCE_BATCH();
 131 }
 132
 133 const struct brw_tracked_state gen7_wm_state = {
 134    .dirty = {
 135       .mesa  = _NEW_BUFFERS |
 136                _NEW_COLOR |
 137                _NEW_LINE |
 138                _NEW_MULTISAMPLE |
 139                _NEW_POLYGON,
 140       .brw   = BRW_NEW_BATCH |
 141                BRW_NEW_FRAGMENT_PROGRAM |
 142                BRW_NEW_FS_PROG_DATA,
 143    },
 144    .emit = upload_wm_state,
 145 };
 146
 147 static void
 148 gen7_upload_ps_state(struct brw_context *brw,
 149                      const struct gl_fragment_program *fp,
 150                      const struct brw_stage_state *stage_state,
 151                      const struct brw_wm_prog_data *prog_data,
 152                      bool enable_dual_src_blend, unsigned sample_mask,
 153                      unsigned fast_clear_op)
 154 {
 155    struct gl_context *ctx = &brw->ctx;
 156    uint32_t dw2, dw4, dw5, ksp0, ksp2;
 157    const int max_threads_shift = brw->is_haswell ?
 158       HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;
 159
 160    dw2 = dw4 = dw5 = ksp2 = 0;
 161
 162    const unsigned sampler_count =
 163       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
 164    dw2 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT);
 165
 166    dw2 |= ((prog_data->base.binding_table.size_bytes / 4) <<
 167            GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
 168
 169    if (prog_data->base.use_alt_mode)
 170       dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
 171
 172    /* Haswell requires the sample mask to be set in this packet as well as
 173     * in 3DSTATE_SAMPLE_MASK; the values should match. */
 174    /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
 175    if (brw->is_haswell)
 176       dw4 |= SET_FIELD(sample_mask, HSW_PS_SAMPLE_MASK);
 177
 178    dw4 |= (brw->max_wm_threads - 1) << max_threads_shift;
 179
 180    if (prog_data->base.nr_params > 0)
 181       dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
 182
 183    /* From the IVB PRM, volume 2 part 1, page 287:
 184     * "This bit is inserted in the PS payload header and made available to
 185     * the DataPort (either via the message header or via header bypass) to
 186     * indicate that oMask data (one or two phases) is included in Render
 187     * Target Write messages. If present, the oMask data is used to mask off
 188     * samples."
 189     */
 190    if (prog_data->uses_omask)
 191       dw4 |= GEN7_PS_OMASK_TO_RENDER_TARGET;
 192
 193    /* From the IVB PRM, volume 2 part 1, page 287:
 194     * "If the PS kernel does not need the Position XY Offsets to
 195     * compute a Position Value, then this field should be programmed
 196     * to POSOFFSET_NONE."
 197     * "SW Recommendation: If the PS kernel needs the Position Offsets
 198     * to compute a Position XY value, this field should match Position
 199     * ZW Interpolation Mode to ensure a consistent position.xyzw
 200     * computation."
 201     * We only require XY sample offsets. So, this recommendation doesn't
 202     * look useful at the moment. We might need this in future.
 203     */
 204    if (prog_data->uses_pos_offset)
 205       dw4 |= GEN7_PS_POSOFFSET_SAMPLE;
 206    else
 207       dw4 |= GEN7_PS_POSOFFSET_NONE;
 208
 209    /* The hardware wedges if you have this bit set but don't turn on any dual
 210     * source blend factors.
 211     */
 212    if (enable_dual_src_blend)
 213       dw4 |= GEN7_PS_DUAL_SOURCE_BLEND_ENABLE;
 214
 215    /* BRW_NEW_FS_PROG_DATA */
 216    if (prog_data->num_varying_inputs != 0)
 217       dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;
 218
 219    /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
 220     * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
 221     * is successfully compiled. In majority of the cases that bring us
 222     * better performance than 'SIMD8 only' dispatch.
 223     */
 224    int min_inv_per_frag =
 225       _mesa_get_min_invocations_per_fragment(ctx, fp, false);
 226    assert(min_inv_per_frag >= 1);
 227
 228    if (prog_data->prog_offset_16 || prog_data->no_8) {
 229       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
 230       if (!prog_data->no_8 && min_inv_per_frag == 1) {
 231          dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
 232          dw5 |= (prog_data->base.dispatch_grf_start_reg <<
 233                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 234          dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
 235                  GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
 236          ksp0 = stage_state->prog_offset;
 237          ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
 238       } else {
 239          dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
 240                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 241          ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
 242       }
 243    }
 244    else {
 245       dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
 246       dw5 |= (prog_data->base.dispatch_grf_start_reg <<
 247               GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 248       ksp0 = stage_state->prog_offset;
 249    }
 250
 251    dw4 |= fast_clear_op;
 252
 253    BEGIN_BATCH(8);
 254    OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
 255    OUT_BATCH(ksp0);
 256    OUT_BATCH(dw2);
 257    if (prog_data->base.total_scratch) {
 258       OUT_RELOC(brw->wm.base.scratch_bo,
 259                 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 260                 ffs(prog_data->base.total_scratch) - 11);
 261    } else {
 262       OUT_BATCH(0);
 263    }
 264    OUT_BATCH(dw4);
 265    OUT_BATCH(dw5);
 266    OUT_BATCH(0); /* kernel 1 pointer */
 267    OUT_BATCH(ksp2);
 268    ADVANCE_BATCH();
 269 }
 270
 271 static void
 272 upload_ps_state(struct brw_context *brw)
 273 {
 274    /* BRW_NEW_FS_PROG_DATA */
 275    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
 276    const struct gl_context *ctx = &brw->ctx;
 277    /* BRW_NEW_FS_PROG_DATA | _NEW_COLOR */
 278    const bool enable_dual_src_blend = prog_data->dual_src_blend &&
 279                                       (ctx->Color.BlendEnabled & 1) &&
 280                                       ctx->Color.Blend[0]._UsesDualSrc;
 281    /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
 282    const unsigned sample_mask =
 283       brw->is_haswell ? gen6_determine_sample_mask(brw) : 0;
 284
 285    gen7_upload_ps_state(brw, brw->fragment_program, &brw->wm.base, prog_data,
 286                         enable_dual_src_blend, sample_mask,
 287                         brw->wm.fast_clear_op);
 288 }
 289
 290 const struct brw_tracked_state gen7_ps_state = {
 291    .dirty = {
 292       .mesa  = _NEW_BUFFERS |
 293                _NEW_COLOR |
 294                _NEW_MULTISAMPLE,
 295       .brw   = BRW_NEW_BATCH |
 296                BRW_NEW_FRAGMENT_PROGRAM |
 297                BRW_NEW_FS_PROG_DATA,
 298    },
 299    .emit = upload_ps_state,
 300 };