[android-x86/external-mesa.git] / src / intel / vulkan / genX_cmd_buffer.c
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28
29 #include "common/gen_l3_config.h"
30 #include "genxml/gen_macros.h"
31 #include "genxml/genX_pack.h"
32
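/* Helper: load a 32-bit MMIO register from memory.  Emits an
 * MI_LOAD_REGISTER_MEM that copies the DWord at the given BO offset into
 * the register at address `reg`.
 */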
33 static void
34 emit_lrm(struct anv_batch *batch,
35          uint32_t reg, struct anv_bo *bo, uint32_t offset)
36 {
37    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
38       lrm.RegisterAddress  = reg;
39       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
40    }
41 }
42
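/* Helper: write an immediate 32-bit value into an MMIO register via
 * MI_LOAD_REGISTER_IMM.
 */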
43 static void
44 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
45 {
46    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
47       lri.RegisterOffset   = reg;
48       lri.DataDWord        = imm;
49    }
50 }
51
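/* Emit STATE_BASE_ADDRESS for this command buffer, pointing the surface,
 * dynamic and instruction state bases at the driver's state pools, and
 * bracket it with the cache flush/invalidation the hardware needs in order
 * to pick up the new bases (see the comments below).
 */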
52 void
53 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
54 {
55    struct anv_device *device = cmd_buffer->device;
56
57 /* XXX: Do we need this on more than just BDW? */
58 #if (GEN_GEN >= 8)
59    /* Emit a render target cache flush.
60     *
61     * This isn't documented anywhere in the PRM.  However, it seems to be
62     * necessary prior to changing the surface state base address.  Without
63     * this, we get GPU hangs when using multi-level command buffers which
64     * clear depth, reset state base address, and then go render stuff.
65     */
66    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
67       pc.RenderTargetCacheFlushEnable = true;
68    }
69 #endif
70
71    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
72       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
73       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
74       sba.GeneralStateBaseAddressModifyEnable = true;
75
76       sba.SurfaceStateBaseAddress =
77          anv_cmd_buffer_surface_base_address(cmd_buffer);
78       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
79       sba.SurfaceStateBaseAddressModifyEnable = true;
80
81       sba.DynamicStateBaseAddress =
82          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
83       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
84       sba.DynamicStateBaseAddressModifyEnable = true;
85
86       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
87       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
88       sba.IndirectObjectBaseAddressModifyEnable = true;
89
90       sba.InstructionBaseAddress =
91          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
92       sba.InstructionMemoryObjectControlState = GENX(MOCS);
93       sba.InstructionBaseAddressModifyEnable = true;
94
95 #  if (GEN_GEN >= 8)
96       /* Broadwell requires that we specify a buffer size for a bunch of
97        * these fields.  However, since we will be growing the BOs live, we
98        * just set them all to the maximum.
99        */
100       sba.GeneralStateBufferSize                = 0xfffff;
101       sba.GeneralStateBufferSizeModifyEnable    = true;
102       sba.DynamicStateBufferSize                = 0xfffff;
103       sba.DynamicStateBufferSizeModifyEnable    = true;
104       sba.IndirectObjectBufferSize              = 0xfffff;
105       sba.IndirectObjectBufferSizeModifyEnable  = true;
106       sba.InstructionBufferSize                 = 0xfffff;
107       sba.InstructionBuffersizeModifyEnable     = true;
108 #  endif
109    }
110
111    /* After re-setting the surface state base address, we have to do some
112     * cache flushing so that the sampler engine will pick up the new
113     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
114     * Shared Function > 3D Sampler > State > State Caching (page 96):
115     *
116     *    Coherency with system memory in the state cache, like the texture
117     *    cache is handled partially by software. It is expected that the
118     *    command stream or shader will issue Cache Flush operation or
119     *    Cache_Flush sampler message to ensure that the L1 cache remains
120     *    coherent with system memory.
121     *
122     *    [...]
123     *
124     *    Whenever the value of the Dynamic_State_Base_Addr,
125     *    Surface_State_Base_Addr are altered, the L1 state cache must be
126     *    invalidated to ensure the new surface or sampler state is fetched
127     *    from system memory.
128     *
129     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
130     * which, according to the PIPE_CONTROL instruction documentation in the
131     * Broadwell PRM:
132     *
133     *    Setting this bit is independent of any other bit in this packet.
134     *    This bit controls the invalidation of the L1 and L2 state caches
135     *    at the top of the pipe i.e. at the parsing time.
136     *
137     * Unfortunately, experimentation seems to indicate that state cache
138     * invalidation through a PIPE_CONTROL does nothing whatsoever in
139     * regards to surface state and binding tables.  Instead, it seems that
140     * invalidating the texture cache is what is actually needed.
141     *
142     * XXX:  As far as we have been able to determine through
143     * experimentation, flushing the texture cache appears to be
144     * sufficient.  The theory here is that all of the sampling/rendering
145     * units cache the binding table in the texture cache.  However, we have
146     * yet to be able to actually confirm this.
147     */
148    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
149       pc.TextureCacheInvalidationEnable = true;
150    }
151 }
152
153 VkResult
154 genX(BeginCommandBuffer)(
155     VkCommandBuffer                             commandBuffer,
156     const VkCommandBufferBeginInfo*             pBeginInfo)
157 {
158    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
159
160    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
161     * command buffer's state. Otherwise, we must *reset* its state. In both
162     * cases we reset it.
163     *
164     * From the Vulkan 1.0 spec:
165     *
166     *    If a command buffer is in the executable state and the command buffer
167     *    was allocated from a command pool with the
168     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
169     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
170     *    as if vkResetCommandBuffer had been called with
171     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
172     *    the command buffer in the recording state.
173     */
174    anv_cmd_buffer_reset(cmd_buffer);
175
176    cmd_buffer->usage_flags = pBeginInfo->flags;
177
178    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
179           !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
180
181    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
182
183    if (cmd_buffer->usage_flags &
184        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
185       cmd_buffer->state.framebuffer =
186          anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
187       cmd_buffer->state.pass =
188          anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
189       cmd_buffer->state.subpass =
190          &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
191
192       cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
193    }
194
195    return VK_SUCCESS;
196 }
197
198 VkResult
199 genX(EndCommandBuffer)(
200     VkCommandBuffer                             commandBuffer)
201 {
202    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
203    struct anv_device *device = cmd_buffer->device;
204
205    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
206
207    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
208       /* The algorithm used to compute the validate list is not thread-safe as
209        * it uses the bo->index field.  We have to lock the device around it.
210        * Fortunately, the chances for contention here are probably very low.
211        */
212       pthread_mutex_lock(&device->mutex);
213       anv_cmd_buffer_prepare_execbuf(cmd_buffer);
214       pthread_mutex_unlock(&device->mutex);
215    }
216
217    return VK_SUCCESS;
218 }
219
220 void
221 genX(CmdExecuteCommands)(
222     VkCommandBuffer                             commandBuffer,
223     uint32_t                                    commandBufferCount,
224     const VkCommandBuffer*                      pCmdBuffers)
225 {
226    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
227
228    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
229
230    for (uint32_t i = 0; i < commandBufferCount; i++) {
231       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
232
233       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
234
235       anv_cmd_buffer_add_secondary(primary, secondary);
236    }
237
238    /* Each of the secondary command buffers will use its own state base
239     * address.  We need to re-emit state base address for the primary after
240     * all of the secondaries are done.
241     *
242     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
243     * address calls?
244     */
245    genX(cmd_buffer_emit_state_base_address)(primary);
246 }
247
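/* Per-platform default values for the SQGHPCI fields of L3SQCREG1.  The
 * anv_pack_struct() call below only packs the cache-conversion bits, so the
 * matching default is OR'ed into the register value afterwards.
 */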
248 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
249 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
250 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
251
252 /**
253  * Program the hardware to use the specified L3 configuration.
254  */
255 void
256 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
257                            const struct gen_l3_config *cfg)
258 {
259    assert(cfg);
260    if (cfg == cmd_buffer->state.current_l3_config)
261       return;
262
263    if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
264       fprintf(stderr, "L3 config transition: ");
265       gen_dump_l3_config(cfg, stderr);
266    }
267
268    const bool has_slm = cfg->n[GEN_L3P_SLM];
269
270    /* According to the hardware docs, the L3 partitioning can only be changed
271     * while the pipeline is completely drained and the caches are flushed,
272     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
273     */
274    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
275       pc.DCFlushEnable = true;
276       pc.PostSyncOperation = NoWrite;
277       pc.CommandStreamerStallEnable = true;
278    }
279
280    /* ...followed by a second pipelined PIPE_CONTROL that initiates
281     * invalidation of the relevant caches.  Note that because RO invalidation
282     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
283     * command is processed by the CS) we cannot combine it with the previous
284     * stalling flush as the hardware documentation suggests, because that
285     * would cause the CS to stall on previous rendering *after* RO
286     * invalidation and wouldn't prevent the RO caches from being polluted by
287     * concurrent rendering before the stall completes.  This intentionally
288     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
289     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
290     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
291     * already guarantee that there is no concurrent GPGPU kernel execution
292     * (see SKL HSD 2132585).
293     */
294    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
295       pc.TextureCacheInvalidationEnable = true;
296       pc.ConstantCacheInvalidationEnable = true;
297       pc.InstructionCacheInvalidateEnable = true;
298       pc.StateCacheInvalidationEnable = true;
299       pc.PostSyncOperation = NoWrite;
300    }
301
302    /* Now send a third stalling flush to make sure that invalidation is
303     * complete when the L3 configuration registers are modified.
304     */
305    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
306       pc.DCFlushEnable = true;
307       pc.PostSyncOperation = NoWrite;
308       pc.CommandStreamerStallEnable = true;
309    }
310
311 #if GEN_GEN >= 8
312
313    assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
314
315    uint32_t l3cr;
316    anv_pack_struct(&l3cr, GENX(L3CNTLREG),
317                    .SLMEnable = has_slm,
318                    .URBAllocation = cfg->n[GEN_L3P_URB],
319                    .ROAllocation = cfg->n[GEN_L3P_RO],
320                    .DCAllocation = cfg->n[GEN_L3P_DC],
321                    .AllAllocation = cfg->n[GEN_L3P_ALL]);
322
323    /* Set up the L3 partitioning. */
324    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
325
326 #else
327
328    const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
329    const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
330                        cfg->n[GEN_L3P_ALL];
331    const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
332                       cfg->n[GEN_L3P_ALL];
333    const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
334                       cfg->n[GEN_L3P_ALL];
335
336    assert(!cfg->n[GEN_L3P_ALL]);
337
338    /* When enabled, SLM only uses a portion of the L3 on half of the banks;
339     * the matching space on the remaining banks has to be allocated to a
340     * client (URB for all validated configurations) set to the
341     * lower-bandwidth 2-bank address hashing mode.
342     */
343    const struct gen_device_info *devinfo = &cmd_buffer->device->info;
344    const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
345    assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
346
347    /* Minimum number of ways that can be allocated to the URB. */
348    const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
349    assert(cfg->n[GEN_L3P_URB] >= n0_urb);
350
351    uint32_t l3sqcr1, l3cr2, l3cr3;
352    anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1),
353                    .ConvertDC_UC = !has_dc,
354                    .ConvertIS_UC = !has_is,
355                    .ConvertC_UC = !has_c,
356                    .ConvertT_UC = !has_t);
357    l3sqcr1 |=
358       GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
359       devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
360       IVB_L3SQCREG1_SQGHPCI_DEFAULT;
361
362    anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
363                    .SLMEnable = has_slm,
364                    .URBLowBandwidth = urb_low_bw,
365                    .URBAllocation = cfg->n[GEN_L3P_URB],
366 #if !GEN_IS_HASWELL
367                    .ALLAllocation = cfg->n[GEN_L3P_ALL],
368 #endif
369                    .ROAllocation = cfg->n[GEN_L3P_RO],
370                    .DCAllocation = cfg->n[GEN_L3P_DC]);
371
372    anv_pack_struct(&l3cr3, GENX(L3CNTLREG3),
373                    .ISAllocation = cfg->n[GEN_L3P_IS],
374                    .ISLowBandwidth = 0,
375                    .CAllocation = cfg->n[GEN_L3P_C],
376                    .CLowBandwidth = 0,
377                    .TAllocation = cfg->n[GEN_L3P_T],
378                    .TLowBandwidth = 0);
379
380    /* Set up the L3 partitioning. */
381    emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1);
382    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2);
383    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
384
385 #if GEN_IS_HASWELL
386    if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
387       /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
388        * them disabled to avoid crashing the system hard.
389        */
390       uint32_t scratch1, chicken3;
391       anv_pack_struct(&scratch1, GENX(SCRATCH1),
392                       .L3AtomicDisable = !has_dc);
393       anv_pack_struct(&chicken3, GENX(CHICKEN3),
394                       .L3AtomicDisableMask = true,
395                       .L3AtomicDisable = !has_dc);
396       emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1);
397       emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3);
398    }
399 #endif
400
401 #endif
402
403    cmd_buffer->state.current_l3_config = cfg;
404 }
405
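/* Turn the accumulated anv_pipe_bits into actual PIPE_CONTROLs: at most one
 * flush/stall packet followed by at most one invalidation packet.  Any bits
 * that cannot be resolved yet are left in pending_pipe_bits.
 */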
406 void
407 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
408 {
409    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
410
411    /* Flushes are pipelined while invalidations are handled immediately.
412     * Therefore, if we're flushing anything then we need to schedule a stall
413     * before any invalidations can happen.
414     */
415    if (bits & ANV_PIPE_FLUSH_BITS)
416       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
417
418    /* If we're going to do an invalidate and we have a pending CS stall that
419     * has yet to be resolved, we do the CS stall now.
420     */
421    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
422        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
423       bits |= ANV_PIPE_CS_STALL_BIT;
424       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
425    }
426
427    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
428       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
429          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
430          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
431          pipe.RenderTargetCacheFlushEnable =
432             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
433
434          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
435          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
436          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
437
438          /*
439           * According to the Broadwell documentation, any PIPE_CONTROL with the
440           * "Command Streamer Stall" bit set must also have another bit set,
441           * with one of the following options:
442           *
443           *  - Render Target Cache Flush
444           *  - Depth Cache Flush
445           *  - Stall at Pixel Scoreboard
446           *  - Post-Sync Operation
447           *  - Depth Stall
448           *  - DC Flush Enable
449           *
450           * I chose "Stall at Pixel Scoreboard" since that's what we use in
451           * mesa and it seems to work fine. The choice is fairly arbitrary.
452           */
453          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
454              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
455                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
456             pipe.StallAtPixelScoreboard = true;
457       }
458
459       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
460    }
461
462    if (bits & ANV_PIPE_INVALIDATE_BITS) {
463       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
464          pipe.StateCacheInvalidationEnable =
465             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
466          pipe.ConstantCacheInvalidationEnable =
467             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
468          pipe.VFCacheInvalidationEnable =
469             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
470          pipe.TextureCacheInvalidationEnable =
471             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
472          pipe.InstructionCacheInvalidateEnable =
473             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
474       }
475
476       bits &= ~ANV_PIPE_INVALIDATE_BITS;
477    }
478
479    cmd_buffer->state.pending_pipe_bits = bits;
480 }
481
482 void genX(CmdPipelineBarrier)(
483     VkCommandBuffer                             commandBuffer,
484     VkPipelineStageFlags                        srcStageMask,
485     VkPipelineStageFlags                        destStageMask,
486     VkBool32                                    byRegion,
487     uint32_t                                    memoryBarrierCount,
488     const VkMemoryBarrier*                      pMemoryBarriers,
489     uint32_t                                    bufferMemoryBarrierCount,
490     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
491     uint32_t                                    imageMemoryBarrierCount,
492     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
493 {
494    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
495    uint32_t b;
496
497    /* XXX: Right now, we're really dumb and just flush whatever categories
498     * the app asks for.  One of these days we may make this a bit better
499     * but right now that's all the hardware allows for in most areas.
500     */
501    VkAccessFlags src_flags = 0;
502    VkAccessFlags dst_flags = 0;
503
504    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
505       src_flags |= pMemoryBarriers[i].srcAccessMask;
506       dst_flags |= pMemoryBarriers[i].dstAccessMask;
507    }
508
509    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
510       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
511       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
512    }
513
514    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
515       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
516       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
517    }
518
519    enum anv_pipe_bits pipe_bits = 0;
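   /* Translate the source (write) accesses into cache flushes and the
    * destination (read) accesses into cache invalidations.  The actual
    * PIPE_CONTROLs are emitted later by cmd_buffer_apply_pipe_flushes().
    */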
520
521    for_each_bit(b, src_flags) {
522       switch ((VkAccessFlagBits)(1 << b)) {
523       case VK_ACCESS_SHADER_WRITE_BIT:
524          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
525          break;
526       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
527          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
528          break;
529       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
530          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
531          break;
532       case VK_ACCESS_TRANSFER_WRITE_BIT:
533          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
534          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
535          break;
536       default:
537          break; /* Nothing to do */
538       }
539    }
540
541    for_each_bit(b, dst_flags) {
542       switch ((VkAccessFlagBits)(1 << b)) {
543       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
544       case VK_ACCESS_INDEX_READ_BIT:
545       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
546          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
547          break;
548       case VK_ACCESS_UNIFORM_READ_BIT:
549          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
550          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
551          break;
552       case VK_ACCESS_SHADER_READ_BIT:
553       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
554       case VK_ACCESS_TRANSFER_READ_BIT:
555          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
556          break;
557       default:
558          break; /* Nothing to do */
559       }
560    }
561
562    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
563 }
564
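/* Carve the available push constant space (16KB or 32KB depending on the
 * platform) up between the active graphics stages and emit the corresponding
 * 3DSTATE_PUSH_CONSTANT_ALLOC_* packets.
 */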
565 static void
566 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
567 {
568    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
569
570    /* In order to avoid thrashing, we assume that vertex and fragment stages
571     * always exist.  In the rare case where one is missing *and* the other
572     * uses push constants, this may be suboptimal.  However, avoiding stalls
573     * seems more important.
574     */
575    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
576
577    if (stages == cmd_buffer->state.push_constant_stages)
578       return;
579
580 #if GEN_GEN >= 8
581    const unsigned push_constant_kb = 32;
582 #elif GEN_IS_HASWELL
583    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
584 #else
585    const unsigned push_constant_kb = 16;
586 #endif
587
588    const unsigned num_stages =
589       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
590    unsigned size_per_stage = push_constant_kb / num_stages;
591
592    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
593     * units of 2KB.  Incidentally, these are the same platforms that have
594     * 32KB worth of push constant space.
595     */
596    if (push_constant_kb == 32)
597       size_per_stage &= ~1u;
598
599    uint32_t kb_used = 0;
600    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
601       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
602       anv_batch_emit(&cmd_buffer->batch,
603                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
604          alloc._3DCommandSubOpcode  = 18 + i;
605          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
606          alloc.ConstantBufferSize   = push_size;
607       }
608       kb_used += push_size;
609    }
610
611    anv_batch_emit(&cmd_buffer->batch,
612                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
613       alloc.ConstantBufferOffset = kb_used;
614       alloc.ConstantBufferSize = push_constant_kb - kb_used;
615    }
616
617    cmd_buffer->state.push_constant_stages = stages;
618
619    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
620     *
621     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
622     *    the next 3DPRIMITIVE command after programming the
623     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
624     *
625     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
626     * pipeline setup, we need to dirty push constants.
627     */
628    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
629 }
630
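/* Emit 3DSTATE_SAMPLER_STATE_POINTERS_* and 3DSTATE_BINDING_TABLE_POINTERS_*
 * for each stage in `stages`.  All of these packets share the layout of the
 * VS variant; the tables below supply the per-stage _3DCommandSubOpcode.
 */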
631 static void
632 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
633                                     uint32_t stages)
634 {
635    static const uint32_t sampler_state_opcodes[] = {
636       [MESA_SHADER_VERTEX]                      = 43,
637       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
638       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
639       [MESA_SHADER_GEOMETRY]                    = 46,
640       [MESA_SHADER_FRAGMENT]                    = 47,
641       [MESA_SHADER_COMPUTE]                     = 0,
642    };
643
644    static const uint32_t binding_table_opcodes[] = {
645       [MESA_SHADER_VERTEX]                      = 38,
646       [MESA_SHADER_TESS_CTRL]                   = 39,
647       [MESA_SHADER_TESS_EVAL]                   = 40,
648       [MESA_SHADER_GEOMETRY]                    = 41,
649       [MESA_SHADER_FRAGMENT]                    = 42,
650       [MESA_SHADER_COMPUTE]                     = 0,
651    };
652
653    anv_foreach_stage(s, stages) {
654       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
655          anv_batch_emit(&cmd_buffer->batch,
656                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
657             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
658             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
659          }
660       }
661
662       /* Always emit binding table pointers if we're asked to, since on SKL
663        * this is what flushes push constants. */
664       anv_batch_emit(&cmd_buffer->batch,
665                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
666          btp._3DCommandSubOpcode = binding_table_opcodes[s];
667          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
668       }
669    }
670 }
671
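/* Emit 3DSTATE_CONSTANT_* for every dirty graphics stage and return the
 * VkShaderStageFlags mask of stages that were actually flushed.
 */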
672 static uint32_t
673 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
674 {
675    static const uint32_t push_constant_opcodes[] = {
676       [MESA_SHADER_VERTEX]                      = 21,
677       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
678       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
679       [MESA_SHADER_GEOMETRY]                    = 22,
680       [MESA_SHADER_FRAGMENT]                    = 23,
681       [MESA_SHADER_COMPUTE]                     = 0,
682    };
683
684    VkShaderStageFlags flushed = 0;
685
686    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
687       if (stage == MESA_SHADER_COMPUTE)
688          continue;
689
690       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
691
692       if (state.offset == 0) {
693          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
694             c._3DCommandSubOpcode = push_constant_opcodes[stage];
695       } else {
696          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
697             c._3DCommandSubOpcode = push_constant_opcodes[stage];
698             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
699 #if GEN_GEN >= 9
700                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
701                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
702 #else
703                .PointerToConstantBuffer0 = { .offset = state.offset },
704                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
705 #endif
706             };
707          }
708       }
709
710       flushed |= mesa_to_vk_shader_stage(stage);
711    }
712
713    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
714
715    return flushed;
716 }
717
718 void
719 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
720 {
721    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
722    uint32_t *p;
723
724    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
725
726    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
727
728    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
729
730    genX(flush_pipeline_select_3d)(cmd_buffer);
731
732    if (vb_emit) {
733       const uint32_t num_buffers = __builtin_popcount(vb_emit);
734       const uint32_t num_dwords = 1 + num_buffers * 4;
735
736       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
737                           GENX(3DSTATE_VERTEX_BUFFERS));
738       uint32_t vb, i = 0;
739       for_each_bit(vb, vb_emit) {
740          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
741          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
742
743          struct GENX(VERTEX_BUFFER_STATE) state = {
744             .VertexBufferIndex = vb,
745
746 #if GEN_GEN >= 8
747             .MemoryObjectControlState = GENX(MOCS),
748 #else
749             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
750             .InstanceDataStepRate = 1,
751             .VertexBufferMemoryObjectControlState = GENX(MOCS),
752 #endif
753
754             .AddressModifyEnable = true,
755             .BufferPitch = pipeline->binding_stride[vb],
756             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
757
758 #if GEN_GEN >= 8
759             .BufferSize = buffer->size - offset
760 #else
761             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
762 #endif
763          };
764
765          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
766          i++;
767       }
768    }
769
770    cmd_buffer->state.vb_dirty &= ~vb_emit;
771
772    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
773       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
774
775       /* The exact descriptor layout is pulled from the pipeline, so we need
776        * to re-emit binding tables on every pipeline change.
777        */
778       cmd_buffer->state.descriptors_dirty |=
779          cmd_buffer->state.pipeline->active_stages;
780
781       /* If the pipeline changed, we may need to re-allocate push constant
782        * space in the URB.
783        */
784       cmd_buffer_alloc_push_constants(cmd_buffer);
785    }
786
787 #if GEN_GEN <= 7
788    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
789        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
790       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
791        *
792        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
793        *    stall needs to be sent just prior to any 3DSTATE_VS,
794        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
795        *    3DSTATE_BINDING_TABLE_POINTER_VS,
796        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
797        *    PIPE_CONTROL needs to be sent before any combination of VS
798        *    associated 3DSTATE."
799        */
800       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
801          pc.DepthStallEnable  = true;
802          pc.PostSyncOperation = WriteImmediateData;
803          pc.Address           =
804             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
805       }
806    }
807 #endif
808
809    /* Render targets live in the same binding table as fragment descriptors */
810    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
811       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
812
813    /* We emit the binding tables and sampler tables first, then emit push
814     * constants and then finally emit binding table and sampler table
815     * pointers.  It has to happen in this order, since emitting the binding
816     * tables may change the push constants (in case of storage images). After
817     * emitting push constants, on SKL+ we have to emit the corresponding
818     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
819     */
820    uint32_t dirty = 0;
821    if (cmd_buffer->state.descriptors_dirty)
822       dirty = anv_cmd_buffer_flush_descriptor_sets(cmd_buffer);
823
824    if (cmd_buffer->state.push_constants_dirty) {
825 #if GEN_GEN >= 9
826       /* On Sky Lake and later, the binding table pointers commands are
827        * what actually flush the changes to push constant state, so we need
828        * to dirty them here so that they get re-emitted below.
829        */
830       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
831 #else
832       cmd_buffer_flush_push_constants(cmd_buffer);
833 #endif
834    }
835
836    if (dirty)
837       cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
838
839    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
840       gen8_cmd_buffer_emit_viewport(cmd_buffer);
841
842    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
843                                   ANV_CMD_DIRTY_PIPELINE)) {
844       gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
845                                           pipeline->depth_clamp_enable);
846    }
847
848    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
849       gen7_cmd_buffer_emit_scissor(cmd_buffer);
850
851    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
852
853    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
854 }
855
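/* Bind an 8-byte vertex buffer at index 32 (reserved for this by the driver)
 * holding the base vertex and base instance values, so that the vertex
 * shader can read them as ordinary vertex data.
 */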
856 static void
857 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
858                              struct anv_bo *bo, uint32_t offset)
859 {
860    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
861                                  GENX(3DSTATE_VERTEX_BUFFERS));
862
863    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
864       &(struct GENX(VERTEX_BUFFER_STATE)) {
865          .VertexBufferIndex = 32, /* Reserved for this */
866          .AddressModifyEnable = true,
867          .BufferPitch = 0,
868 #if (GEN_GEN >= 8)
869          .MemoryObjectControlState = GENX(MOCS),
870          .BufferStartingAddress = { bo, offset },
871          .BufferSize = 8
872 #else
873          .VertexBufferMemoryObjectControlState = GENX(MOCS),
874          .BufferStartingAddress = { bo, offset },
875          .EndAddress = { bo, offset + 8 },
876 #endif
877       });
878 }
879
880 static void
881 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
882                           uint32_t base_vertex, uint32_t base_instance)
883 {
884    struct anv_state id_state =
885       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
886
887    ((uint32_t *)id_state.map)[0] = base_vertex;
888    ((uint32_t *)id_state.map)[1] = base_instance;
889
890    if (!cmd_buffer->device->info.has_llc)
891       anv_state_clflush(id_state);
892
893    emit_base_vertex_instance_bo(cmd_buffer,
894       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
895 }
896
897 void genX(CmdDraw)(
898     VkCommandBuffer                             commandBuffer,
899     uint32_t                                    vertexCount,
900     uint32_t                                    instanceCount,
901     uint32_t                                    firstVertex,
902     uint32_t                                    firstInstance)
903 {
904    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
905    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
906    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
907
908    genX(cmd_buffer_flush_state)(cmd_buffer);
909
910    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
911       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
912
913    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
914       prim.VertexAccessType         = SEQUENTIAL;
915       prim.PrimitiveTopologyType    = pipeline->topology;
916       prim.VertexCountPerInstance   = vertexCount;
917       prim.StartVertexLocation      = firstVertex;
918       prim.InstanceCount            = instanceCount;
919       prim.StartInstanceLocation    = firstInstance;
920       prim.BaseVertexLocation       = 0;
921    }
922 }
923
924 void genX(CmdDrawIndexed)(
925     VkCommandBuffer                             commandBuffer,
926     uint32_t                                    indexCount,
927     uint32_t                                    instanceCount,
928     uint32_t                                    firstIndex,
929     int32_t                                     vertexOffset,
930     uint32_t                                    firstInstance)
931 {
932    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
933    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
934    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
935
936    genX(cmd_buffer_flush_state)(cmd_buffer);
937
938    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
939       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
940
941    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
942       prim.VertexAccessType         = RANDOM;
943       prim.PrimitiveTopologyType    = pipeline->topology;
944       prim.VertexCountPerInstance   = indexCount;
945       prim.StartVertexLocation      = firstIndex;
946       prim.InstanceCount            = instanceCount;
947       prim.StartInstanceLocation    = firstInstance;
948       prim.BaseVertexLocation       = vertexOffset;
949    }
950 }
951
952 /* Auto-Draw / Indirect Registers */
953 #define GEN7_3DPRIM_END_OFFSET          0x2420
954 #define GEN7_3DPRIM_START_VERTEX        0x2430
955 #define GEN7_3DPRIM_VERTEX_COUNT        0x2434
956 #define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
957 #define GEN7_3DPRIM_START_INSTANCE      0x243C
958 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
959
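/* Indirect draws load the VkDrawIndirectCommand fields from the buffer
 * straight into the 3DPRIM_* registers above with MI_LOAD_REGISTER_MEM and
 * then issue a 3DPRIMITIVE with Indirect Parameter Enable set.
 */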
960 void genX(CmdDrawIndirect)(
961     VkCommandBuffer                             commandBuffer,
962     VkBuffer                                    _buffer,
963     VkDeviceSize                                offset,
964     uint32_t                                    drawCount,
965     uint32_t                                    stride)
966 {
967    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
968    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
969    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
970    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
971    struct anv_bo *bo = buffer->bo;
972    uint32_t bo_offset = buffer->offset + offset;
973
974    genX(cmd_buffer_flush_state)(cmd_buffer);
975
976    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
977       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
978
979    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
980    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
981    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
982    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
983    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
984
985    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
986       prim.IndirectParameterEnable  = true;
987       prim.VertexAccessType         = SEQUENTIAL;
988       prim.PrimitiveTopologyType    = pipeline->topology;
989    }
990 }
991
992 void genX(CmdDrawIndexedIndirect)(
993     VkCommandBuffer                             commandBuffer,
994     VkBuffer                                    _buffer,
995     VkDeviceSize                                offset,
996     uint32_t                                    drawCount,
997     uint32_t                                    stride)
998 {
999    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1000    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1001    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1002    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1003    struct anv_bo *bo = buffer->bo;
1004    uint32_t bo_offset = buffer->offset + offset;
1005
1006    genX(cmd_buffer_flush_state)(cmd_buffer);
1007
1008    /* TODO: We need to stomp base vertex to 0 somehow */
1009    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1010       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
1011
1012    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1013    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1014    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1015    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
1016    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
1017
1018    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1019       prim.IndirectParameterEnable  = true;
1020       prim.VertexAccessType         = RANDOM;
1021       prim.PrimitiveTopologyType    = pipeline->topology;
1022    }
1023 }
1024
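/* Emit the compute-side descriptor state: samplers, binding table, CURBE
 * push constants, and the INTERFACE_DESCRIPTOR_DATA that ties them to the
 * compute shader for subsequent GPGPU_WALKER commands.
 */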
1025 static VkResult
1026 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
1027 {
1028    struct anv_device *device = cmd_buffer->device;
1029    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1030    struct anv_state surfaces = { 0, }, samplers = { 0, };
1031    VkResult result;
1032
1033    result = anv_cmd_buffer_emit_samplers(cmd_buffer,
1034                                          MESA_SHADER_COMPUTE, &samplers);
1035    if (result != VK_SUCCESS)
1036       return result;
1037    result = anv_cmd_buffer_emit_binding_table(cmd_buffer,
1038                                               MESA_SHADER_COMPUTE, &surfaces);
1039    if (result != VK_SUCCESS)
1040       return result;
1041
1042    struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer);
1043
1044    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
1045    const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
1046
1047    if (push_state.alloc_size) {
1048       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
1049          curbe.CURBETotalDataLength    = push_state.alloc_size;
1050          curbe.CURBEDataStartAddress   = push_state.offset;
1051       }
1052    }
1053
1054    const uint32_t slm_size = encode_slm_size(GEN_GEN, prog_data->total_shared);
1055
1056    struct anv_state state =
1057       anv_state_pool_emit(&device->dynamic_state_pool,
1058                           GENX(INTERFACE_DESCRIPTOR_DATA), 64,
1059                           .KernelStartPointer = pipeline->cs_simd,
1060                           .BindingTablePointer = surfaces.offset,
1061                           .BindingTableEntryCount = 0,
1062                           .SamplerStatePointer = samplers.offset,
1063                           .SamplerCount = 0,
1064 #if !GEN_IS_HASWELL
1065                           .ConstantURBEntryReadOffset = 0,
1066 #endif
1067                           .ConstantURBEntryReadLength =
1068                              cs_prog_data->push.per_thread.regs,
1069 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1070                           .CrossThreadConstantDataReadLength =
1071                              cs_prog_data->push.cross_thread.regs,
1072 #endif
1073                           .BarrierEnable = cs_prog_data->uses_barrier,
1074                           .SharedLocalMemorySize = slm_size,
1075                           .NumberofThreadsinGPGPUThreadGroup =
1076                              cs_prog_data->threads);
1077
1078    uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
1079    anv_batch_emit(&cmd_buffer->batch,
1080                   GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
1081       mid.InterfaceDescriptorTotalLength        = size;
1082       mid.InterfaceDescriptorDataStartAddress   = state.offset;
1083    }
1084
1085    return VK_SUCCESS;
1086 }
1087
1088 void
1089 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
1090 {
1091    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1092    MAYBE_UNUSED VkResult result;
1093
1094    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1095
1096    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1097
1098    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1099
1100    if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)
1101       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1102
1103    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
1104        (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
1105       /* FIXME: figure out descriptors for gen7 */
1106       result = flush_compute_descriptor_set(cmd_buffer);
1107       assert(result == VK_SUCCESS);
1108       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
1109    }
1110
1111    cmd_buffer->state.compute_dirty = 0;
1112
1113    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1114 }
1115
1116 #if GEN_GEN == 7
1117
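/* The kernel's command parser gates which privileged registers a userspace
 * batch may write on gen7.  Reject the call with a useful error if the
 * installed parser is too old for the requested feature.
 */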
1118 static bool
1119 verify_cmd_parser(const struct anv_device *device,
1120                   int required_version,
1121                   const char *function)
1122 {
1123    if (device->instance->physicalDevice.cmd_parser_version < required_version) {
1124       vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
1125                 "cmd parser version %d is required for %s",
1126                 required_version, function);
1127       return false;
1128    } else {
1129       return true;
1130    }
1131 }
1132
1133 #endif
1134
1135 void genX(CmdDispatch)(
1136     VkCommandBuffer                             commandBuffer,
1137     uint32_t                                    x,
1138     uint32_t                                    y,
1139     uint32_t                                    z)
1140 {
1141    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1142    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1143    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1144
1145    if (prog_data->uses_num_work_groups) {
1146       struct anv_state state =
1147          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
1148       uint32_t *sizes = state.map;
1149       sizes[0] = x;
1150       sizes[1] = y;
1151       sizes[2] = z;
1152       if (!cmd_buffer->device->info.has_llc)
1153          anv_state_clflush(state);
1154       cmd_buffer->state.num_workgroups_offset = state.offset;
1155       cmd_buffer->state.num_workgroups_bo =
1156          &cmd_buffer->device->dynamic_state_block_pool.bo;
1157    }
1158
1159    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1160
1161    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
1162       ggw.SIMDSize                     = prog_data->simd_size / 16;
1163       ggw.ThreadDepthCounterMaximum    = 0;
1164       ggw.ThreadHeightCounterMaximum   = 0;
1165       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1166       ggw.ThreadGroupIDXDimension      = x;
1167       ggw.ThreadGroupIDYDimension      = y;
1168       ggw.ThreadGroupIDZDimension      = z;
1169       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1170       ggw.BottomExecutionMask          = 0xffffffff;
1171    }
1172
1173    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
1174 }
1175
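/* MMIO registers consumed by GPGPU_WALKER when Indirect Parameter Enable is
 * set, plus the MI_PREDICATE source registers used for the empty-dispatch
 * predication below.
 */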
1176 #define GPGPU_DISPATCHDIMX 0x2500
1177 #define GPGPU_DISPATCHDIMY 0x2504
1178 #define GPGPU_DISPATCHDIMZ 0x2508
1179
1180 #define MI_PREDICATE_SRC0  0x2400
1181 #define MI_PREDICATE_SRC1  0x2408
1182
1183 void genX(CmdDispatchIndirect)(
1184     VkCommandBuffer                             commandBuffer,
1185     VkBuffer                                    _buffer,
1186     VkDeviceSize                                offset)
1187 {
1188    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1189    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1190    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1191    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1192    struct anv_bo *bo = buffer->bo;
1193    uint32_t bo_offset = buffer->offset + offset;
1194    struct anv_batch *batch = &cmd_buffer->batch;
1195
1196 #if GEN_GEN == 7
1197    /* Linux 4.4 added command parser version 5 which allows the GPGPU
1198     * indirect dispatch registers to be written.
1199     */
1200    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
1201       return;
1202 #endif
1203
1204    if (prog_data->uses_num_work_groups) {
1205       cmd_buffer->state.num_workgroups_offset = bo_offset;
1206       cmd_buffer->state.num_workgroups_bo = bo;
1207    }
1208
1209    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1210
1211    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
1212    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
1213    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
1214
1215 #if GEN_GEN <= 7
1216    /* Clear the upper 32 bits of SRC0 and all 64 bits of SRC1 */
1217    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
1218    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
1219    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
1220
1221    /* Load compute_dispatch_indirect_x_size into SRC0 */
1222    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
1223
1224    /* predicate = (compute_dispatch_indirect_x_size == 0); */
1225    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1226       mip.LoadOperation    = LOAD_LOAD;
1227       mip.CombineOperation = COMBINE_SET;
1228       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1229    }
1230
1231    /* Load compute_dispatch_indirect_y_size into SRC0 */
1232    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
1233
1234    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
1235    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1236       mip.LoadOperation    = LOAD_LOAD;
1237       mip.CombineOperation = COMBINE_OR;
1238       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1239    }
1240
1241    /* Load compute_dispatch_indirect_z_size into SRC0 */
1242    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
1243
1244    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
1245    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1246       mip.LoadOperation    = LOAD_LOAD;
1247       mip.CombineOperation = COMBINE_OR;
1248       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1249    }
1250
1251    /* predicate = !predicate; */
1252 #define COMPARE_FALSE                           1
1253    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1254       mip.LoadOperation    = LOAD_LOADINV;
1255       mip.CombineOperation = COMBINE_OR;
1256       mip.CompareOperation = COMPARE_FALSE;
1257    }
1258 #endif
1259
1260    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
1261       ggw.IndirectParameterEnable      = true;
1262       ggw.PredicateEnable              = GEN_GEN <= 7;
1263       ggw.SIMDSize                     = prog_data->simd_size / 16;
1264       ggw.ThreadDepthCounterMaximum    = 0;
1265       ggw.ThreadHeightCounterMaximum   = 0;
1266       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1267       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1268       ggw.BottomExecutionMask          = 0xffffffff;
1269    }
1270
1271    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
1272 }
1273
1274 static void
1275 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
1276                                       uint32_t pipeline)
1277 {
1278 #if GEN_GEN >= 8 && GEN_GEN < 10
1279    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
1280     *
1281     *   Software must clear the COLOR_CALC_STATE Valid field in
1282     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
1283     *   with Pipeline Select set to GPGPU.
1284     *
1285     * The internal hardware docs recommend the same workaround for Gen9
1286     * hardware too.
1287     */
1288    if (pipeline == GPGPU)
1289       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
1290 #elif GEN_GEN <= 7
1291       /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
1292        * PIPELINE_SELECT [DevBWR+]":
1293        *
1294        *   Project: DEVSNB+
1295        *
1296        *   Software must ensure all the write caches are flushed through a
1297        *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
1298        *   command to invalidate read only caches prior to programming
1299        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
1300        */
1301       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1302          pc.RenderTargetCacheFlushEnable  = true;
1303          pc.DepthCacheFlushEnable         = true;
1304          pc.DCFlushEnable                 = true;
1305          pc.PostSyncOperation             = NoWrite;
1306          pc.CommandStreamerStallEnable    = true;
1307       }
1308
1309       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1310          pc.TextureCacheInvalidationEnable   = true;
1311          pc.ConstantCacheInvalidationEnable  = true;
1312          pc.StateCacheInvalidationEnable     = true;
1313          pc.InstructionCacheInvalidateEnable = true;
1314          pc.PostSyncOperation                = NoWrite;
1315       }
1316 #endif
1317 }
1318
1319 void
1320 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
1321 {
1322    if (cmd_buffer->state.current_pipeline != _3D) {
1323       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
1324
1325       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1326 #if GEN_GEN >= 9
1327          ps.MaskBits = 3;
1328 #endif
1329          ps.PipelineSelection = _3D;
1330       }
1331
1332       cmd_buffer->state.current_pipeline = _3D;
1333    }
1334 }
1335
1336 void
1337 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
1338 {
1339    if (cmd_buffer->state.current_pipeline != GPGPU) {
1340       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
1341
1342       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1343 #if GEN_GEN >= 9
1344          ps.MaskBits = 3;
1345 #endif
1346          ps.PipelineSelection = GPGPU;
1347       }
1348
1349       cmd_buffer->state.current_pipeline = GPGPU;
1350    }
1351 }
1352
1353 struct anv_state
1354 genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
1355                                           struct anv_framebuffer *fb)
1356 {
1357    struct anv_state state =
1358       anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
1359
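   /* RENDER_SURFACE_STATE extent fields are programmed as (value - 1), hence
    * the subtractions below.
    */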
1360    struct GENX(RENDER_SURFACE_STATE) null_ss = {
1361       .SurfaceType = SURFTYPE_NULL,
1362       .SurfaceArray = fb->layers > 0,
1363       .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
1364 #if GEN_GEN >= 8
1365       .TileMode = YMAJOR,
1366 #else
1367       .TiledSurface = true,
1368 #endif
1369       .Width = fb->width - 1,
1370       .Height = fb->height - 1,
1371       .Depth = fb->layers - 1,
1372       .RenderTargetViewExtent = fb->layers - 1,
1373    };
1374
1375    GENX(RENDER_SURFACE_STATE_pack)(NULL, state.map, &null_ss);
1376
1377    if (!cmd_buffer->device->info.has_llc)
1378       anv_state_clflush(state);
1379
1380    return state;
1381 }
1382
1383 static void
1384 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
1385 {
1386    struct anv_device *device = cmd_buffer->device;
1387    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
1388    const struct anv_image_view *iview =
1389       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
1390    const struct anv_image *image = iview ? iview->image : NULL;
1391    const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
1392    const bool has_hiz = image != NULL && anv_image_has_hiz(image);
1393    const bool has_stencil =
1394       image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
1395
1396    /* FIXME: Implement the PMA stall W/A */
1397    /* FIXME: Width and Height are wrong */
1398
1399    /* Emit 3DSTATE_DEPTH_BUFFER */
1400    if (has_depth) {
1401       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1402          db.SurfaceType                   = SURFTYPE_2D;
1403          db.DepthWriteEnable              = true;
1404          db.StencilWriteEnable            = has_stencil;
1405
1406          if (cmd_buffer->state.pass->subpass_count == 1) {
1407             db.HierarchicalDepthBufferEnable = has_hiz;
1408          } else {
1409             anv_finishme("Multiple-subpass HiZ not implemented");
1410          }
1411
1412          db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
1413                                                       &image->depth_surface.isl);
1414
1415          db.SurfaceBaseAddress = (struct anv_address) {
1416             .bo = image->bo,
1417             .offset = image->offset + image->depth_surface.offset,
1418          };
1419          db.DepthBufferObjectControlState = GENX(MOCS);
1420
1421          db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
1422          db.Height               = image->extent.height - 1;
1423          db.Width                = image->extent.width - 1;
1424          db.LOD                  = iview->isl.base_level;
1425          db.Depth                = image->array_size - 1; /* FIXME: 3-D */
1426          db.MinimumArrayElement  = iview->isl.base_array_layer;
1427
1428 #if GEN_GEN >= 8
1429          db.SurfaceQPitch =
1430             isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
1431 #endif
1432          db.RenderTargetViewExtent = 1 - 1;
1433       }
1434    } else {
1435       /* Even when no depth buffer is present, the hardware requires that
1436        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
1437        *
1438        *    If a null depth buffer is bound, the driver must instead bind depth as:
1439        *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
1440        *       3DSTATE_DEPTH.Width = 1
1441        *       3DSTATE_DEPTH.Height = 1
1442        *       3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
1443        *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
1444        *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
1445        *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
1446        *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
1447        *
1448        * The PRM is wrong, though. The width and height must be programmed to
1449        * the actual framebuffer's width and height, even when neither a depth
1450        * buffer nor a stencil buffer is present.  Also, since D16_UNORM cannot
1451        * be combined with a stencil buffer, we use D32_FLOAT instead.
1452        */
1453       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1454          db.SurfaceType          = SURFTYPE_2D;
1455          db.SurfaceFormat        = D32_FLOAT;
1456          db.Width                = fb->width - 1;
1457          db.Height               = fb->height - 1;
1458          db.StencilWriteEnable   = has_stencil;
1459       }
1460    }
1461
1462    if (has_hiz) {
1463       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
1464          hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
1465          hdb.SurfacePitch = image->hiz_surface.isl.row_pitch - 1;
1466          hdb.SurfaceBaseAddress = (struct anv_address) {
1467             .bo = image->bo,
1468             .offset = image->offset + image->hiz_surface.offset,
1469          };
1470 #if GEN_GEN >= 8
1471          /* From the SKL PRM Vol2a:
1472           *
1473           *    The interpretation of this field is dependent on Surface Type
1474           *    as follows:
1475           *    - SURFTYPE_1D: distance in pixels between array slices
1476           *    - SURFTYPE_2D/CUBE: distance in rows between array slices
1477           *    - SURFTYPE_3D: distance in rows between R-slices
1478           */
1479          hdb.SurfaceQPitch =
1480             image->hiz_surface.isl.dim == ISL_SURF_DIM_1D ?
1481                isl_surf_get_array_pitch_el(&image->hiz_surface.isl) >> 2 :
1482                isl_surf_get_array_pitch_el_rows(&image->hiz_surface.isl) >> 2;
1483 #endif
1484       }
1485    } else {
1486       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
1487    }
1488
1489    /* Emit 3DSTATE_STENCIL_BUFFER */
1490    if (has_stencil) {
1491       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
1492 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1493          sb.StencilBufferEnable = true;
1494 #endif
1495          sb.StencilBufferObjectControlState = GENX(MOCS);
1496
1497          sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;
1498
1499 #if GEN_GEN >= 8
1500          sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
1501 #endif
1502          sb.SurfaceBaseAddress = (struct anv_address) {
1503             .bo = image->bo,
1504             .offset = image->offset + image->stencil_surface.offset,
1505          };
1506       }
1507    } else {
1508       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
1509    }
1510
1511    /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
1512     *
1513     *    3DSTATE_CLEAR_PARAMS must always be programmed along with
1514     *    the other Depth/Stencil state commands (i.e. 3DSTATE_DEPTH_BUFFER,
1515     *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
1516     *
1517     * Testing also shows that some variant of this restriction may exist on HSW+.
1518     * On BDW+, it is not possible to emit 2 of these packets consecutively when
1519     * both have DepthClearValueValid set. An analysis of such state programming
1520     * on SKL showed that the GPU doesn't register the latter packet's clear
1521     * value.
1522     */
1523    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
1524       if (has_hiz) {
1525          cp.DepthClearValueValid = true;
1526          const uint32_t ds =
1527             cmd_buffer->state.subpass->depth_stencil_attachment;
1528          cp.DepthClearValue =
1529             cmd_buffer->state.attachments[ds].clear_value.depthStencil.depth;
1530       }
1531    }
1532 }
1533
1534 static void
1535 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
1536                              struct anv_subpass *subpass)
1537 {
1538    cmd_buffer->state.subpass = subpass;
1539
1540    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1541
1542    cmd_buffer_emit_depth_stencil(cmd_buffer);
1543    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_HIZ_RESOLVE);
1544    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_CLEAR);
1545
1546    anv_cmd_buffer_clear_subpass(cmd_buffer);
1547 }
1548
1549 void genX(CmdBeginRenderPass)(
1550     VkCommandBuffer                             commandBuffer,
1551     const VkRenderPassBeginInfo*                pRenderPassBegin,
1552     VkSubpassContents                           contents)
1553 {
1554    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1555    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
1556    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1557
1558    cmd_buffer->state.framebuffer = framebuffer;
1559    cmd_buffer->state.pass = pass;
1560    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
1561    anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
1562
1563    genX(flush_pipeline_select_3d)(cmd_buffer);
1564
1565    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
1566 }
1567
1568 void genX(CmdNextSubpass)(
1569     VkCommandBuffer                             commandBuffer,
1570     VkSubpassContents                           contents)
1571 {
1572    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1573
1574    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1575
1576    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1577    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
1578 }
1579
1580 void genX(CmdEndRenderPass)(
1581     VkCommandBuffer                             commandBuffer)
1582 {
1583    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1584
1585    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_RESOLVE);
1586    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1587
1588 #ifndef NDEBUG
1589    anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
1590 #endif
1591 }
1592
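/* Write the pipeline's PS_DEPTH_COUNT statistic to the given BO offset using
 * a depth-stalling PIPE_CONTROL with a WritePSDepthCount post-sync operation.
 */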
1593 static void
1594 emit_ps_depth_count(struct anv_batch *batch,
1595                     struct anv_bo *bo, uint32_t offset)
1596 {
1597    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1598       pc.DestinationAddressType  = DAT_PPGTT;
1599       pc.PostSyncOperation       = WritePSDepthCount;
1600       pc.DepthStallEnable        = true;
1601       pc.Address                 = (struct anv_address) { bo, offset };
1602    }
1603 }
1604
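/* Mark a query slot as available by writing an immediate 1 to it from a
 * PIPE_CONTROL post-sync operation.
 */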
1605 static void
1606 emit_query_availability(struct anv_batch *batch,
1607                         struct anv_bo *bo, uint32_t offset)
1608 {
1609    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1610       pc.DestinationAddressType  = DAT_PPGTT;
1611       pc.PostSyncOperation       = WriteImmediateData;
1612       pc.Address                 = (struct anv_address) { bo, offset };
1613       pc.ImmediateData           = 1;
1614    }
1615 }
1616
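/* The query code below assumes an anv_query_pool_slot layout of three 64-bit
 * words: the begin value at offset 0, the end value at offset 8, and the
 * availability flag at offset 16 (hence the + 8 and + 16 offsets used here).
 */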
1617 void genX(CmdBeginQuery)(
1618     VkCommandBuffer                             commandBuffer,
1619     VkQueryPool                                 queryPool,
1620     uint32_t                                    query,
1621     VkQueryControlFlags                         flags)
1622 {
1623    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1624    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1625
1626    /* Workaround: When meta uses the pipeline with the VS disabled, it seems
1627     * that the pipelining of the depth write breaks. What we see is that
1628     * samples from the render pass clear leak into the first query
1629     * immediately after the clear. Emitting a PIPE_CONTROL with a post-sync
1630     * operation and DepthStallEnable set seems to work around the issue.
1631     */
1632    if (cmd_buffer->state.need_query_wa) {
1633       cmd_buffer->state.need_query_wa = false;
1634       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1635          pc.DepthCacheFlushEnable   = true;
1636          pc.DepthStallEnable        = true;
1637       }
1638    }
1639
1640    switch (pool->type) {
1641    case VK_QUERY_TYPE_OCCLUSION:
1642       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1643                           query * sizeof(struct anv_query_pool_slot));
1644       break;
1645
1646    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1647    default:
1648       unreachable("unhandled query type");
1649    }
1650 }
1651
1652 void genX(CmdEndQuery)(
1653     VkCommandBuffer                             commandBuffer,
1654     VkQueryPool                                 queryPool,
1655     uint32_t                                    query)
1656 {
1657    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1658    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1659
1660    switch (pool->type) {
1661    case VK_QUERY_TYPE_OCCLUSION:
1662       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1663                           query * sizeof(struct anv_query_pool_slot) + 8);
1664
1665       emit_query_availability(&cmd_buffer->batch, &pool->bo,
1666                               query * sizeof(struct anv_query_pool_slot) + 16);
1667       break;
1668
1669    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1670    default:
1671       unreachable("unhandled query type");
1672    }
1673 }
1674
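/* MMIO offset of the command streamer's 64-bit TIMESTAMP register; the code
 * below reads the low DWord at TIMESTAMP and the high DWord at TIMESTAMP + 4.
 */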
1675 #define TIMESTAMP 0x2358
1676
1677 void genX(CmdWriteTimestamp)(
1678     VkCommandBuffer                             commandBuffer,
1679     VkPipelineStageFlagBits                     pipelineStage,
1680     VkQueryPool                                 queryPool,
1681     uint32_t                                    query)
1682 {
1683    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1684    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1685    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
1686
1687    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1688
1689    switch (pipelineStage) {
1690    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1691       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1692          srm.RegisterAddress  = TIMESTAMP;
1693          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
1694       }
1695       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1696          srm.RegisterAddress  = TIMESTAMP + 4;
1697          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
1698       }
1699       break;
1700
1701    default:
1702       /* Everything else is bottom-of-pipe */
1703       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1704          pc.DestinationAddressType  = DAT_PPGTT;
1705          pc.PostSyncOperation       = WriteTimestamp;
1706          pc.Address = (struct anv_address) { &pool->bo, offset };
1707       }
1708       break;
1709    }
1710
1711    emit_query_availability(&cmd_buffer->batch, &pool->bo, offset + 16);
1712 }
1713
1714 #if GEN_GEN > 7 || GEN_IS_HASWELL
1715
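/* Helpers for building MI_MATH ALU instruction DWords: the opcode occupies
 * bits 31:20 and the two operands bits 19:10 and 9:0, as packed by alu().
 */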
1716 #define alu_opcode(v)   __gen_uint((v),  20, 31)
1717 #define alu_operand1(v) __gen_uint((v),  10, 19)
1718 #define alu_operand2(v) __gen_uint((v),   0,  9)
1719 #define alu(opcode, operand1, operand2) \
1720    alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
1721
1722 #define OPCODE_NOOP      0x000
1723 #define OPCODE_LOAD      0x080
1724 #define OPCODE_LOADINV   0x480
1725 #define OPCODE_LOAD0     0x081
1726 #define OPCODE_LOAD1     0x481
1727 #define OPCODE_ADD       0x100
1728 #define OPCODE_SUB       0x101
1729 #define OPCODE_AND       0x102
1730 #define OPCODE_OR        0x103
1731 #define OPCODE_XOR       0x104
1732 #define OPCODE_STORE     0x180
1733 #define OPCODE_STOREINV  0x580
1734
1735 #define OPERAND_R0   0x00
1736 #define OPERAND_R1   0x01
1737 #define OPERAND_R2   0x02
1738 #define OPERAND_R3   0x03
1739 #define OPERAND_R4   0x04
1740 #define OPERAND_SRCA 0x20
1741 #define OPERAND_SRCB 0x21
1742 #define OPERAND_ACCU 0x31
1743 #define OPERAND_ZF   0x32
1744 #define OPERAND_CF   0x33
1745
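/* Command streamer general-purpose registers; each GPR is 64 bits wide, so
 * consecutive GPRs are 8 bytes apart.
 */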
1746 #define CS_GPR(n) (0x2600 + (n) * 8)
1747
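/* Load a 64-bit value from memory into a command streamer GPR using two
 * 32-bit MI_LOAD_REGISTER_MEM commands.
 */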
1748 static void
1749 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
1750                       struct anv_bo *bo, uint32_t offset)
1751 {
1752    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1753       lrm.RegisterAddress  = reg;
1754       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
1755    }
1756    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1757       lrm.RegisterAddress  = reg + 4;
1758       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1759    }
1760 }
1761
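/* Write the low 32 bits of a register to the destination buffer, and the high
 * 32 bits as well when a 64-bit result is requested.
 */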
1762 static void
1763 store_query_result(struct anv_batch *batch, uint32_t reg,
1764                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
1765 {
1766    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1767       srm.RegisterAddress  = reg;
1768       srm.MemoryAddress    = (struct anv_address) { bo, offset };
1769    }
1770
1771    if (flags & VK_QUERY_RESULT_64_BIT) {
1772       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1773          srm.RegisterAddress  = reg + 4;
1774          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1775       }
1776    }
1777 }
1778
1779 void genX(CmdCopyQueryPoolResults)(
1780     VkCommandBuffer                             commandBuffer,
1781     VkQueryPool                                 queryPool,
1782     uint32_t                                    firstQuery,
1783     uint32_t                                    queryCount,
1784     VkBuffer                                    destBuffer,
1785     VkDeviceSize                                destOffset,
1786     VkDeviceSize                                destStride,
1787     VkQueryResultFlags                          flags)
1788 {
1789    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1790    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1791    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1792    uint32_t slot_offset, dst_offset;
1793
1794    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1795       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1796          pc.CommandStreamerStallEnable = true;
1797          pc.StallAtPixelScoreboard     = true;
1798       }
1799    }
1800
1801    dst_offset = buffer->offset + destOffset;
1802    for (uint32_t i = 0; i < queryCount; i++) {
1803
1804       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
1805       switch (pool->type) {
1806       case VK_QUERY_TYPE_OCCLUSION:
1807          emit_load_alu_reg_u64(&cmd_buffer->batch,
1808                                CS_GPR(0), &pool->bo, slot_offset);
1809          emit_load_alu_reg_u64(&cmd_buffer->batch,
1810                                CS_GPR(1), &pool->bo, slot_offset + 8);
1811
1812          /* FIXME: We need to clamp the result for 32 bit. */
1813
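         /* MI_MATH: compute end - begin (R1 - R0) and leave the result in R2,
          * which store_query_result() writes out below.
          */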
1814          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
1815          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
1816          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
1817          dw[3] = alu(OPCODE_SUB, 0, 0);
1818          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
1819          break;
1820
1821       case VK_QUERY_TYPE_TIMESTAMP:
1822          emit_load_alu_reg_u64(&cmd_buffer->batch,
1823                                CS_GPR(2), &pool->bo, slot_offset);
1824          break;
1825
1826       default:
1827          unreachable("unhandled query type");
1828       }
1829
1830       store_query_result(&cmd_buffer->batch,
1831                          CS_GPR(2), buffer->bo, dst_offset, flags);
1832
1833       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1834          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
1835                                &pool->bo, slot_offset + 16);
1836          if (flags & VK_QUERY_RESULT_64_BIT)
1837             store_query_result(&cmd_buffer->batch,
1838                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
1839          else
1840             store_query_result(&cmd_buffer->batch,
1841                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
1842       }
1843
1844       dst_offset += destStride;
1845    }
1846 }
1847
1848 #else
1849 void genX(CmdCopyQueryPoolResults)(
1850     VkCommandBuffer                             commandBuffer,
1851     VkQueryPool                                 queryPool,
1852     uint32_t                                    firstQuery,
1853     uint32_t                                    queryCount,
1854     VkBuffer                                    destBuffer,
1855     VkDeviceSize                                destOffset,
1856     VkDeviceSize                                destStride,
1857     VkQueryResultFlags                          flags)
1858 {
1859    anv_finishme("Queries not yet supported on Ivy Bridge");
1860 }
1861 #endif