/* gallium/radeon: move pipeline stat context flags to common code
 *
 * src/gallium/drivers/radeonsi/si_state_draw.c
 * (android-x86/external-mesa.git)
 */

/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Christian König <christian.koenig@amd.com>
 */

#include "si_pipe.h"
#include "si_shader.h"
#include "radeon/r600_cs.h"
#include "sid.h"

#include "util/u_index_modify.h"
#include "util/u_upload_mgr.h"
#include "util/u_prim.h"

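/* Translate a gallium PIPE_PRIM_* primitive type to the VGT_PRIMITIVE_TYPE
 * hardware enum (V_008958_DI_PT_*). */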
static unsigned si_conv_pipe_prim(unsigned mode)
{
        static const unsigned prim_conv[] = {
                [PIPE_PRIM_POINTS]                      = V_008958_DI_PT_POINTLIST,
                [PIPE_PRIM_LINES]                       = V_008958_DI_PT_LINELIST,
                [PIPE_PRIM_LINE_LOOP]                   = V_008958_DI_PT_LINELOOP,
                [PIPE_PRIM_LINE_STRIP]                  = V_008958_DI_PT_LINESTRIP,
                [PIPE_PRIM_TRIANGLES]                   = V_008958_DI_PT_TRILIST,
                [PIPE_PRIM_TRIANGLE_STRIP]              = V_008958_DI_PT_TRISTRIP,
                [PIPE_PRIM_TRIANGLE_FAN]                = V_008958_DI_PT_TRIFAN,
                [PIPE_PRIM_QUADS]                       = V_008958_DI_PT_QUADLIST,
                [PIPE_PRIM_QUAD_STRIP]                  = V_008958_DI_PT_QUADSTRIP,
                [PIPE_PRIM_POLYGON]                     = V_008958_DI_PT_POLYGON,
                [PIPE_PRIM_LINES_ADJACENCY]             = V_008958_DI_PT_LINELIST_ADJ,
                [PIPE_PRIM_LINE_STRIP_ADJACENCY]        = V_008958_DI_PT_LINESTRIP_ADJ,
                [PIPE_PRIM_TRIANGLES_ADJACENCY]         = V_008958_DI_PT_TRILIST_ADJ,
                [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]    = V_008958_DI_PT_TRISTRIP_ADJ,
                [PIPE_PRIM_PATCHES]                     = V_008958_DI_PT_PATCH,
                [R600_PRIM_RECTANGLE_LIST]              = V_008958_DI_PT_RECTLIST
        };
        assert(mode < Elements(prim_conv));
        return prim_conv[mode];
}

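/* Translate a gallium primitive type to the VGT_GS_OUT_PRIM_TYPE value,
 * i.e. the primitive type that comes out of the geometry pipeline. */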
static unsigned si_conv_prim_to_gs_out(unsigned mode)
{
        static const int prim_conv[] = {
                [PIPE_PRIM_POINTS]                      = V_028A6C_OUTPRIM_TYPE_POINTLIST,
                [PIPE_PRIM_LINES]                       = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
                [PIPE_PRIM_LINE_LOOP]                   = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
                [PIPE_PRIM_LINE_STRIP]                  = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
                [PIPE_PRIM_TRIANGLES]                   = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_TRIANGLE_STRIP]              = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_TRIANGLE_FAN]                = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_QUADS]                       = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_QUAD_STRIP]                  = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_POLYGON]                     = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_LINES_ADJACENCY]             = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
                [PIPE_PRIM_LINE_STRIP_ADJACENCY]        = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
                [PIPE_PRIM_TRIANGLES_ADJACENCY]         = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]    = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
                [PIPE_PRIM_PATCHES]                     = V_028A6C_OUTPRIM_TYPE_POINTLIST,
                [R600_PRIM_RECTANGLE_LIST]              = V_028A6C_OUTPRIM_TYPE_TRISTRIP
        };
        assert(mode < Elements(prim_conv));

        return prim_conv[mode];
}

/**
 * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
 * LS.LDS_SIZE is shared by all 3 shader stages.
 *
 * The information about LDS and other non-compile-time parameters is then
 * written to userdata SGPRs.
 */
static void si_emit_derived_tess_state(struct si_context *sctx,
                                       const struct pipe_draw_info *info,
                                       unsigned *num_patches)
{
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_shader_ctx_state *ls = &sctx->vs_shader;
        /* The TES pointer will only be used for sctx->last_tcs.
         * It would be wrong to think that TCS = TES. */
        struct si_shader_selector *tcs =
                sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
        unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
        unsigned num_tcs_input_cp = info->vertices_per_patch;
        unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
        unsigned num_tcs_patch_outputs;
        unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
        unsigned input_patch_size, output_patch_size, output_patch0_offset;
        unsigned perpatch_output_offset, lds_size, ls_rsrc2;
        unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;

        *num_patches = 1; /* TODO: calculate this */

        if (sctx->last_ls == ls->current &&
            sctx->last_tcs == tcs &&
            sctx->last_tes_sh_base == tes_sh_base &&
            sctx->last_num_tcs_input_cp == num_tcs_input_cp)
                return;

        sctx->last_ls = ls->current;
        sctx->last_tcs = tcs;
        sctx->last_tes_sh_base = tes_sh_base;
        sctx->last_num_tcs_input_cp = num_tcs_input_cp;

        /* This calculates how shader inputs and outputs among VS, TCS, and TES
         * are laid out in LDS. */
        num_tcs_inputs = util_last_bit64(ls->cso->outputs_written);

        if (sctx->tcs_shader.cso) {
                num_tcs_outputs = util_last_bit64(tcs->outputs_written);
                num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
                num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
        } else {
                /* No TCS. Route varyings from LS to TES. */
                num_tcs_outputs = num_tcs_inputs;
                num_tcs_output_cp = num_tcs_input_cp;
                num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
        }

        input_vertex_size = num_tcs_inputs * 16;
        output_vertex_size = num_tcs_outputs * 16;

        input_patch_size = num_tcs_input_cp * input_vertex_size;

        pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
        output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;

        output_patch0_offset = sctx->tcs_shader.cso ? input_patch_size * *num_patches : 0;
        perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;

        lds_size = output_patch0_offset + output_patch_size * *num_patches;
        ls_rsrc2 = ls->current->config.rsrc2;

        if (sctx->b.chip_class >= CIK) {
                assert(lds_size <= 65536);
                ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 512) / 512);
        } else {
                assert(lds_size <= 32768);
                ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256);
        }

        /* Due to a hw bug, RSRC2_LS must be written twice with another
         * LS register written in between. */
        if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
                radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
        radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
        radeon_emit(cs, ls->current->config.rsrc1);
        radeon_emit(cs, ls_rsrc2);

        /* Compute userdata SGPRs. */
        assert(((input_vertex_size / 4) & ~0xff) == 0);
        assert(((output_vertex_size / 4) & ~0xff) == 0);
        assert(((input_patch_size / 4) & ~0x1fff) == 0);
        assert(((output_patch_size / 4) & ~0x1fff) == 0);
        assert(((output_patch0_offset / 16) & ~0xffff) == 0);
        assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
        assert(num_tcs_input_cp <= 32);
        assert(num_tcs_output_cp <= 32);

        tcs_in_layout = (input_patch_size / 4) |
                        ((input_vertex_size / 4) << 13);
        tcs_out_layout = (output_patch_size / 4) |
                         ((output_vertex_size / 4) << 13);
        tcs_out_offsets = (output_patch0_offset / 16) |
                          ((perpatch_output_offset / 16) << 16);

        /* Set them for LS. */
        radeon_set_sh_reg(cs,
                R_00B530_SPI_SHADER_USER_DATA_LS_0 + SI_SGPR_LS_OUT_LAYOUT * 4,
                tcs_in_layout);

        /* Set them for TCS. */
        radeon_set_sh_reg_seq(cs,
                R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OUT_OFFSETS * 4, 3);
        radeon_emit(cs, tcs_out_offsets);
        radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
        radeon_emit(cs, tcs_in_layout);

        /* Set them for TES. */
        radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OUT_OFFSETS * 4, 2);
        radeon_emit(cs, tcs_out_offsets);
        radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
}

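/* Return the number of primitives generated by info->count vertices for
 * the given primitive type. */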
static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
{
        switch (info->mode) {
        case PIPE_PRIM_PATCHES:
                return info->count / info->vertices_per_patch;
        case R600_PRIM_RECTANGLE_LIST:
                return info->count / 3;
        default:
                return u_prims_for_vertices(info->mode, info->count);
        }
}

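/* Compute the value of IA_MULTI_VGT_PARAM, applying the hardware
 * requirements and workarounds for the bound shader stages, primitive
 * type, and chip. */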
static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                                          const struct pipe_draw_info *info,
                                          unsigned num_patches)
{
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
        unsigned prim = info->mode;
        unsigned primgroup_size = 128; /* recommended without a GS */
        unsigned max_primgroup_in_wave = 2;

        /* SWITCH_ON_EOP(0) is always preferable. */
        bool wd_switch_on_eop = false;
        bool ia_switch_on_eop = false;
        bool ia_switch_on_eoi = false;
        bool partial_vs_wave = false;
        bool partial_es_wave = false;

        if (sctx->gs_shader.cso)
                primgroup_size = 64; /* recommended with a GS */

        if (sctx->tes_shader.cso) {
                unsigned num_cp_out =
                        sctx->tcs_shader.cso ?
                        sctx->tcs_shader.cso->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
                        info->vertices_per_patch;
                unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out);

                primgroup_size = MIN2(primgroup_size, max_size);

                /* primgroup_size must be set to a multiple of NUM_PATCHES */
                primgroup_size = (primgroup_size / num_patches) * num_patches;

                /* SWITCH_ON_EOI must be set if PrimID is used. */
                if ((sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
                    sctx->tes_shader.cso->info.uses_primid)
                        ia_switch_on_eoi = true;

                /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
                if ((sctx->b.family == CHIP_TAHITI ||
                     sctx->b.family == CHIP_PITCAIRN ||
                     sctx->b.family == CHIP_BONAIRE) &&
                    sctx->gs_shader.cso)
                        partial_vs_wave = true;
        }

        /* This is a hardware requirement. */
        if ((rs && rs->line_stipple_enable) ||
            (sctx->b.screen->debug_flags & DBG_SWITCH_ON_EOP)) {
                ia_switch_on_eop = true;
                wd_switch_on_eop = true;
        }

        if (sctx->b.chip_class >= CIK) {
                /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
                 * 4 shader engines. Set 1 to pass the assertion below.
                 * The other cases are hardware requirements. */
                if (sctx->b.screen->info.max_se < 4 ||
                    prim == PIPE_PRIM_POLYGON ||
                    prim == PIPE_PRIM_LINE_LOOP ||
                    prim == PIPE_PRIM_TRIANGLE_FAN ||
                    prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
                    info->primitive_restart ||
                    info->count_from_stream_output)
                        wd_switch_on_eop = true;

                /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
                 * We don't know that for indirect drawing, so treat it as
                 * always problematic. */
                if (sctx->b.family == CHIP_HAWAII &&
                    (info->indirect || info->instance_count > 1))
                        wd_switch_on_eop = true;

                /* Required on CIK and later. */
                if (sctx->b.screen->info.max_se > 2 && !wd_switch_on_eop)
                        ia_switch_on_eoi = true;

                /* Required by Hawaii and, for some special cases, by VI. */
                if (ia_switch_on_eoi &&
                    (sctx->b.family == CHIP_HAWAII ||
                     (sctx->b.chip_class == VI &&
                      (sctx->gs_shader.cso || max_primgroup_in_wave != 2))))
                        partial_vs_wave = true;

                /* Instancing bug on Bonaire. */
                if (sctx->b.family == CHIP_BONAIRE && ia_switch_on_eoi &&
                    (info->indirect || info->instance_count > 1))
                        partial_vs_wave = true;

                /* If the WD switch is false, the IA switch must be false too. */
                assert(wd_switch_on_eop || !ia_switch_on_eop);
        }

        /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
        if (ia_switch_on_eoi)
                partial_es_wave = true;

        /* GS requirement. */
        if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
                partial_es_wave = true;

        /* Hw bug with single-primitive instances and SWITCH_ON_EOI
         * on multi-SE chips. */
        if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
            (info->indirect ||
             (info->instance_count > 1 &&
              si_num_prims_for_vertices(info) <= 1)))
                sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;

        return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
                S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
                S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
                S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
                S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
                S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0) |
                S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ?
                                             max_primgroup_in_wave : 0);
}

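/* Compute VGT_LS_HS_CONFIG: the number of patches and the numbers of input
 * and output control points per patch. Returns 0 when tessellation is off. */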
static unsigned si_get_ls_hs_config(struct si_context *sctx,
                                    const struct pipe_draw_info *info,
                                    unsigned num_patches)
{
        unsigned num_output_cp;

        if (!sctx->tes_shader.cso)
                return 0;

        num_output_cp = sctx->tcs_shader.cso ?
                sctx->tcs_shader.cso->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
                info->vertices_per_patch;

        return S_028B58_NUM_PATCHES(num_patches) |
                S_028B58_HS_NUM_INPUT_CP(info->vertices_per_patch) |
                S_028B58_HS_NUM_OUTPUT_CP(num_output_cp);
}

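/* Re-emit SPI_TMPRING_SIZE and the scratch buffer relocation when the
 * scratch (spill) state has changed. */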
static void si_emit_scratch_reloc(struct si_context *sctx)
{
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;

        if (!sctx->emit_scratch_reloc)
                return;

        radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
                               sctx->spi_tmpring_size);

        if (sctx->scratch_buffer) {
                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
                                      RADEON_PRIO_SCRATCH_BUFFER);
        }
        sctx->emit_scratch_reloc = false;
}

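/* Emit PA_SC_LINE_STIPPLE for line primitives; the AUTO_RESET_CNTL field
 * restarts the stipple pattern per line or per strip. */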
/* rast_prim is the primitive type after GS. */
static void si_emit_rasterizer_prim_state(struct si_context *sctx)
{
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned rast_prim = sctx->current_rast_prim;
        struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;

        /* Skip this if not rendering lines. */
        if (rast_prim != PIPE_PRIM_LINES &&
            rast_prim != PIPE_PRIM_LINE_LOOP &&
            rast_prim != PIPE_PRIM_LINE_STRIP &&
            rast_prim != PIPE_PRIM_LINES_ADJACENCY &&
            rast_prim != PIPE_PRIM_LINE_STRIP_ADJACENCY)
                return;

        if (rast_prim == sctx->last_rast_prim &&
            rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)
                return;

        radeon_set_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE,
                rs->pa_sc_line_stipple |
                S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 :
                                         rast_prim == PIPE_PRIM_LINE_STRIP ? 2 : 0));

        sctx->last_rast_prim = rast_prim;
        sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
}

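/* Emit the VGT draw state registers: primitive type, IA_MULTI_VGT_PARAM,
 * VGT_LS_HS_CONFIG, the GS output primitive type, and primitive restart.
 * Redundant writes are skipped by comparing against sctx->last_*. */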
static void si_emit_draw_registers(struct si_context *sctx,
                                   const struct pipe_draw_info *info)
{
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned prim = si_conv_pipe_prim(info->mode);
        unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
        unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;

        if (sctx->tes_shader.cso)
                si_emit_derived_tess_state(sctx, info, &num_patches);

        ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
        ls_hs_config = si_get_ls_hs_config(sctx, info, num_patches);

        /* Draw state. */
        if (prim != sctx->last_prim ||
            ia_multi_vgt_param != sctx->last_multi_vgt_param ||
            ls_hs_config != sctx->last_ls_hs_config) {
                if (sctx->b.chip_class >= CIK) {
                        radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0));
                        radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */
                        radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */
                        radeon_emit(cs, ls_hs_config); /* VGT_LS_HS_CONFIG */
                } else {
                        radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
                        radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
                        radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
                }
                sctx->last_prim = prim;
                sctx->last_multi_vgt_param = ia_multi_vgt_param;
                sctx->last_ls_hs_config = ls_hs_config;
        }

        if (gs_out_prim != sctx->last_gs_out_prim) {
                radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
                sctx->last_gs_out_prim = gs_out_prim;
        }

        /* Primitive restart. */
        if (info->primitive_restart != sctx->last_primitive_restart_en) {
                radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info->primitive_restart);
                sctx->last_primitive_restart_en = info->primitive_restart;

                if (info->primitive_restart &&
                    (info->restart_index != sctx->last_restart_index ||
                     sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) {
                        radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
                                               info->restart_index);
                        sctx->last_restart_index = info->restart_index;
                }
        }
}

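/* Emit the packets for one draw call: the stream-output vertex count copy,
 * the index type, the instance count, the base vertex and start instance
 * user SGPRs, and finally the indexed, indirect, or auto draw packet. */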
static void si_emit_draw_packets(struct si_context *sctx,
                                 const struct pipe_draw_info *info,
                                 const struct pipe_index_buffer *ib)
{
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
        bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;

        if (info->count_from_stream_output) {
                struct r600_so_target *t =
                        (struct r600_so_target*)info->count_from_stream_output;
                uint64_t va = t->buf_filled_size->gpu_address +
                              t->buf_filled_size_offset;

                radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
                                       t->stride_in_dw);

                radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
                radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
                            COPY_DATA_DST_SEL(COPY_DATA_REG) |
                            COPY_DATA_WR_CONFIRM);
                radeon_emit(cs, va);       /* src address lo */
                radeon_emit(cs, va >> 32); /* src address hi */
                radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
                radeon_emit(cs, 0); /* unused */

                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      t->buf_filled_size, RADEON_USAGE_READ,
                                      RADEON_PRIO_SO_FILLED_SIZE);
        }

        /* draw packet */
        if (info->indexed) {
                radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));

                /* index type */
                switch (ib->index_size) {
                case 1:
                        radeon_emit(cs, V_028A7C_VGT_INDEX_8);
                        break;
                case 2:
                        radeon_emit(cs, V_028A7C_VGT_INDEX_16 |
                                    (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
                                             V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
                        break;
                case 4:
                        radeon_emit(cs, V_028A7C_VGT_INDEX_32 |
                                    (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
                                             V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
                        break;
                default:
                        assert(!"unreachable");
                        return;
                }
        }

        if (!info->indirect) {
                int base_vertex;

                radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
                radeon_emit(cs, info->instance_count);

                /* Base vertex and start instance. */
                base_vertex = info->indexed ? info->index_bias : info->start;

                if (base_vertex != sctx->last_base_vertex ||
                    sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
                    info->start_instance != sctx->last_start_instance ||
                    sh_base_reg != sctx->last_sh_base_reg) {
                        radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
                        radeon_emit(cs, base_vertex);
                        radeon_emit(cs, info->start_instance);

                        sctx->last_base_vertex = base_vertex;
                        sctx->last_start_instance = info->start_instance;
                        sctx->last_sh_base_reg = sh_base_reg;
                }
        } else {
                si_invalidate_draw_sh_constants(sctx);

                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      (struct r600_resource *)info->indirect,
                                      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
        }

        if (info->indexed) {
                uint32_t index_max_size = (ib->buffer->width0 - ib->offset) /
                                          ib->index_size;
                uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;

                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      (struct r600_resource *)ib->buffer,
                                      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);

                if (info->indirect) {
                        uint64_t indirect_va = r600_resource(info->indirect)->gpu_address;

                        assert(indirect_va % 8 == 0);
                        assert(index_va % 2 == 0);
                        assert(info->indirect_offset % 4 == 0);

                        radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
                        radeon_emit(cs, 1);
                        radeon_emit(cs, indirect_va);
                        radeon_emit(cs, indirect_va >> 32);

                        radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
                        radeon_emit(cs, index_va);
                        radeon_emit(cs, index_va >> 32);

                        radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
                        radeon_emit(cs, index_max_size);

                        radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
                        radeon_emit(cs, info->indirect_offset);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
                } else {
                        index_va += info->start * ib->index_size;

                        radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
                        radeon_emit(cs, index_max_size);
                        radeon_emit(cs, index_va);
                        radeon_emit(cs, (index_va >> 32UL) & 0xFF);
                        radeon_emit(cs, info->count);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
                }
        } else {
                if (info->indirect) {
                        uint64_t indirect_va = r600_resource(info->indirect)->gpu_address;

                        assert(indirect_va % 8 == 0);
                        assert(info->indirect_offset % 4 == 0);

                        radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
                        radeon_emit(cs, 1);
                        radeon_emit(cs, indirect_va);
                        radeon_emit(cs, indirect_va >> 32);

                        radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
                        radeon_emit(cs, info->indirect_offset);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
                } else {
                        radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
                        radeon_emit(cs, info->count);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
                                    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
                }
        }
}

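/* Emit all cache flushes and synchronization events requested in
 * sctx->flags, then clear the flags. Note the ordering constraints
 * described in the comments below. */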
void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
{
        struct r600_common_context *sctx = &si_ctx->b;
        struct radeon_winsys_cs *cs = sctx->gfx.cs;
        uint32_t cp_coher_cntl = 0;
        uint32_t compute =
                PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));

        /* SI has a bug that it always flushes ICACHE and KCACHE if either
         * bit is set. An alternative way is to write SQC_CACHES, but that
         * doesn't seem to work reliably. Since the bug doesn't affect
         * correctness (it only does more work than necessary) and
         * the performance impact is likely negligible, there is no plan
         * to fix it.
         */

        if (sctx->flags & SI_CONTEXT_INV_ICACHE)
                cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
        if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
                cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

        if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
                cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
        if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
                cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);

                /* TODO: this might not be needed. */
                if (sctx->chip_class >= VI)
                        cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
        }

        if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
                cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
                                 S_0085F0_CB0_DEST_BASE_ENA(1) |
                                 S_0085F0_CB1_DEST_BASE_ENA(1) |
                                 S_0085F0_CB2_DEST_BASE_ENA(1) |
                                 S_0085F0_CB3_DEST_BASE_ENA(1) |
                                 S_0085F0_CB4_DEST_BASE_ENA(1) |
                                 S_0085F0_CB5_DEST_BASE_ENA(1) |
                                 S_0085F0_CB6_DEST_BASE_ENA(1) |
                                 S_0085F0_CB7_DEST_BASE_ENA(1);

                /* Necessary for DCC */
                if (sctx->chip_class >= VI) {
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0) | compute);
                        radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
                                        EVENT_INDEX(5));
                        radeon_emit(cs, 0);
                        radeon_emit(cs, 0);
                        radeon_emit(cs, 0);
                        radeon_emit(cs, 0);
                }
        }
        if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
                cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                                 S_0085F0_DB_DEST_BASE_ENA(1);
        }

        if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB_META) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
        }
        if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB_META) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
        }
        if (sctx->flags & SI_CONTEXT_FLUSH_WITH_INV_L2) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH) | EVENT_INDEX(7) |
                                EVENT_WRITE_INV_L2);
        }

        /* FLUSH_AND_INV events must be emitted before PS_PARTIAL_FLUSH.
         * Otherwise, clearing CMASK (CB meta) with CP DMA isn't reliable.
         *
         * I think the reason is that FLUSH_AND_INV is only added to a queue
         * and it is PS_PARTIAL_FLUSH that waits for it to complete.
         */
        if (sctx->flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        } else if (sctx->flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        }
        if (sctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        }
        if (sctx->flags & SI_CONTEXT_VGT_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
        }
        if (sctx->flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
                radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
        }

        /* SURFACE_SYNC must be emitted after partial flushes.
         * It looks like SURFACE_SYNC flushes caches immediately and doesn't
         * wait for any engines. This should be last.
         */
        if (cp_coher_cntl) {
                if (sctx->chip_class >= CIK) {
                        radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) | compute);
                        radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
                        radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
                        radeon_emit(cs, 0xff);            /* CP_COHER_SIZE_HI */
                        radeon_emit(cs, 0);               /* CP_COHER_BASE */
                        radeon_emit(cs, 0);               /* CP_COHER_BASE_HI */
                        radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
                } else {
                        radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0) | compute);
                        radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
                        radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
                        radeon_emit(cs, 0);               /* CP_COHER_BASE */
                        radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
                }
        }

        if (sctx->flags & R600_CONTEXT_START_PIPELINE_STATS) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
                                EVENT_INDEX(0));
        } else if (sctx->flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
                                EVENT_INDEX(0));
        }

        sctx->flags = 0;
}

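/* Return the start and count of a draw, reading them back from the
 * indirect buffer if the draw is indirect (which may stall on the GPU). */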
static void si_get_draw_start_count(struct si_context *sctx,
                                    const struct pipe_draw_info *info,
                                    unsigned *start, unsigned *count)
{
        if (info->indirect) {
                struct r600_resource *indirect =
                        (struct r600_resource*)info->indirect;
                int *data = r600_buffer_map_sync_with_rings(&sctx->b,
                                        indirect, PIPE_TRANSFER_READ);
                data += info->indirect_offset/sizeof(int);
                *start = data[2];
                *count = data[0];
        } else {
                *start = info->start;
                *count = info->count;
        }
}

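/* The main draw entry point: validate state, upload or translate the index
 * buffer if needed, emit all dirty state atoms and the draw packets, and
 * apply post-draw workarounds and resource dirtying. */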
void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
        struct pipe_index_buffer ib = {};
        unsigned mask, dirty_fb_counter;

        if (!info->count && !info->indirect &&
            (info->indexed || !info->count_from_stream_output))
                return;

        if (!sctx->vs_shader.cso) {
                assert(0);
                return;
        }
        if (!sctx->ps_shader.cso && (!rs || !rs->rasterizer_discard)) {
                assert(0);
                return;
        }
        if (!!sctx->tes_shader.cso != (info->mode == PIPE_PRIM_PATCHES)) {
                assert(0);
                return;
        }

        /* Re-emit the framebuffer state if needed. */
        dirty_fb_counter = p_atomic_read(&sctx->b.screen->dirty_fb_counter);
        if (dirty_fb_counter != sctx->b.last_dirty_fb_counter) {
                sctx->b.last_dirty_fb_counter = dirty_fb_counter;
                sctx->framebuffer.dirty_cbufs |=
                        ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
                sctx->framebuffer.dirty_zsbuf = true;
                si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
        }

        si_decompress_textures(sctx);

        /* Set the rasterization primitive type.
         *
         * This must be done after si_decompress_textures, which can call
         * draw_vbo recursively, and before si_update_shaders, which uses
         * current_rast_prim for this draw_vbo call. */
        if (sctx->gs_shader.cso)
                sctx->current_rast_prim = sctx->gs_shader.cso->gs_output_prim;
        else if (sctx->tes_shader.cso)
                sctx->current_rast_prim =
                        sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
        else
                sctx->current_rast_prim = info->mode;

        if (!si_update_shaders(sctx) ||
            !si_upload_shader_descriptors(sctx))
                return;

        if (info->indexed) {
                /* Initialize the index buffer struct. */
                pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer);
                ib.user_buffer = sctx->index_buffer.user_buffer;
                ib.index_size = sctx->index_buffer.index_size;
                ib.offset = sctx->index_buffer.offset;

                /* Translate or upload, if needed. */
                /* 8-bit indices are supported on VI. */
                if (sctx->b.chip_class <= CIK && ib.index_size == 1) {
                        struct pipe_resource *out_buffer = NULL;
                        unsigned out_offset, start, count, start_offset;
                        void *ptr;

                        si_get_draw_start_count(sctx, info, &start, &count);
                        start_offset = start * ib.index_size;

                        u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
                                       &out_offset, &out_buffer, &ptr);
                        if (!out_buffer) {
                                pipe_resource_reference(&ib.buffer, NULL);
                                return;
                        }

                        util_shorten_ubyte_elts_to_userptr(&sctx->b.b, &ib, 0,
                                                           ib.offset + start_offset,
                                                           count, ptr);

                        pipe_resource_reference(&ib.buffer, NULL);
                        ib.user_buffer = NULL;
                        ib.buffer = out_buffer;
                        /* info->start will be added by the drawing code */
                        ib.offset = out_offset - start_offset;
                        ib.index_size = 2;
                } else if (ib.user_buffer && !ib.buffer) {
                        unsigned start, count, start_offset;

                        si_get_draw_start_count(sctx, info, &start, &count);
                        start_offset = start * ib.index_size;

                        u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
                                      256, (char*)ib.user_buffer + start_offset,
                                      &ib.offset, &ib.buffer);
                        if (!ib.buffer)
                                return;
                        /* info->start will be added by the drawing code */
                        ib.offset -= start_offset;
                }
        }

        /* VI reads index buffers through TC L2. */
        if (info->indexed && sctx->b.chip_class <= CIK &&
            r600_resource(ib.buffer)->TC_L2_dirty) {
                sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
                r600_resource(ib.buffer)->TC_L2_dirty = false;
        }

        /* Check flush flags. */
        if (sctx->b.flags)
                si_mark_atom_dirty(sctx, sctx->atoms.s.cache_flush);

        si_need_cs_space(sctx);

        /* Emit states. */
        mask = sctx->dirty_atoms;
        while (mask) {
                struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];

                atom->emit(&sctx->b, atom);
        }
        sctx->dirty_atoms = 0;

        si_pm4_emit_dirty(sctx);
        si_emit_scratch_reloc(sctx);
        si_emit_rasterizer_prim_state(sctx);
        si_emit_draw_registers(sctx, info);
        si_emit_draw_packets(sctx, info, &ib);

        if (sctx->trace_buf)
                si_trace_emit(sctx);

        /* Workaround for a VGT hang when streamout is enabled.
         * It must be done after drawing. */
        if ((sctx->b.family == CHIP_HAWAII ||
             sctx->b.family == CHIP_TONGA ||
             sctx->b.family == CHIP_FIJI) &&
            r600_get_strmout_en(&sctx->b)) {
                sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
        }

        /* Set the depth buffer as dirty. */
        if (sctx->framebuffer.state.zsbuf) {
                struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
                struct r600_texture *rtex = (struct r600_texture *)surf->texture;

                rtex->dirty_level_mask |= 1 << surf->u.tex.level;

                if (rtex->surface.flags & RADEON_SURF_SBUFFER)
                        rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
        }
        if (sctx->framebuffer.compressed_cb_mask) {
                struct pipe_surface *surf;
                struct r600_texture *rtex;
                unsigned mask = sctx->framebuffer.compressed_cb_mask;

                do {
                        unsigned i = u_bit_scan(&mask);
                        surf = sctx->framebuffer.state.cbufs[i];
                        rtex = (struct r600_texture*)surf->texture;

                        rtex->dirty_level_mask |= 1 << surf->u.tex.level;
                } while (mask);
        }

        pipe_resource_reference(&ib.buffer, NULL);
        sctx->b.num_draw_calls++;
}

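/* Write an incrementing trace ID both to the trace buffer and into the CS
 * as a NOP, so a hang can be narrowed down to a specific draw call. */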
void si_trace_emit(struct si_context *sctx)
{
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;

        sctx->trace_id++;
        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
        radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
        radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
                    S_370_WR_CONFIRM(1) |
                    S_370_ENGINE_SEL(V_370_ME));
        radeon_emit(cs, sctx->trace_buf->gpu_address);
        radeon_emit(cs, sctx->trace_buf->gpu_address >> 32);
        radeon_emit(cs, sctx->trace_id);
        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
        radeon_emit(cs, SI_ENCODE_TRACE_POINT(sctx->trace_id));
}