ac: correct PKT3_COPY_DATA definitions
android-x86/external-mesa.git: src/amd/vulkan/radv_query.c
1 /*
2  * Copyright © 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31
32 #include "nir/nir_builder.h"
33 #include "radv_meta.h"
34 #include "radv_private.h"
35 #include "radv_cs.h"
36 #include "sid.h"
37
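/* Timestamp query slots are filled with this sentinel on reset; any other
 * value means the GPU has written the timestamp and the query is available.
 */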
38 #define TIMESTAMP_NOT_READY UINT64_MAX
39
40 static const int pipelinestat_block_size = 11 * 8;
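/* Indexed by VkQueryPipelineStatisticFlagBits bit position; gives the 64-bit
 * counter slot used for that statistic inside a hardware pipelinestat block.
 */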
41 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
42
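/* Each occlusion query slot holds one 16-byte begin/end pair per render
 * backend (DB), so the DB count determines the per-query stride.
 */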
43 static unsigned get_max_db(struct radv_device *device)
44 {
45         unsigned num_db = device->physical_device->rad_info.num_render_backends;
46         MAYBE_UNUSED unsigned rb_mask = device->physical_device->rad_info.enabled_rb_mask;
47
48         /* Otherwise we need to change the query reset procedure */
49         assert(rb_mask == ((1ull << num_db) - 1));
50
51         return num_db;
52 }
53
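/* Emit "if (var >= count) break; var++;" at the builder cursor; used by the
 * query resolve shaders below to terminate their loops.
 */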
54 static void radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
55 {
56         nir_ssa_def *counter = nir_load_var(b, var);
57
58         nir_if *if_stmt = nir_if_create(b->shader);
59         if_stmt->condition = nir_src_for_ssa(nir_uge(b, counter, count));
60         nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
61
62         b->cursor = nir_after_cf_list(&if_stmt->then_list);
63
64         nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
65         nir_builder_instr_insert(b, &instr->instr);
66
67         b->cursor = nir_after_cf_node(&if_stmt->cf_node);
68         counter = nir_iadd(b, counter, nir_imm_int(b, 1));
69         nir_store_var(b, var, counter, 0x1);
70 }
71
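/* Load a single 32-bit value from the push constant block at the given byte
 * offset.
 */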
72 static struct nir_ssa_def *
73 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
74 {
75         nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
76         nir_intrinsic_set_base(flags, 0);
77         nir_intrinsic_set_range(flags, 16);
78         flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
79         flags->num_components = 1;
80         nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
81         nir_builder_instr_insert(b, &flags->instr);
82         return &flags->dest.ssa;
83 }
84
85 static nir_shader *
86 build_occlusion_query_shader(struct radv_device *device) {
87         /* the shader this builds is roughly
88          *
89          * push constants {
90          *      uint32_t flags;
91          *      uint32_t dst_stride;
92          * };
93          *
94          * uint32_t src_stride = 16 * db_count;
95          *
96          * layout(binding = 0) buffer dst_buf;
97          * layout(binding = 1) buffer src_buf;
98          *
99          * void main() {
100          *      uint64_t result = 0;
101          *      uint64_t src_offset = src_stride * global_id.x;
102          *      uint64_t dst_offset = dst_stride * global_id.x;
103          *      bool available = true;
104          *      for (int i = 0; i < db_count; ++i) {
105          *              uint64_t start = src_buf[src_offset + 16 * i];
106          *              uint64_t end = src_buf[src_offset + 16 * i + 8];
107          *              if ((start & (1ull << 63)) && (end & (1ull << 63)))
108          *                      result += end - start;
109          *              else
110          *                      available = false;
111          *      }
112          *      uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
113          *      if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
114          *              if (flags & VK_QUERY_RESULT_64_BIT)
115          *                      dst_buf[dst_offset] = result;
116          *              else
117          *                      dst_buf[dst_offset] = (uint32_t)result;
118          *      }
119          *      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
120          *              dst_buf[dst_offset + elem_size] = available;
121          *      }
122          * }
123          */
124         nir_builder b;
125         nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
126         b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
127         b.shader->info.cs.local_size[0] = 64;
128         b.shader->info.cs.local_size[1] = 1;
129         b.shader->info.cs.local_size[2] = 1;
130
131         nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
132         nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
133         nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
134         nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
135         nir_variable *available = nir_local_variable_create(b.impl, glsl_int_type(), "available");
136         unsigned db_count = get_max_db(device);
137
138         nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
139
140         nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
141                                                                   nir_intrinsic_vulkan_resource_index);
142         dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
143         nir_intrinsic_set_desc_set(dst_buf, 0);
144         nir_intrinsic_set_binding(dst_buf, 0);
145         nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL);
146         nir_builder_instr_insert(&b, &dst_buf->instr);
147
148         nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
149                                                                   nir_intrinsic_vulkan_resource_index);
150         src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
151         nir_intrinsic_set_desc_set(src_buf, 0);
152         nir_intrinsic_set_binding(src_buf, 1);
153         nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL);
154         nir_builder_instr_insert(&b, &src_buf->instr);
155
156         nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
157         nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
158         nir_ssa_def *block_size = nir_imm_ivec4(&b,
159                                                 b.shader->info.cs.local_size[0],
160                                                 b.shader->info.cs.local_size[1],
161                                                 b.shader->info.cs.local_size[2], 0);
162         nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
163         global_id = nir_channel(&b, global_id, 0); // We only care about x here.
164
165         nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
166         nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
167         nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
168         nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
169
170
171         nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
172         nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
173         nir_store_var(&b, available, nir_imm_int(&b, 1), 0x1);
174
175         nir_loop *outer_loop = nir_loop_create(b.shader);
176         nir_builder_cf_insert(&b, &outer_loop->cf_node);
177         b.cursor = nir_after_cf_list(&outer_loop->body);
178
179         nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
180         radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
181
182         nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
183         load_offset = nir_iadd(&b, input_base, load_offset);
184
185         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
186         load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
187         load->src[1] = nir_src_for_ssa(load_offset);
188         nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
189         load->num_components = 2;
190         nir_builder_instr_insert(&b, &load->instr);
191
192         nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
193         nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);
194
195         nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
196         nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
197
198         nir_if *update_if = nir_if_create(b.shader);
199         update_if->condition = nir_src_for_ssa(nir_iand(&b, start_done, end_done));
200         nir_cf_node_insert(b.cursor, &update_if->cf_node);
201
202         b.cursor = nir_after_cf_list(&update_if->then_list);
203
204         nir_store_var(&b, result,
205                       nir_iadd(&b, nir_load_var(&b, result),
206                                    nir_isub(&b, nir_load_var(&b, end),
207                                                 nir_load_var(&b, start))), 0x1);
208
209         b.cursor = nir_after_cf_list(&update_if->else_list);
210
211         nir_store_var(&b, available, nir_imm_int(&b, 0), 0x1);
212
213         b.cursor = nir_after_cf_node(&outer_loop->cf_node);
214
215         /* Store the result if complete or if partial results have been requested. */
216
217         nir_ssa_def *result_is_64bit = nir_iand(&b, flags,
218                                                 nir_imm_int(&b, VK_QUERY_RESULT_64_BIT));
219         nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
220
221         nir_if *store_if = nir_if_create(b.shader);
222         store_if->condition = nir_src_for_ssa(nir_ior(&b, nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_PARTIAL_BIT)), nir_load_var(&b, available)));
223         nir_cf_node_insert(b.cursor, &store_if->cf_node);
224
225         b.cursor = nir_after_cf_list(&store_if->then_list);
226
227         nir_if *store_64bit_if = nir_if_create(b.shader);
228         store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
229         nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
230
231         b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
232
233         nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
234         store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
235         store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
236         store->src[2] = nir_src_for_ssa(output_base);
237         nir_intrinsic_set_write_mask(store, 0x1);
238         store->num_components = 1;
239         nir_builder_instr_insert(&b, &store->instr);
240
241         b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
242
243         store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
244         store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
245         store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
246         store->src[2] = nir_src_for_ssa(output_base);
247         nir_intrinsic_set_write_mask(store, 0x1);
248         store->num_components = 1;
249         nir_builder_instr_insert(&b, &store->instr);
250
251         b.cursor = nir_after_cf_node(&store_if->cf_node);
252
253         /* Store the availability bit if requested. */
254
255         nir_if *availability_if = nir_if_create(b.shader);
256         availability_if->condition = nir_src_for_ssa(nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)));
257         nir_cf_node_insert(b.cursor, &availability_if->cf_node);
258
259         b.cursor = nir_after_cf_list(&availability_if->then_list);
260
261         store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
262         store->src[0] = nir_src_for_ssa(nir_load_var(&b, available));
263         store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
264         store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
265         nir_intrinsic_set_write_mask(store, 0x1);
266         store->num_components = 1;
267         nir_builder_instr_insert(&b, &store->instr);
268
269         return b.shader;
270 }
271
272 static nir_shader *
273 build_pipeline_statistics_query_shader(struct radv_device *device) {
274         /* the shader this builds is roughly
275          *
276          * push constants {
277          *      uint32_t flags;
278          *      uint32_t dst_stride;
279          *      uint32_t stats_mask;
280          *      uint32_t avail_offset;
281          * };
282          *
283          * uint32_t src_stride = pipelinestat_block_size * 2;
284          *
285          * layout(binding = 0) buffer dst_buf;
286          * layout(binding = 1) buffer src_buf;
287          *
288          * void main() {
289          *      uint64_t src_offset = src_stride * global_id.x;
290          *      uint64_t dst_base = dst_stride * global_id.x;
291          *      uint64_t dst_offset = dst_base;
292          *      uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
293          *      uint32_t elem_count = stats_mask >> 16;
294          *      uint32_t available = src_buf[avail_offset + 4 * global_id.x];
295          *      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
296          *              dst_buf[dst_offset + elem_count * elem_size] = available;
297          *      }
298          *      if (available) {
299          *              // repeat 11 times:
300          *              if (stats_mask & (1 << 0)) {
301          *                      uint64_t start = src_buf[src_offset + 8 * indices[0]];
302          *                      uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
303          *                      uint64_t result = end - start;
304          *                      if (flags & VK_QUERY_RESULT_64_BIT)
305          *                              dst_buf[dst_offset] = result;
306          *                      else
307          *                              dst_buf[dst_offset] = (uint32_t)result;
308          *                      dst_offset += elem_size;
309          *              }
310          *      } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
311          *              // Set everything to 0 as we don't know what is valid.
312          *              for (int i = 0; i < elem_count; ++i)
313          *                      dst_buf[dst_base + elem_size * i] = 0;
314          *      }
315          * }
316          */
317         nir_builder b;
318         nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
319         b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
320         b.shader->info.cs.local_size[0] = 64;
321         b.shader->info.cs.local_size[1] = 1;
322         b.shader->info.cs.local_size[2] = 1;
323
324         nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
325
326         nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
327         nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask");
328         nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset");
329
330         nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
331                                                                   nir_intrinsic_vulkan_resource_index);
332         dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
333         nir_intrinsic_set_desc_set(dst_buf, 0);
334         nir_intrinsic_set_binding(dst_buf, 0);
335         nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL);
336         nir_builder_instr_insert(&b, &dst_buf->instr);
337
338         nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
339                                                                   nir_intrinsic_vulkan_resource_index);
340         src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
341         nir_intrinsic_set_desc_set(src_buf, 0);
342         nir_intrinsic_set_binding(src_buf, 1);
343         nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL);
344         nir_builder_instr_insert(&b, &src_buf->instr);
345
346         nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
347         nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
348         nir_ssa_def *block_size = nir_imm_ivec4(&b,
349                                                 b.shader->info.cs.local_size[0],
350                                                 b.shader->info.cs.local_size[1],
351                                                 b.shader->info.cs.local_size[2], 0);
352         nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
353         global_id = nir_channel(&b, global_id, 0); // We only care about x here.
354
355         nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
356         nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
357         nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
358         nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
359
360
361         avail_offset = nir_iadd(&b, avail_offset,
362                                     nir_imul(&b, global_id, nir_imm_int(&b, 4)));
363
364         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
365         load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
366         load->src[1] = nir_src_for_ssa(avail_offset);
367         nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
368         load->num_components = 1;
369         nir_builder_instr_insert(&b, &load->instr);
370         nir_ssa_def *available = &load->dest.ssa;
371
372         nir_ssa_def *result_is_64bit = nir_iand(&b, flags,
373                                                 nir_imm_int(&b, VK_QUERY_RESULT_64_BIT));
374         nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
375         nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16));
376
377         /* Store the availability bit if requested. */
378
379         nir_if *availability_if = nir_if_create(b.shader);
380         availability_if->condition = nir_src_for_ssa(nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)));
381         nir_cf_node_insert(b.cursor, &availability_if->cf_node);
382
383         b.cursor = nir_after_cf_list(&availability_if->then_list);
384
385         nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
386         store->src[0] = nir_src_for_ssa(available);
387         store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
388         store->src[2] = nir_src_for_ssa(nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)));
389         nir_intrinsic_set_write_mask(store, 0x1);
390         store->num_components = 1;
391         nir_builder_instr_insert(&b, &store->instr);
392
393         b.cursor = nir_after_cf_node(&availability_if->cf_node);
394
395         nir_if *available_if = nir_if_create(b.shader);
396         available_if->condition = nir_src_for_ssa(available);
397         nir_cf_node_insert(b.cursor, &available_if->cf_node);
398
399         b.cursor = nir_after_cf_list(&available_if->then_list);
400
401         nir_store_var(&b, output_offset, output_base, 0x1);
402         for (int i = 0; i < 11; ++i) {
403                 nir_if *store_if = nir_if_create(b.shader);
404                 store_if->condition = nir_src_for_ssa(nir_iand(&b, stats_mask, nir_imm_int(&b, 1u << i)));
405                 nir_cf_node_insert(b.cursor, &store_if->cf_node);
406
407                 b.cursor = nir_after_cf_list(&store_if->then_list);
408
409                 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
410                 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
411                 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
412                                                             nir_imm_int(&b, pipeline_statistics_indices[i] * 8)));
413                 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
414                 load->num_components = 1;
415                 nir_builder_instr_insert(&b, &load->instr);
416                 nir_ssa_def *start = &load->dest.ssa;
417
418                 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
419                 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
420                 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
421                                                             nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)));
422                 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
423                 load->num_components = 1;
424                 nir_builder_instr_insert(&b, &load->instr);
425                 nir_ssa_def *end = &load->dest.ssa;
426
427                 nir_ssa_def *result = nir_isub(&b, end, start);
428
429                 /* Store result */
430                 nir_if *store_64bit_if = nir_if_create(b.shader);
431                 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
432                 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
433
434                 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
435
436                 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
437                 store->src[0] = nir_src_for_ssa(result);
438                 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
439                 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
440                 nir_intrinsic_set_write_mask(store, 0x1);
441                 store->num_components = 1;
442                 nir_builder_instr_insert(&b, &store->instr);
443
444                 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
445
446                 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
447                 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result));
448                 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
449                 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
450                 nir_intrinsic_set_write_mask(store, 0x1);
451                 store->num_components = 1;
452                 nir_builder_instr_insert(&b, &store->instr);
453
454                 b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);
455
456                 nir_store_var(&b, output_offset,
457                                   nir_iadd(&b, nir_load_var(&b, output_offset),
458                                                elem_size), 0x1);
459
460                 b.cursor = nir_after_cf_node(&store_if->cf_node);
461         }
462
463         b.cursor = nir_after_cf_list(&available_if->else_list);
464
465         available_if = nir_if_create(b.shader);
466         available_if->condition = nir_src_for_ssa(nir_iand(&b, flags,
467                                                                nir_imm_int(&b, VK_QUERY_RESULT_PARTIAL_BIT)));
468         nir_cf_node_insert(b.cursor, &available_if->cf_node);
469
470         b.cursor = nir_after_cf_list(&available_if->then_list);
471
472         /* Stores zeros in all outputs. */
473
474         nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
475         nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
476
477         nir_loop *loop = nir_loop_create(b.shader);
478         nir_builder_cf_insert(&b, &loop->cf_node);
479         b.cursor = nir_after_cf_list(&loop->body);
480
481         nir_ssa_def *current_counter = nir_load_var(&b, counter);
482         radv_break_on_count(&b, counter, elem_count);
483
484         nir_ssa_def *output_elem = nir_iadd(&b, output_base,
485                                                 nir_imul(&b, elem_size, current_counter));
486
487         nir_if *store_64bit_if = nir_if_create(b.shader);
488         store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
489         nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
490
491         b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
492
493         store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
494         store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0));
495         store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
496         store->src[2] = nir_src_for_ssa(output_elem);
497         nir_intrinsic_set_write_mask(store, 0x1);
498         store->num_components = 1;
499         nir_builder_instr_insert(&b, &store->instr);
500
501         b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
502
503         store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
504         store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
505         store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
506         store->src[2] = nir_src_for_ssa(output_elem);
507         nir_intrinsic_set_write_mask(store, 0x1);
508         store->num_components = 1;
509         nir_builder_instr_insert(&b, &store->instr);
510
511         b.cursor = nir_after_cf_node(&loop->cf_node);
512         return b.shader;
513 }
514
515 static VkResult radv_device_init_meta_query_state_internal(struct radv_device *device)
516 {
517         VkResult result;
518         struct radv_shader_module occlusion_cs = { .nir = NULL };
519         struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
520
521         mtx_lock(&device->meta_state.mtx);
522         if (device->meta_state.query.pipeline_statistics_query_pipeline) {
523                 mtx_unlock(&device->meta_state.mtx);
524                 return VK_SUCCESS;
525         }
526         occlusion_cs.nir = build_occlusion_query_shader(device);
527         pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
528
529         VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
530                 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
531                 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
532                 .bindingCount = 2,
533                 .pBindings = (VkDescriptorSetLayoutBinding[]) {
534                         {
535                                 .binding = 0,
536                                 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
537                                 .descriptorCount = 1,
538                                 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
539                                 .pImmutableSamplers = NULL
540                         },
541                         {
542                                 .binding = 1,
543                                 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
544                                 .descriptorCount = 1,
545                                 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
546                                 .pImmutableSamplers = NULL
547                         },
548                 }
549         };
550
551         result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
552                                                 &occlusion_ds_create_info,
553                                                 &device->meta_state.alloc,
554                                                 &device->meta_state.query.ds_layout);
555         if (result != VK_SUCCESS)
556                 goto fail;
557
558         VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
559                 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
560                 .setLayoutCount = 1,
561                 .pSetLayouts = &device->meta_state.query.ds_layout,
562                 .pushConstantRangeCount = 1,
563                 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
564         };
565
566         result = radv_CreatePipelineLayout(radv_device_to_handle(device),
567                                           &occlusion_pl_create_info,
568                                           &device->meta_state.alloc,
569                                           &device->meta_state.query.p_layout);
570         if (result != VK_SUCCESS)
571                 goto fail;
572
573         VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
574                 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
575                 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
576                 .module = radv_shader_module_to_handle(&occlusion_cs),
577                 .pName = "main",
578                 .pSpecializationInfo = NULL,
579         };
580
581         VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
582                 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
583                 .stage = occlusion_pipeline_shader_stage,
584                 .flags = 0,
585                 .layout = device->meta_state.query.p_layout,
586         };
587
588         result = radv_CreateComputePipelines(radv_device_to_handle(device),
589                                              radv_pipeline_cache_to_handle(&device->meta_state.cache),
590                                              1, &occlusion_vk_pipeline_info, NULL,
591                                              &device->meta_state.query.occlusion_query_pipeline);
592         if (result != VK_SUCCESS)
593                 goto fail;
594
595         VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
596                 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
597                 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
598                 .module = radv_shader_module_to_handle(&pipeline_statistics_cs),
599                 .pName = "main",
600                 .pSpecializationInfo = NULL,
601         };
602
603         VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
604                 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
605                 .stage = pipeline_statistics_pipeline_shader_stage,
606                 .flags = 0,
607                 .layout = device->meta_state.query.p_layout,
608         };
609
610         result = radv_CreateComputePipelines(radv_device_to_handle(device),
611                                              radv_pipeline_cache_to_handle(&device->meta_state.cache),
612                                              1, &pipeline_statistics_vk_pipeline_info, NULL,
613                                              &device->meta_state.query.pipeline_statistics_query_pipeline);
614
615 fail:
616         if (result != VK_SUCCESS)
617                 radv_device_finish_meta_query_state(device);
618         ralloc_free(occlusion_cs.nir);
619         ralloc_free(pipeline_statistics_cs.nir);
620         mtx_unlock(&device->meta_state.mtx);
621         return result;
622 }
623
624 VkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
625 {
626         if (on_demand)
627                 return VK_SUCCESS;
628
629         return radv_device_init_meta_query_state_internal(device);
630 }
631
632 void radv_device_finish_meta_query_state(struct radv_device *device)
633 {
634         if (device->meta_state.query.pipeline_statistics_query_pipeline)
635                 radv_DestroyPipeline(radv_device_to_handle(device),
636                                      device->meta_state.query.pipeline_statistics_query_pipeline,
637                                      &device->meta_state.alloc);
638
639         if (device->meta_state.query.occlusion_query_pipeline)
640                 radv_DestroyPipeline(radv_device_to_handle(device),
641                                      device->meta_state.query.occlusion_query_pipeline,
642                                      &device->meta_state.alloc);
643
644         if (device->meta_state.query.p_layout)
645                 radv_DestroyPipelineLayout(radv_device_to_handle(device),
646                                            device->meta_state.query.p_layout,
647                                            &device->meta_state.alloc);
648
649         if (device->meta_state.query.ds_layout)
650                 radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
651                                                 device->meta_state.query.ds_layout,
652                                                 &device->meta_state.alloc);
653 }
654
655 static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
656                               VkPipeline *pipeline,
657                               struct radeon_winsys_bo *src_bo,
658                               struct radeon_winsys_bo *dst_bo,
659                               uint64_t src_offset, uint64_t dst_offset,
660                               uint32_t src_stride, uint32_t dst_stride,
661                               uint32_t count, uint32_t flags,
662                               uint32_t pipeline_stats_mask, uint32_t avail_offset)
663 {
664         struct radv_device *device = cmd_buffer->device;
665         struct radv_meta_saved_state saved_state;
666
667         if (!*pipeline) {
668                 VkResult ret = radv_device_init_meta_query_state_internal(device);
669                 if (ret != VK_SUCCESS) {
670                         cmd_buffer->record_result = ret;
671                         return;
672                 }
673         }
674
675         radv_meta_save(&saved_state, cmd_buffer,
676                        RADV_META_SAVE_COMPUTE_PIPELINE |
677                        RADV_META_SAVE_CONSTANTS |
678                        RADV_META_SAVE_DESCRIPTORS);
679
680         struct radv_buffer dst_buffer = {
681                 .bo = dst_bo,
682                 .offset = dst_offset,
683                 .size = dst_stride * count
684         };
685
686         struct radv_buffer src_buffer = {
687                 .bo = src_bo,
688                 .offset = src_offset,
689                 .size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)
690         };
691
692         radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
693                              VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
694
695         radv_meta_push_descriptor_set(cmd_buffer,
696                                       VK_PIPELINE_BIND_POINT_COMPUTE,
697                                       device->meta_state.query.p_layout,
698                                       0, /* set */
699                                       2, /* descriptorWriteCount */
700                                       (VkWriteDescriptorSet[]) {
701                                               {
702                                                       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
703                                                       .dstBinding = 0,
704                                                       .dstArrayElement = 0,
705                                                       .descriptorCount = 1,
706                                                       .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
707                                                       .pBufferInfo = &(VkDescriptorBufferInfo) {
708                                                               .buffer = radv_buffer_to_handle(&dst_buffer),
709                                                               .offset = 0,
710                                                               .range = VK_WHOLE_SIZE
711                                                       }
712                                               },
713                                               {
714                                                       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
715                                                       .dstBinding = 1,
716                                                       .dstArrayElement = 0,
717                                                       .descriptorCount = 1,
718                                                       .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
719                                                       .pBufferInfo = &(VkDescriptorBufferInfo) {
720                                                               .buffer = radv_buffer_to_handle(&src_buffer),
721                                                               .offset = 0,
722                                                               .range = VK_WHOLE_SIZE
723                                                       }
724                                               }
725                                       });
726
727         /* Encode the number of elements for easy access by the shader. */
728         pipeline_stats_mask &= 0x7ff;
729         pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
730
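        /* Rebase the availability offset so it is relative to the src_buf
         * binding, which starts at src_offset. */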
731         avail_offset -= src_offset;
732
733         struct {
734                 uint32_t flags;
735                 uint32_t dst_stride;
736                 uint32_t pipeline_stats_mask;
737                 uint32_t avail_offset;
738         } push_constants = {
739                 flags,
740                 dst_stride,
741                 pipeline_stats_mask,
742                 avail_offset
743         };
744
745         radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
746                                       device->meta_state.query.p_layout,
747                                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
748                                       &push_constants);
749
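        /* Make sure the resolve shader sees the query data in memory and, if
         * the caller asked to wait, that in-flight framebuffer writes have
         * landed first. */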
750         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
751                                         RADV_CMD_FLAG_INV_VMEM_L1;
752
753         if (flags & VK_QUERY_RESULT_WAIT_BIT)
754                 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
755
756         radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
757
758         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
759                                         RADV_CMD_FLAG_INV_VMEM_L1 |
760                                         RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
761
762         radv_meta_restore(&saved_state, cmd_buffer);
763 }
764
765 VkResult radv_CreateQueryPool(
766         VkDevice                                    _device,
767         const VkQueryPoolCreateInfo*                pCreateInfo,
768         const VkAllocationCallbacks*                pAllocator,
769         VkQueryPool*                                pQueryPool)
770 {
771         RADV_FROM_HANDLE(radv_device, device, _device);
772         struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator,
773                                                sizeof(*pool), 8,
774                                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
775         uint32_t initial_value = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP
776                                  ? TIMESTAMP_NOT_READY : 0;
777
778         if (!pool)
779                 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
780
781
782         switch(pCreateInfo->queryType) {
783         case VK_QUERY_TYPE_OCCLUSION:
784                 pool->stride = 16 * get_max_db(device);
785                 break;
786         case VK_QUERY_TYPE_PIPELINE_STATISTICS:
787                 pool->stride = pipelinestat_block_size * 2;
788                 break;
789         case VK_QUERY_TYPE_TIMESTAMP:
790                 pool->stride = 8;
791                 break;
792         default:
793                 unreachable("creating unhandled query type");
794         }
795
796         pool->type = pCreateInfo->queryType;
797         pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
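        /* Pipeline statistics queries keep one extra 32-bit availability word
         * per query, placed after all the result slots. */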
798         pool->availability_offset = pool->stride * pCreateInfo->queryCount;
799         pool->size = pool->availability_offset;
800         if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
801                 pool->size += 4 * pCreateInfo->queryCount;
802
803         pool->bo = device->ws->buffer_create(device->ws, pool->size,
804                                              64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING);
805
806         if (!pool->bo) {
807                 vk_free2(&device->alloc, pAllocator, pool);
808                 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
809         }
810
811         pool->ptr = device->ws->buffer_map(pool->bo);
812
813         if (!pool->ptr) {
814                 device->ws->buffer_destroy(pool->bo);
815                 vk_free2(&device->alloc, pAllocator, pool);
816                 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
817         }
818         memset(pool->ptr, initial_value, pool->size);
819
820         *pQueryPool = radv_query_pool_to_handle(pool);
821         return VK_SUCCESS;
822 }
823
824 void radv_DestroyQueryPool(
825         VkDevice                                    _device,
826         VkQueryPool                                 _pool,
827         const VkAllocationCallbacks*                pAllocator)
828 {
829         RADV_FROM_HANDLE(radv_device, device, _device);
830         RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
831
832         if (!pool)
833                 return;
834
835         device->ws->buffer_destroy(pool->bo);
836         vk_free2(&device->alloc, pAllocator, pool);
837 }
838
839 VkResult radv_GetQueryPoolResults(
840         VkDevice                                    _device,
841         VkQueryPool                                 queryPool,
842         uint32_t                                    firstQuery,
843         uint32_t                                    queryCount,
844         size_t                                      dataSize,
845         void*                                       pData,
846         VkDeviceSize                                stride,
847         VkQueryResultFlags                          flags)
848 {
849         RADV_FROM_HANDLE(radv_device, device, _device);
850         RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
851         char *data = pData;
852         VkResult result = VK_SUCCESS;
853
854         for(unsigned i = 0; i < queryCount; ++i, data += stride) {
855                 char *dest = data;
856                 unsigned query = firstQuery + i;
857                 char *src = pool->ptr + query * pool->stride;
858                 uint32_t available;
859
860                 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
861                         if (flags & VK_QUERY_RESULT_WAIT_BIT)
862                                 while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
863                                         ;
864                         available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
865                 }
866
867                 switch (pool->type) {
868                 case VK_QUERY_TYPE_TIMESTAMP: {
869                         available = *(uint64_t *)src != TIMESTAMP_NOT_READY;
870
871                         if (flags & VK_QUERY_RESULT_WAIT_BIT) {
872                                 while (*(volatile uint64_t *)src == TIMESTAMP_NOT_READY)
873                                         ;
874                                 available = *(uint64_t *)src != TIMESTAMP_NOT_READY;
875                         }
876
877                         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
878                                 result = VK_NOT_READY;
879                                 break;
880
881                         }
882
883                         if (flags & VK_QUERY_RESULT_64_BIT) {
884                                 *(uint64_t*)dest = *(uint64_t*)src;
885                                 dest += 8;
886                         } else {
887                                 *(uint32_t*)dest = *(uint32_t*)src;
888                                 dest += 4;
889                         }
890                         break;
891                 }
892                 case VK_QUERY_TYPE_OCCLUSION: {
893                         volatile uint64_t const *src64 = (volatile uint64_t const *)src;
894                         uint64_t sample_count = 0;
895                         int db_count = get_max_db(device);
896                         available = 1;
897
898                         for (int i = 0; i < db_count; ++i) {
899                                 uint64_t start, end;
900                                 do {
901                                         start = src64[2 * i];
902                                         end = src64[2 * i + 1];
903                                 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
904
905                                 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
906                                         available = 0;
907                                 else {
908                                         sample_count += end - start;
909                                 }
910                         }
911
912                         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
913                                 result = VK_NOT_READY;
914                                 break;
915
916                         }
917
918                         if (flags & VK_QUERY_RESULT_64_BIT) {
919                                 *(uint64_t*)dest = sample_count;
920                                 dest += 8;
921                         } else {
922                                 *(uint32_t*)dest = sample_count;
923                                 dest += 4;
924                         }
925                         break;
926                 }
927                 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
928                         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
929                                 result = VK_NOT_READY;
930                                 break;
931
932                         }
933
934                         const uint64_t *start = (uint64_t*)src;
935                         const uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size);
936                         if (flags & VK_QUERY_RESULT_64_BIT) {
937                                 uint64_t *dst = (uint64_t*)dest;
938                                 dest += util_bitcount(pool->pipeline_stats_mask) * 8;
939                                 for(int i = 0; i < 11; ++i)
940                                         if(pool->pipeline_stats_mask & (1u << i))
941                                                 *dst++ = stop[pipeline_statistics_indices[i]] -
942                                                          start[pipeline_statistics_indices[i]];
943
944                         } else {
945                                 uint32_t *dst = (uint32_t*)dest;
946                                 dest += util_bitcount(pool->pipeline_stats_mask) * 4;
947                                 for(int i = 0; i < 11; ++i)
948                                         if(pool->pipeline_stats_mask & (1u << i))
949                                                 *dst++ = stop[pipeline_statistics_indices[i]] -
950                                                          start[pipeline_statistics_indices[i]];
951                         }
952                         break;
953                 }
954                 default:
955                         unreachable("trying to get results of unhandled query type");
956                 }
957
958                 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
959                         if (flags & VK_QUERY_RESULT_64_BIT) {
960                                 *(uint64_t*)dest = available;
961                         } else {
962                                 *(uint32_t*)dest = available;
963                         }
964                 }
965         }
966
967         return result;
968 }
969
970 void radv_CmdCopyQueryPoolResults(
971     VkCommandBuffer                             commandBuffer,
972     VkQueryPool                                 queryPool,
973     uint32_t                                    firstQuery,
974     uint32_t                                    queryCount,
975     VkBuffer                                    dstBuffer,
976     VkDeviceSize                                dstOffset,
977     VkDeviceSize                                stride,
978     VkQueryResultFlags                          flags)
979 {
980         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
981         RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
982         RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
983         struct radeon_cmdbuf *cs = cmd_buffer->cs;
984         unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
985         uint64_t va = radv_buffer_get_va(pool->bo);
986         uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
987         dest_va += dst_buffer->offset + dstOffset;
988
989         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
990         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
991
992         switch (pool->type) {
993         case VK_QUERY_TYPE_OCCLUSION:
994                 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
995                         for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
996                                 unsigned query = firstQuery + i;
997                                 uint64_t src_va = va + query * pool->stride + pool->stride - 4;
998
999                                 /* Waits on the upper word of the last DB entry */
1000                                 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
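                                /* function 5 = wait for ">=" reference value,
                                 * MEM_SPACE(1) = poll memory */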
1001                                 radeon_emit(cs, 5 | WAIT_REG_MEM_MEM_SPACE(1));
1002                                 radeon_emit(cs, src_va);
1003                                 radeon_emit(cs, src_va >> 32);
1004                                 radeon_emit(cs, 0x80000000); /* reference value */
1005                                 radeon_emit(cs, 0xffffffff); /* mask */
1006                                 radeon_emit(cs, 4); /* poll interval */
1007                         }
1008                 }
1009                 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
1010                                   pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1011                                   dst_buffer->offset + dstOffset,
1012                                   get_max_db(cmd_buffer->device) * 16, stride,
1013                                   queryCount, flags, 0, 0);
1014                 break;
1015         case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1016                 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1017                         for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1018                                 unsigned query = firstQuery + i;
1019
1020                                 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1021
1022                                 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1023
1024                                 /* This waits on the ME. All copies below are done on the ME. */
1025                                 si_emit_wait_fence(cs, avail_va, 1, 0xffffffff);
1026                         }
1027                 }
1028                 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1029                                   pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1030                                   dst_buffer->offset + dstOffset,
1031                                   pipelinestat_block_size * 2, stride, queryCount, flags,
1032                                   pool->pipeline_stats_mask,
1033                                   pool->availability_offset + 4 * firstQuery);
1034                 break;
1035         case VK_QUERY_TYPE_TIMESTAMP:
1036                 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1037                         unsigned query = firstQuery + i;
1038                         uint64_t local_src_va = va + query * pool->stride;
1039
1040                         MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 19);
1041
1042
1043                         if (flags & VK_QUERY_RESULT_WAIT_BIT) {
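                                /* Make the CP wait until the timestamp slot no
                                 * longer reads as TIMESTAMP_NOT_READY. */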
1044                                 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false));
1045                                 radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
1046                                 radeon_emit(cs, local_src_va);
1047                                 radeon_emit(cs, local_src_va >> 32);
1048                                 radeon_emit(cs, TIMESTAMP_NOT_READY >> 32);
1049                                 radeon_emit(cs, 0xffffffff);
1050                                 radeon_emit(cs, 4);
1051                         }
1052                         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1053                                 uint64_t avail_dest_va = dest_va + elem_size;
1054
1055                                 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1056                                 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1057                                                 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM));
1058                                 radeon_emit(cs, local_src_va);
1059                                 radeon_emit(cs, local_src_va >> 32);
1060                                 radeon_emit(cs, avail_dest_va);
1061                                 radeon_emit(cs, avail_dest_va >> 32);
1062                         }
1063
1064                         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1065                         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1066                                         COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
1067                                         ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0));
1068                         radeon_emit(cs, local_src_va);
1069                         radeon_emit(cs, local_src_va >> 32);
1070                         radeon_emit(cs, dest_va);
1071                         radeon_emit(cs, dest_va >> 32);
1072
1073
1074                         assert(cs->cdw <= cdw_max);
1075                 }
1076                 break;
1077         default:
1078                 unreachable("trying to get results of unhandled query type");
1079         }
1080
1081 }
1082
1083 void radv_CmdResetQueryPool(
1084         VkCommandBuffer                             commandBuffer,
1085         VkQueryPool                                 queryPool,
1086         uint32_t                                    firstQuery,
1087         uint32_t                                    queryCount)
1088 {
1089         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1090         RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
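        /* Timestamp pools are reset to the not-ready sentinel; all other pool
         * types are zeroed. */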
1091         uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
1092                          ? TIMESTAMP_NOT_READY : 0;
1093         uint32_t flush_bits = 0;
1094
1095         flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1096                                        firstQuery * pool->stride,
1097                                        queryCount * pool->stride, value);
1098
1099         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1100                 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1101                                                pool->availability_offset + firstQuery * 4,
1102                                                queryCount * 4, 0);
1103         }
1104
1105         if (flush_bits) {
1106                 /* Only need to flush caches for the compute shader path. */
1107                 cmd_buffer->pending_reset_query = true;
1108                 cmd_buffer->state.flush_bits |= flush_bits;
1109         }
1110 }
1111
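/* Emit the packets that start sampling one query slot at `va`. Occlusion
 * queries snapshot the per-RB Z-pass counters with a ZPASS_DONE event;
 * pipeline statistics queries snapshot the stat counters with a
 * SAMPLE_PIPELINESTAT event.
 */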
1112 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
1113                              uint64_t va,
1114                              VkQueryType query_type,
1115                              VkQueryControlFlags flags)
1116 {
1117         struct radeon_cmdbuf *cs = cmd_buffer->cs;
1118         switch (query_type) {
1119         case VK_QUERY_TYPE_OCCLUSION:
1120                 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1121
1122                 ++cmd_buffer->state.active_occlusion_queries;
1123                 if (cmd_buffer->state.active_occlusion_queries == 1) {
1124                         if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
1125                                 /* This is the first occlusion query; enable
1126                                  * the hint if the precise bit is set.
1127                                  */
1128                                 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1129                         }
1130
1131                         radv_set_db_count_control(cmd_buffer);
1132                 } else {
1133                         if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1134                             !cmd_buffer->state.perfect_occlusion_queries_enabled) {
1135                                 /* This is not the first query, but this one
1136                                  * needs precise results, so DB_COUNT_CONTROL
1137                                  * has to be updated accordingly.
1138                                  */
1139                                 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1140
1141                                 radv_set_db_count_control(cmd_buffer);
1142                         }
1143                 }
1144
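		/* ZPASS_DONE makes every render backend write its Z-pass counter
		 * snapshot into the begin half of this query slot.
		 */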
1145                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1146                 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1147                 radeon_emit(cs, va);
1148                 radeon_emit(cs, va >> 32);
1149                 break;
1150         case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1151                 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1152
1153                 ++cmd_buffer->state.active_pipeline_queries;
1154                 if (cmd_buffer->state.active_pipeline_queries == 1) {
1155                         cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1156                         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1157                 }
1158
1159                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1160                 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1161                 radeon_emit(cs, va);
1162                 radeon_emit(cs, va >> 32);
1163                 break;
1164         default:
1165                 unreachable("beginning unhandled query type");
1166         }
1167
1168 }
1169
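/* Emit the packets that finish one query slot. The end snapshot lands next
 * to the begin snapshot (va + 8 for each occlusion counter pair, va +
 * pipelinestat_block_size for pipeline statistics), and pipeline statistics
 * queries additionally get a 1 written to their availability word via an
 * end-of-pipe event.
 */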
1170 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
1171                            uint64_t va, uint64_t avail_va,
1172                            VkQueryType query_type)
1173 {
1174         struct radeon_cmdbuf *cs = cmd_buffer->cs;
1175         switch (query_type) {
1176         case VK_QUERY_TYPE_OCCLUSION:
1177                 radeon_check_space(cmd_buffer->device->ws, cs, 14);
1178
1179                 cmd_buffer->state.active_occlusion_queries--;
1180                 if (cmd_buffer->state.active_occlusion_queries == 0) {
1181                         radv_set_db_count_control(cmd_buffer);
1182
1183                         /* Reset the perfect occlusion queries hint now that no
1184                          * queries are active.
1185                          */
1186                         cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1187                 }
1188
1189                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1190                 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1191                 radeon_emit(cs, va + 8);
1192                 radeon_emit(cs, (va + 8) >> 32);
1193
1194                 break;
1195         case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1196                 radeon_check_space(cmd_buffer->device->ws, cs, 16);
1197
1198                 cmd_buffer->state.active_pipeline_queries--;
1199                 if (cmd_buffer->state.active_pipeline_queries == 0) {
1200                         cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1201                         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1202                 }
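		/* The end-of-query counters live right after the begin counters
		 * within the slot, hence the pipelinestat_block_size offset.
		 */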
1203                 va += pipelinestat_block_size;
1204
1205                 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1206                 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1207                 radeon_emit(cs, va);
1208                 radeon_emit(cs, va >> 32);
1209
1210                 si_cs_emit_write_event_eop(cs,
1211                                            cmd_buffer->device->physical_device->rad_info.chip_class,
1212                                            radv_cmd_buffer_uses_mec(cmd_buffer),
1213                                            V_028A90_BOTTOM_OF_PIPE_TS, 0,
1214                                            EOP_DATA_SEL_VALUE_32BIT,
1215                                            avail_va, 0, 1,
1216                                            cmd_buffer->gfx9_eop_bug_va);
1217                 break;
1218         default:
1219                 unreachable("ending unhandled query type");
1220         }
1221 }
1222
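/* vkCmdBeginQuery: if a pool reset is still pending on the compute-shader
 * path, flush caches first so the query packets observe the cleared memory,
 * then emit the begin packets for the selected slot.
 */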
1223 void radv_CmdBeginQuery(
1224     VkCommandBuffer                             commandBuffer,
1225     VkQueryPool                                 queryPool,
1226     uint32_t                                    query,
1227     VkQueryControlFlags                         flags)
1228 {
1229         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1230         RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1231         struct radeon_cmdbuf *cs = cmd_buffer->cs;
1232         uint64_t va = radv_buffer_get_va(pool->bo);
1233
1234         radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1235
1236         if (cmd_buffer->pending_reset_query) {
1237                 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1238                         /* Only need to flush caches if the query pool size is
1239                          * large enough to be reset using the compute shader
1240                          * path. Small pools don't need any cache flushes
1241                          * because we use a CP DMA clear.
1242                          */
1243                         si_emit_cache_flush(cmd_buffer);
1244                         cmd_buffer->pending_reset_query = false;
1245                 }
1246         }
1247
1248         va += pool->stride * query;
1249
1250         emit_begin_query(cmd_buffer, va, pool->type, flags);
1251 }
1252
1253
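/* vkCmdEndQuery: finish the query. With multiview enabled, extra dummy
 * begin/end pairs are emitted (see the comment below) so every per-view
 * slot becomes available.
 */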
1254 void radv_CmdEndQuery(
1255     VkCommandBuffer                             commandBuffer,
1256     VkQueryPool                                 queryPool,
1257     uint32_t                                    query)
1258 {
1259         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1260         RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1261         uint64_t va = radv_buffer_get_va(pool->bo);
1262         uint64_t avail_va = va + pool->availability_offset + 4 * query;
1263         va += pool->stride * query;
1264
1265         /* We do not need to add the pool BO to the list because the query must
1266          * currently be active, which means the BO is already in the list.
1267          */
1268         emit_end_query(cmd_buffer, va, avail_va, pool->type);
1269
1270         /*
1271          * For multiview we have to emit a query for each bit in the mask,
1272          * however the first query we emit will get the totals for all the
1273          * operations, so we don't want to get a real value in the other
1274          * queries. This emits a fake begin/end sequence so the waiting
1275          * code gets a completed query value and doesn't hang, but the
1276          * query returns 0.
1277          */
1278         if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
		/* Reuse avail_va computed above; it already points at this
		 * query's availability slot.
		 */
1281
1282                 for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
1283                         va += pool->stride;
1284                         avail_va += 4;
1285                         emit_begin_query(cmd_buffer, va, pool->type, 0);
1286                         emit_end_query(cmd_buffer, va, avail_va, pool->type);
1287                 }
1288         }
1289 }
1290
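/* vkCmdWriteTimestamp: write the GPU clock into the query slot, one slot
 * per active view when multiview is enabled.
 */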
1291 void radv_CmdWriteTimestamp(
1292     VkCommandBuffer                             commandBuffer,
1293     VkPipelineStageFlagBits                     pipelineStage,
1294     VkQueryPool                                 queryPool,
1295     uint32_t                                    query)
1296 {
1297         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1298         RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1299         bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
1300         struct radeon_cmdbuf *cs = cmd_buffer->cs;
1301         uint64_t va = radv_buffer_get_va(pool->bo);
1302         uint64_t query_va = va + pool->stride * query;
1303
1304         radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1305
1306         int num_queries = 1;
1307         if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
1308                 num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
1309
1310         MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
1311
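	/* One timestamp per active view. TOP_OF_PIPE is implemented as an
	 * immediate CP copy of the GPU clock; every other stage is treated as
	 * bottom-of-pipe and uses an EOP timestamp event.
	 */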
1312         for (unsigned i = 0; i < num_queries; i++) {
1313                 switch (pipelineStage) {
1314                 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1315                         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1316                         radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
1317                                     COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
1318                                     COPY_DATA_DST_SEL(V_370_MEM_ASYNC));
1319                         radeon_emit(cs, 0);
1320                         radeon_emit(cs, 0);
1321                         radeon_emit(cs, query_va);
1322                         radeon_emit(cs, query_va >> 32);
1323                         break;
1324                 default:
1325                         si_cs_emit_write_event_eop(cs,
1326                                                    cmd_buffer->device->physical_device->rad_info.chip_class,
1327                                                    mec,
1328                                                    V_028A90_BOTTOM_OF_PIPE_TS, 0,
1329                                                    EOP_DATA_SEL_TIMESTAMP,
1330                                                    query_va, 0, 0,
1331                                                    cmd_buffer->gfx9_eop_bug_va);
1332                         break;
1333                 }
1334                 query_va += pool->stride;
1335         }
1336         assert(cmd_buffer->cs->cdw <= cdw_max);
1337 }