/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK   0

/* Get the backend mask */
void r600_get_backend_mask(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        struct r600_resource *buffer;
        uint32_t *results;
        unsigned num_backends = ctx->screen->info.r600_num_backends;
        unsigned i, mask = 0;

        /* if the backend_map query is supported by the kernel */
        if (ctx->screen->info.r600_backend_map_valid) {
                unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
                unsigned backend_map = ctx->screen->info.r600_backend_map;
                unsigned item_width, item_mask;

                if (ctx->chip_class >= CAYMAN) {
                        item_width = 4;
                        item_mask = 0x7;
                } else {
                        /* pre-Cayman encoding: two bits per tile pipe
                         * (values as in the r600g version of this function;
                         * never reached on SI, but avoids reading item_width
                         * and item_mask uninitialized) */
                        item_width = 2;
                        item_mask = 0x3;
                }

                while (num_tile_pipes--) {
                        i = backend_map & item_mask;
                        mask |= (1 << i);
                        backend_map >>= item_width;
                }
                if (mask != 0) {
                        ctx->backend_mask = mask;
                        return;
                }
        }

        /* otherwise take the fallback path for older kernels */

        /* create buffer for event data */
        buffer = (struct r600_resource*)
                pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
                                   PIPE_USAGE_STAGING, ctx->max_db*16);
        if (!buffer)
                goto err;

        /* initialize buffer with zeroes */
        results = ctx->ws->buffer_map(buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
        if (results) {
                uint64_t va = 0;

                memset(results, 0, ctx->max_db * 4 * 4);
                ctx->ws->buffer_unmap(buffer->buf);

                /* emit EVENT_WRITE for ZPASS_DONE */
                va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = va >> 32;

                cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

                /* analyze results */
                results = ctx->ws->buffer_map(buffer->buf, ctx->cs, PIPE_TRANSFER_READ);
                if (results) {
                        for (i = 0; i < ctx->max_db; i++) {
                                /* at least the highest bit will be set if the backend is used */
                                if (results[i*4 + 1])
                                        mask |= (1 << i);
                        }
                        ctx->ws->buffer_unmap(buffer->buf);
                }
        }

        pipe_resource_reference((struct pipe_resource**)&buffer, NULL);

        if (mask != 0) {
                ctx->backend_mask = mask;
                return;
        }

err:
        /* fall back to the old method - set the num_backends lowest bits to 1 */
        ctx->backend_mask = (~((uint32_t)0)) >> (32 - num_backends);
        return;
}
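
/*
 * Illustrative sketch, not driver code: how the backend_map decoding above
 * works.  With the Cayman-style encoding (four bits per tile pipe, the low
 * three bits holding the backend index), a hypothetical map of 0x3210 for
 * four tile pipes selects backends 0, 1, 2 and 3, i.e. a mask of 0xf.  The
 * values below are made up for the example.
 */
#if 0
static unsigned decode_backend_mask_example(void)
{
        unsigned backend_map = 0x3210;  /* hypothetical kernel-reported map */
        unsigned num_tile_pipes = 4;
        unsigned item_width = 4, item_mask = 0x7;
        unsigned mask = 0;

        while (num_tile_pipes--) {
                mask |= 1 << (backend_map & item_mask);
                backend_map >>= item_width;
        }
        return mask;  /* 0xf for this example */
}
#endif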

static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;

        if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
                return;

        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

        ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
}

void r600_init_cs(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;

        /* All asics require this one */
        cs->buf[cs->cdw++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0);
        cs->buf[cs->cdw++] = 0x80000000;
        cs->buf[cs->cdw++] = 0x80000000;

        ctx->init_dwords = cs->cdw;
}

static void r600_init_block(struct r600_context *ctx,
                            struct r600_block *block,
                            const struct r600_reg *reg, int index, int nreg,
                            unsigned opcode, unsigned offset_base)
{
        int i = index;
        int j, n = nreg;

        /* initialize block */
        block->flags = 0;
        block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */
        block->start_offset = reg[i].offset;
        block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0);
        block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2;
        block->reg = &block->pm4[block->pm4_ndwords];
        block->pm4_ndwords += n;
        block->nreg = n;
        block->nreg_dirty = n;
        LIST_INITHEAD(&block->list);
        LIST_INITHEAD(&block->enable_list);

        for (j = 0; j < n; j++) {
                if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) {
                        block->flags |= REG_FLAG_DIRTY_ALWAYS;
                }
                if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) {
                        if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
                                block->status |= R600_BLOCK_STATUS_ENABLED;
                                LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
                                LIST_ADDTAIL(&block->list, &ctx->dirty);
                        }
                }
                if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) {
                        block->flags |= REG_FLAG_FLUSH_CHANGE;
                }

                if (reg[i+j].flags & REG_FLAG_NEED_BO) {
                        block->nbo++;
                        assert(block->nbo < R600_BLOCK_MAX_BO);
                        block->pm4_bo_index[j] = block->nbo;
                        block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0);
                        block->pm4[block->pm4_ndwords++] = 0x00000000;
                        block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1;
                }
        }
        /* check that we stay within the limit */
        assert(block->pm4_ndwords < R600_BLOCK_MAX_REG);
}

int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
                           unsigned opcode, unsigned offset_base)
{
        struct r600_block *block;
        struct r600_range *range;
        int offset;

        for (unsigned i = 0, n = 0; i < nreg; i += n) {
                /* ignore new-block markers */
                if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) {
                        n = 1;
                        continue;
                }

                /* registers that need relocation are in their own group */
                /* find number of consecutive registers */
                n = 0;
                offset = reg[i].offset;
                while (reg[i + n].offset == offset) {
                        n++;
                        offset += 4;
                        if ((n + i) >= nreg)
                                break;
                        if (n >= (R600_BLOCK_MAX_REG - 2))
                                break;
                }

                /* allocate new block */
                block = calloc(1, sizeof(struct r600_block));
                if (block == NULL) {
                        return -ENOMEM;
                }
                ctx->nblocks++;
                for (int j = 0; j < n; j++) {
                        range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)];
                        /* create block table if it doesn't exist */
                        if (!range->blocks)
                                range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *));
                        if (!range->blocks)
                                return -ENOMEM;

                        range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block;
                }

                r600_init_block(ctx, block, reg, i, n, opcode, offset_base);

        }
        return 0;
}
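
/*
 * Illustrative sketch, not driver code: the two-level table walk behind the
 * register lookups above.  CTX_RANGE_ID and CTX_BLOCK_ID are defined in
 * r600_hw_context_priv.h; the sketch only shows how they are used, with a
 * NULL check that the hot paths below omit.
 */
#if 0
static struct r600_block *example_block_lookup(struct r600_context *ctx,
                                               unsigned offset)
{
        struct r600_range *range = &ctx->range[CTX_RANGE_ID(offset)];

        /* every register of a block points back at the same block */
        return range->blocks ? range->blocks[CTX_BLOCK_ID(offset)] : NULL;
}
#endif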


/* cleanup */
void r600_context_fini(struct r600_context *ctx)
{
        struct r600_block *block;
        struct r600_range *range;

        for (int i = 0; i < NUM_RANGES; i++) {
                if (!ctx->range[i].blocks)
                        continue;
                for (int j = 0; j < (1 << HASH_SHIFT); j++) {
                        block = ctx->range[i].blocks[j];
                        if (block) {
                                for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) {
                                        range = &ctx->range[CTX_RANGE_ID(offset)];
                                        range->blocks[CTX_BLOCK_ID(offset)] = NULL;
                                }
                                for (int k = 1; k <= block->nbo; k++) {
                                        pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL);
                                }
                                free(block);
                        }
                }
                free(ctx->range[i].blocks);
        }
        free(ctx->range);
        free(ctx->blocks);
        ctx->ws->cs_destroy(ctx->cs);
}

int r600_setup_block_table(struct r600_context *ctx)
{
        /* setup block table */
        int c = 0;
        ctx->blocks = calloc(ctx->nblocks, sizeof(void*));
        if (!ctx->blocks)
                return -ENOMEM;
        for (int i = 0; i < NUM_RANGES; i++) {
                if (!ctx->range[i].blocks)
                        continue;
                for (int j = 0, add; j < (1 << HASH_SHIFT); j++) {
                        if (!ctx->range[i].blocks[j])
                                continue;

                        add = 1;
                        for (int k = 0; k < c; k++) {
                                if (ctx->blocks[k] == ctx->range[i].blocks[j]) {
                                        add = 0;
                                        break;
                                }
                        }
                        if (add) {
                                assert(c < ctx->nblocks);
                                ctx->blocks[c++] = ctx->range[i].blocks[j];
                                j += (ctx->range[i].blocks[j]->nreg) - 1;
                        }
                }
        }

        return 0;
}

void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
                        boolean count_draw_in)
{
        struct r600_atom *state;

        /* The number of dwords we already used in the CS so far. */
        num_dw += ctx->cs->cdw;

        if (count_draw_in) {
                /* The number of dwords all the dirty states would take. */
                LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
                        num_dw += state->num_dw;
                }

                num_dw += ctx->pm4_dirty_cdwords;

                /* The upper-bound of how much a draw command would take. */
                num_dw += R600_MAX_DRAW_CS_DWORDS;
        }

        /* Count in queries_suspend. */
        num_dw += ctx->num_cs_dw_queries_suspend;

        /* Count in streamout_end at the end of CS. */
        num_dw += ctx->num_cs_dw_streamout_end;

        /* Count in render_condition(NULL) at the end of CS. */
        if (ctx->predicate_drawing) {
                num_dw += 3;
        }

        /* Count in framebuffer cache flushes at the end of CS. */
        num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

        /* Save 16 dwords for the fence mechanism. */
        num_dw += 16;

        /* Flush if there's not enough space. */
        if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
                radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
        }
}
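
/*
 * Illustrative sketch, not driver code: callers reserve space before writing
 * packets, so the CS can never overflow mid-packet.  The dword count below
 * is made up for the example.
 */
#if 0
static void example_emit(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;

        /* ask for two dwords; this flushes the CS first if it would overflow */
        r600_need_cs_space(ctx, 2, FALSE);

        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
}
#endif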

void r600_context_dirty_block(struct r600_context *ctx,
                              struct r600_block *block,
                              int dirty, int index)
{
        if ((index + 1) > block->nreg_dirty)
                block->nreg_dirty = index + 1;

        if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) {
                block->status |= R600_BLOCK_STATUS_DIRTY;
                ctx->pm4_dirty_cdwords += block->pm4_ndwords;
                if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
                        block->status |= R600_BLOCK_STATUS_ENABLED;
                        LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
                }
                LIST_ADDTAIL(&block->list, &ctx->dirty);

                if (block->flags & REG_FLAG_FLUSH_CHANGE) {
                        r600_context_ps_partial_flush(ctx);
                }
        }
}

void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state)
{
        struct r600_block *block;
        int dirty;
        for (int i = 0; i < state->nregs; i++) {
                unsigned id, reloc_id;
                struct r600_pipe_reg *reg = &state->regs[i];

                block = reg->block;
                id = reg->id;

                dirty = block->status & R600_BLOCK_STATUS_DIRTY;

                if (reg->value != block->reg[id]) {
                        block->reg[id] = reg->value;
                        dirty |= R600_BLOCK_STATUS_DIRTY;
                }
                if (block->flags & REG_FLAG_DIRTY_ALWAYS)
                        dirty |= R600_BLOCK_STATUS_DIRTY;
                if (block->pm4_bo_index[id]) {
                        /* find relocation */
                        reloc_id = block->pm4_bo_index[id];
                        pipe_resource_reference((struct pipe_resource**)&block->reloc[reloc_id].bo, &reg->bo->b.b.b);
                        block->reloc[reloc_id].bo_usage = reg->bo_usage;
                        /* always force dirty for relocs for now */
                        dirty |= R600_BLOCK_STATUS_DIRTY;
                }

                if (dirty)
                        r600_context_dirty_block(ctx, block, dirty, id);
        }
}

struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset)
{
        struct r600_range *range;
        struct r600_block *block;
        unsigned id;

        range = &ctx->range[CTX_RANGE_ID(offset)];
        block = range->blocks[CTX_BLOCK_ID(offset)];
        offset -= block->start_offset;
        id = block->pm4_bo_index[offset >> 2];
        if (block->reloc[id].bo) {
                return block->reloc[id].bo;
        }
        return NULL;
}

void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS);
        int cp_dwords = block->pm4_ndwords, start_dword = 0;
        int new_dwords = 0;
        int nbo = block->nbo;

        if (block->nreg_dirty == 0 && optional) {
                goto out;
        }

        if (nbo) {
                ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH;

                for (int j = 0; j < block->nreg; j++) {
                        if (block->pm4_bo_index[j]) {
                                /* find relocation */
                                struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
                                block->pm4[reloc->bo_pm4_index] =
                                        r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
                                nbo--;
                                if (nbo == 0)
                                        break;
                        }
                }
                ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
        }

        optional &= (block->nreg_dirty != block->nreg);
        if (optional) {
                new_dwords = block->nreg_dirty;
                start_dword = cs->cdw;
                cp_dwords = new_dwords + 2;
        }
        memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4);
        cs->cdw += cp_dwords;

        if (optional) {
                /* shrink the packet: patch the count field of the copied
                 * PKT3 header so only the dirty registers are emitted */
                uint32_t newword;

                newword = cs->buf[start_dword];
                newword &= PKT_COUNT_C;
                newword |= PKT_COUNT_S(new_dwords);
                cs->buf[start_dword] = newword;
        }
out:
        /* the block is on the dirty list, so the dirty bit is set; XOR clears it */
        block->status ^= R600_BLOCK_STATUS_DIRTY;
        block->nreg_dirty = 0;
        LIST_DELINIT(&block->list);
}
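
/*
 * Illustrative sketch, not driver code: the "optional" path above shrinks an
 * already-copied SET_*_REG packet by patching the count field of its PKT3
 * header in place, so only the first new_dwords registers are replayed.
 */
#if 0
static uint32_t shrink_pkt3_header_example(uint32_t header, unsigned new_count)
{
        return (header & PKT_COUNT_C) | PKT_COUNT_S(new_count);
}
#endif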

void r600_inval_shader_cache(struct r600_context *ctx)
{
        ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
        ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
        r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
}

void r600_inval_texture_cache(struct r600_context *ctx)
{
        ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
        r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
}

void r600_inval_vertex_cache(struct r600_context *ctx)
{
        /* Some GPUs don't have the vertex cache and must use the texture cache instead. */
        ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
        r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
}

void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
{
        if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
                return;

        ctx->atom_surface_sync.flush_flags |=
                r600_get_cb_flush_flags(ctx) |
                (ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);

        if (flush_now) {
                r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
        } else {
                r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
        }

        ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        struct r600_block *enable_block = NULL;
        bool queries_suspended = false;
        bool streamout_suspended = false;

        if (cs->cdw == ctx->init_dwords)
                return;

        /* suspend queries */
        if (ctx->num_cs_dw_queries_suspend) {
                r600_context_queries_suspend(ctx);
                queries_suspended = true;
        }

        if (ctx->num_cs_dw_streamout_end) {
                r600_context_streamout_end(ctx);
                streamout_suspended = true;
        }

        r600_flush_framebuffer(ctx, true);

        /* a partial flush is needed to avoid lockups on some chips with user fences */
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

        /* Flush the CS. */
        ctx->ws->cs_flush(ctx->cs, flags);

        ctx->pm4_dirty_cdwords = 0;
        ctx->flags = 0;

        r600_init_cs(ctx);

        if (streamout_suspended) {
                ctx->streamout_start = TRUE;
                ctx->streamout_append_bitmask = ~0;
        }

        /* resume queries */
        if (queries_suspended) {
                r600_context_queries_resume(ctx);
        }

        /* mark all enabled register groups as dirty so they get re-emitted on
         * the next draw command
         */
        LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) {
                if (!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) {
                        LIST_ADDTAIL(&enable_block->list, &ctx->dirty);
                        enable_block->status |= R600_BLOCK_STATUS_DIRTY;
                }
                ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords;
                enable_block->nreg_dirty = enable_block->nreg;
        }
}

void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        r600_need_cs_space(ctx, 10, FALSE);

        va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
        va = va + (offset << 2);

        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
        cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
        /* DATA_SEL | INT_EN | ADDRESS_HI */
        cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
        cs->buf[cs->cdw++] = value;                   /* DATA_LO */
        cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
                                       bool test_status_bit)
{
        uint32_t *current_result = (uint32_t*)map;
        uint64_t start, end;

        start = (uint64_t)current_result[start_index] |
                (uint64_t)current_result[start_index+1] << 32;
        end = (uint64_t)current_result[end_index] |
              (uint64_t)current_result[end_index+1] << 32;

        if (!test_status_bit ||
            ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
                return end - start;
        }
        return 0;
}
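
/*
 * Illustrative sketch, not driver code: each counter is a (start, end) pair
 * of 64-bit values whose top bit doubles as a "result written" flag.  The
 * numbers below are made up for the example.
 */
#if 0
static unsigned read_result_example(void)
{
        uint32_t buf[4] = {
                0x00000005, 0x80000000,  /* start = 0x8000000000000005 */
                0x0000000c, 0x80000000,  /* end   = 0x800000000000000c */
        };
        /* both status bits are set, so this returns end - start = 7 */
        return r600_query_read_result((char*)buf, 0, 2, true);
}
#endif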

static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
        unsigned results_base = query->results_start;
        char *map;

        map = ctx->ws->buffer_map(query->buffer->buf, ctx->cs,
                                  PIPE_TRANSFER_READ |
                                  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
        if (!map)
                return FALSE;

        /* count all results across all data blocks */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 2, true);
                        results_base = (results_base + 16) % query->buffer->b.b.b.width0;
                }
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                while (results_base != query->results_end) {
                        query->result.b = query->result.b ||
                                r600_query_read_result(map + results_base, 0, 2, true) != 0;
                        results_base = (results_base + 16) % query->buffer->b.b.b.width0;
                }
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 2, false);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
                }
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
                /* SAMPLE_STREAMOUTSTATS stores this structure:
                 * {
                 *    u64 NumPrimitivesWritten;
                 *    u64 PrimitiveStorageNeeded;
                 * }
                 * We only need NumPrimitivesWritten here. */
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 2, 6, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
                }
                break;
        case PIPE_QUERY_PRIMITIVES_GENERATED:
                /* Here we read PrimitiveStorageNeeded. */
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
                }
                break;
        case PIPE_QUERY_SO_STATISTICS:
                while (results_base != query->results_end) {
                        query->result.so.num_primitives_written +=
                                r600_query_read_result(map + results_base, 2, 6, true);
                        query->result.so.primitives_storage_needed +=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
                }
                break;
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                while (results_base != query->results_end) {
                        query->result.b = query->result.b ||
                                r600_query_read_result(map + results_base, 2, 6, true) !=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
                }
                break;
        default:
                assert(0);
        }

        query->results_start = query->results_end;
        ctx->ws->buffer_unmap(query->buffer->buf);
        return TRUE;
}

void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        unsigned new_results_end, i;
        uint32_t *results;
        uint64_t va;

        r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

        new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.b.width0;

        /* collect current results if query buffer is full */
        if (new_results_end == query->results_start) {
                r600_query_result(ctx, query, TRUE);
        }

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
                if (results) {
                        results = (uint32_t*)((char*)results + query->results_end);
                        memset(results, 0, query->result_size);

                        /* Set top bits for unused backends */
                        for (i = 0; i < ctx->max_db; i++) {
                                if (!(ctx->backend_mask & (1<<i))) {
                                        results[(i * 4)+1] = 0x80000000;
                                        results[(i * 4)+3] = 0x80000000;
                                }
                        }
                        ctx->ws->buffer_unmap(query->buffer->buf);
                }
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
                if (results) {
                        results = (uint32_t*)((char*)results + query->results_end);
                        memset(results, 0, query->result_size);
                        ctx->ws->buffer_unmap(query->buffer->buf);
                }
                break;
        default:
                assert(0);
        }

        /* emit begin query */
        va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
        va += query->results_end;

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
                cs->buf[cs->cdw++] = query->results_end;
                cs->buf[cs->cdw++] = 0;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

        ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
        /* emit end query */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                va += query->results_end + 8;
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
                cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
                cs->buf[cs->cdw++] = 0;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                va += query->results_end + query->result_size/2;
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

        query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.b.width0;
        ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
                            int flag_wait)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        if (operation == PREDICATION_OP_CLEAR) {
                r600_need_cs_space(ctx, 3, FALSE);

                cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
        } else {
                unsigned results_base = query->results_start;
                unsigned count;
                uint32_t op;

                /* find the count of query data blocks */
                count = (query->buffer->b.b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.b.width0;
                count /= query->result_size;

                r600_need_cs_space(ctx, 5 * count, TRUE);

                op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
                                (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
                va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

                /* emit predicate packets for all data blocks */
                while (results_base != query->results_end) {
                        cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                        cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
                        cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
                        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
                                                                   RADEON_USAGE_READ);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;

                        /* set the CONTINUE bit for all packets except the first */
                        op |= PREDICATION_CONTINUE;
                }
        }
}
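
/*
 * Worked example for the block-count arithmetic above (made-up numbers):
 * with width0 = 4096, result_size = 32, results_start = 4064 and
 * results_end = 32, the range wraps around the ring buffer and
 * count = ((4096 + 32 - 4064) % 4096) / 32 = 64 / 32 = 2 data blocks,
 * so two SET_PREDICATION packets are emitted.
 */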

struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
        struct r600_query *query;
        unsigned buffer_size = 4096;

        query = CALLOC_STRUCT(r600_query);
        if (query == NULL)
                return NULL;

        query->type = query_type;

        switch (query_type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                query->result_size = 16 * ctx->max_db;
                query->num_cs_dw = 6;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                query->result_size = 16;
                query->num_cs_dw = 8;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
                query->result_size = 32;
                query->num_cs_dw = 6;
                break;
        default:
                assert(0);
                FREE(query);
                return NULL;
        }

        /* adjust the buffer size to simplify the offset wrapping math */
        buffer_size -= buffer_size % query->result_size;

        /* Queries are normally read by the CPU after
         * being written by the GPU, hence staging is probably a good
         * usage pattern.
         */
        query->buffer = (struct r600_resource*)
                pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buffer_size);
        if (!query->buffer) {
                FREE(query);
                return NULL;
        }
        return query;
}
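
/*
 * Illustrative sketch, not driver code: the usual lifecycle of these query
 * helpers, as driven by the pipe_context query hooks.
 */
#if 0
static void example_query_lifecycle(struct r600_context *ctx)
{
        struct r600_query *q;
        uint64_t samples_passed;

        q = r600_context_query_create(ctx, PIPE_QUERY_OCCLUSION_COUNTER);
        if (!q)
                return;

        r600_query_begin(ctx, q);
        /* ... draw calls go here ... */
        r600_query_end(ctx, q);

        /* TRUE = block until the GPU has written the result */
        if (r600_context_query_result(ctx, q, TRUE, &samples_passed)) {
                /* samples_passed now holds the summed ZPASS counts */
        }

        r600_context_query_destroy(ctx, q);
}
#endif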

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
        pipe_resource_reference((struct pipe_resource**)&query->buffer, NULL);
        FREE(query);
}

boolean r600_context_query_result(struct r600_context *ctx,
                                  struct r600_query *query,
                                  boolean wait, void *vresult)
{
        boolean *result_b = (boolean*)vresult;
        uint64_t *result_u64 = (uint64_t*)vresult;
        struct pipe_query_data_so_statistics *result_so =
                (struct pipe_query_data_so_statistics*)vresult;

        if (!r600_query_result(ctx, query, wait))
                return FALSE;

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
                *result_u64 = query->result.u64;
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                *result_b = query->result.b;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
                break;
        case PIPE_QUERY_SO_STATISTICS:
                *result_so = query->result.so;
                break;
        default:
                assert(0);
        }
        return TRUE;
}

void r600_context_queries_suspend(struct r600_context *ctx)
{
        struct r600_query *query;

        LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
                r600_query_end(ctx, query);
        }
        assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
        struct r600_query *query;

        assert(ctx->num_cs_dw_queries_suspend == 0);

        LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
                r600_query_begin(ctx, query);
        }
}

void r600_context_streamout_begin(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        struct r600_so_target **t = ctx->so_targets;
        unsigned *strides = ctx->vs_shader_so_strides;
        unsigned buffer_en, i;

        buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
                    (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
                    (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
                    (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);

        ctx->num_cs_dw_streamout_end =
                12 + /* flush_vgt_streamout */
                util_bitcount(buffer_en) * 8 +
                3;

        r600_need_cs_space(ctx,
                           12 + /* flush_vgt_streamout */
                           6 + /* enables */
                           util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
                           util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
                           ctx->num_cs_dw_streamout_end, TRUE);

        if (ctx->chip_class >= CAYMAN) {
                evergreen_flush_vgt_streamout(ctx);
                evergreen_set_streamout_enable(ctx, buffer_en);
        }

        for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
                if (t[i]) {
                        t[i]->stride = strides[i];
                        t[i]->so_index = i;

                        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
                        cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
                                                        16*i - SI_CONTEXT_REG_OFFSET) >> 2;
                        cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
                                                        t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
                        cs->buf[cs->cdw++] = strides[i] >> 2;              /* VTX_STRIDE (in DW) */
                        cs->buf[cs->cdw++] = 0;                    /* BUFFER_BASE */

                        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                        cs->buf[cs->cdw++] =
                                r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
                                                      RADEON_USAGE_WRITE);

                        if (ctx->streamout_append_bitmask & (1 << i)) {
                                /* Append. */
                                cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
                                cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
                                                               STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
                                cs->buf[cs->cdw++] = 0; /* unused */
                                cs->buf[cs->cdw++] = 0; /* unused */
                                cs->buf[cs->cdw++] = 0; /* src address lo */
                                cs->buf[cs->cdw++] = 0; /* src address hi */

                                cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                                cs->buf[cs->cdw++] =
                                        r600_context_bo_reloc(ctx, t[i]->filled_size,
                                                              RADEON_USAGE_READ);
                        } else {
                                /* Start from the beginning. */
                                cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
                                cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
                                                               STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
                                cs->buf[cs->cdw++] = 0; /* unused */
                                cs->buf[cs->cdw++] = 0; /* unused */
                                cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
                                cs->buf[cs->cdw++] = 0; /* unused */
                        }
                }
#endif
        }
}

void r600_context_streamout_end(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        struct r600_so_target **t = ctx->so_targets;
        unsigned i, flush_flags = 0;

        evergreen_flush_vgt_streamout(ctx);

        for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
                if (t[i]) {
                        cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
                        cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
                                                       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                                                       STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
                        cs->buf[cs->cdw++] = 0; /* dst address lo */
                        cs->buf[cs->cdw++] = 0; /* dst address hi */
                        cs->buf[cs->cdw++] = 0; /* unused */
                        cs->buf[cs->cdw++] = 0; /* unused */

                        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                        cs->buf[cs->cdw++] =
                                r600_context_bo_reloc(ctx, t[i]->filled_size,
                                                      RADEON_USAGE_WRITE);

                        flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
                }
#endif
        }

        evergreen_set_streamout_enable(ctx, 0);

        ctx->atom_surface_sync.flush_flags |= flush_flags;
        r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);

        ctx->num_cs_dw_streamout_end = 0;

        /* XXX print some debug info */
        for (i = 0; i < ctx->num_so_targets; i++) {
                if (!t[i])
                        continue;

                uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs,
                                                    PIPE_TRANSFER_READ);
                printf("FILLED_SIZE%i: %u\n", i, *ptr);
                ctx->ws->buffer_unmap(t[i]->filled_size->buf);
        }
}

void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        r600_need_cs_space(ctx, 14 + 21, TRUE);

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = 0;

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
        cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
        cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
        cs->buf[cs->cdw++] = 0; /* src address lo */
        cs->buf[cs->cdw++] = 0; /* src address hi */
        cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
        cs->buf[cs->cdw++] = 0; /* unused */
#endif

        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
        cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
        cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
        cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
        cs->buf[cs->cdw++] = 0; /* unused */
        cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
        cs->buf[cs->cdw++] = 0; /* unused */

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
                                                   RADEON_USAGE_WRITE);

        cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
        cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
        cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
        cs->buf[cs->cdw++] = 0;
        cs->buf[cs->cdw++] = 0; /* reference value */
        cs->buf[cs->cdw++] = 0xffffffff; /* mask */
        cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}