
r600g,radeonsi: don't append to streamout buffers that haven't been used yet
[android-x86/external-mesa.git] / src/gallium/drivers/radeon/r600_streamout.c
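
As the code below shows, this revision keys the append path off t[i]->buf_filled_size_valid in r600_emit_streamout_begin: an append request only sources the starting offset from the saved BUFFER_FILLED_SIZE once r600_emit_streamout_end has actually stored it; a target that has never been used for streamout falls back to starting at its own buffer_offset.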
/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors: Marek Olšák <maraeo@gmail.com>
 *
 */

#include "r600_pipe_common.h"
#include "r600_cs.h"

#include "util/u_memory.h"

static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable);

static struct pipe_stream_output_target *
r600_create_so_target(struct pipe_context *ctx,
                      struct pipe_resource *buffer,
                      unsigned buffer_offset,
                      unsigned buffer_size)
{
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        struct r600_so_target *t;
        struct r600_resource *rbuffer = (struct r600_resource*)buffer;

        t = CALLOC_STRUCT(r600_so_target);
        if (!t) {
                return NULL;
        }

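        /* Suballocate a 4-byte slot where the CP will store this target's
         * BUFFER_FILLED_SIZE (see r600_emit_streamout_end), so that a later
         * append can resume from the saved offset. */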
        u_suballocator_alloc(rctx->allocator_so_filled_size, 4,
                             &t->buf_filled_size_offset,
                             (struct pipe_resource**)&t->buf_filled_size);
        if (!t->buf_filled_size) {
                FREE(t);
                return NULL;
        }

        t->b.reference.count = 1;
        t->b.context = ctx;
        pipe_resource_reference(&t->b.buffer, buffer);
        t->b.buffer_offset = buffer_offset;
        t->b.buffer_size = buffer_size;

        util_range_add(&rbuffer->valid_buffer_range, buffer_offset,
                       buffer_offset + buffer_size);
        return &t->b;
}

static void r600_so_target_destroy(struct pipe_context *ctx,
                                   struct pipe_stream_output_target *target)
{
        struct r600_so_target *t = (struct r600_so_target*)target;
        pipe_resource_reference(&t->b.buffer, NULL);
        pipe_resource_reference((struct pipe_resource**)&t->buf_filled_size, NULL);
        FREE(t);
}

void r600_streamout_buffers_dirty(struct r600_common_context *rctx)
{
        struct r600_atom *begin = &rctx->streamout.begin_atom;
        unsigned num_bufs = util_bitcount(rctx->streamout.enabled_mask);
        unsigned num_bufs_appended = util_bitcount(rctx->streamout.enabled_mask &
                                                   rctx->streamout.append_bitmask);

        if (!num_bufs)
                return;

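        /* Precompute worst-case dword counts so the driver can reserve enough
         * command-stream space before emitting. The per-packet counts below
         * are meant to mirror what r600_emit_streamout_begin and
         * r600_emit_streamout_end actually write. */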
        rctx->streamout.num_dw_for_end =
                12 + /* flush_vgt_streamout */
                num_bufs * 11; /* STRMOUT_BUFFER_UPDATE, BUFFER_SIZE */

        begin->num_dw = 12 + /* flush_vgt_streamout */
                        3; /* VGT_STRMOUT_BUFFER_CONFIG */

        if (rctx->chip_class >= SI) {
                begin->num_dw += num_bufs * 4; /* SET_CONTEXT_REG */
        } else {
                begin->num_dw += num_bufs * 7; /* SET_CONTEXT_REG */

                if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740)
                        begin->num_dw += num_bufs * 5; /* STRMOUT_BASE_UPDATE */
        }

        begin->num_dw +=
                num_bufs_appended * 8 + /* STRMOUT_BUFFER_UPDATE */
                (num_bufs - num_bufs_appended) * 6 + /* STRMOUT_BUFFER_UPDATE */
                (rctx->family > CHIP_R600 && rctx->family < CHIP_RS780 ? 2 : 0); /* SURFACE_BASE_UPDATE */

        begin->dirty = true;

        r600_set_streamout_enable(rctx, true);
}

void r600_set_streamout_targets(struct pipe_context *ctx,
                                unsigned num_targets,
                                struct pipe_stream_output_target **targets,
                                const unsigned *offsets)
{
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        unsigned i;
        unsigned append_bitmask = 0;

        /* Stop streamout. */
        if (rctx->streamout.num_targets && rctx->streamout.begin_emitted) {
                r600_emit_streamout_end(rctx);
        }

        /* Set the new targets. */
        for (i = 0; i < num_targets; i++) {
                pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], targets[i]);
                r600_context_add_resource_size(ctx, targets[i]->buffer);
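                /* Per gallium semantics, an offset of -1 asks to append to
                 * whatever the buffer already contains instead of starting
                 * at a fresh offset. */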
                if (offsets[i] == ((unsigned)-1))
                        append_bitmask |= 1 << i;
        }
        for (; i < rctx->streamout.num_targets; i++) {
                pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], NULL);
        }

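        /* One enable bit per bound buffer; this hardware exposes at most
         * four streamout buffers, hence the four hardcoded terms. */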
        rctx->streamout.enabled_mask = (num_targets >= 1 && targets[0] ? 1 : 0) |
                                       (num_targets >= 2 && targets[1] ? 2 : 0) |
                                       (num_targets >= 3 && targets[2] ? 4 : 0) |
                                       (num_targets >= 4 && targets[3] ? 8 : 0);

        rctx->streamout.num_targets = num_targets;
        rctx->streamout.append_bitmask = append_bitmask;

        if (num_targets) {
                r600_streamout_buffers_dirty(rctx);
        } else {
                rctx->streamout.begin_atom.dirty = false;
                r600_set_streamout_enable(rctx, false);
        }
}

static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
{
        struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
        unsigned reg_strmout_cntl;

        /* The register is at different places on different ASICs. */
        if (rctx->chip_class >= CIK) {
                reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
        } else if (rctx->chip_class >= EVERGREEN) {
                reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
        } else {
                reg_strmout_cntl = R_008490_CP_STRMOUT_CNTL;
        }

        if (rctx->chip_class >= CIK) {
                cik_write_uconfig_reg(cs, reg_strmout_cntl, 0);
        } else {
                r600_write_config_reg(cs, reg_strmout_cntl, 0);
        }

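        /* Clear OFFSET_UPDATE_DONE above, kick the VGT streamout flush event,
         * then wait until the CP sets the bit again, i.e. until the final
         * buffer offsets have been written back. */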
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

        radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
        radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
        radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
        radeon_emit(cs, 0);
        radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1)); /* reference value */
        radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1)); /* mask */
        radeon_emit(cs, 4); /* poll interval */
}

static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
{
        struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
        struct r600_so_target **t = rctx->streamout.targets;
        unsigned *stride_in_dw = rctx->streamout.stride_in_dw;
        unsigned i, update_flags = 0;

        r600_flush_vgt_streamout(rctx);

        r600_write_context_reg(cs, rctx->chip_class >= EVERGREEN ?
                                       R_028B98_VGT_STRMOUT_BUFFER_CONFIG :
                                       R_028B20_VGT_STRMOUT_BUFFER_EN,
                               rctx->streamout.enabled_mask);

        for (i = 0; i < rctx->streamout.num_targets; i++) {
                if (!t[i])
                        continue;

                t[i]->stride_in_dw = stride_in_dw[i];

                if (rctx->chip_class >= SI) {
                        /* SI binds streamout buffers as shader resources.
                         * VGT only counts primitives and tells the shader
                         * through SGPRs what to do. */
                        r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
                        radeon_emit(cs, (t[i]->b.buffer_offset +
                                         t[i]->b.buffer_size) >> 2);    /* BUFFER_SIZE (in DW) */
                        radeon_emit(cs, stride_in_dw[i]);               /* VTX_STRIDE (in DW) */
                } else {
                        uint64_t va = r600_resource(t[i]->b.buffer)->gpu_address;

                        update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);

                        r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3);
                        radeon_emit(cs, (t[i]->b.buffer_offset +
                                         t[i]->b.buffer_size) >> 2);    /* BUFFER_SIZE (in DW) */
                        radeon_emit(cs, stride_in_dw[i]);               /* VTX_STRIDE (in DW) */
                        radeon_emit(cs, va >> 8);                       /* BUFFER_BASE */

                        r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
                                        RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW);

                        /* R7xx requires this packet after updating BUFFER_BASE.
                         * Without this, R7xx locks up. */
                        if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740) {
                                radeon_emit(cs, PKT3(PKT3_STRMOUT_BASE_UPDATE, 1, 0));
                                radeon_emit(cs, i);
                                radeon_emit(cs, va >> 8);

                                r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
                                                RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
                        }
                }

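                /* Only append if BUFFER_FILLED_SIZE was actually saved by a
                 * previous r600_emit_streamout_end; for a never-used target
                 * the saved value is undefined, so fall back to starting at
                 * the beginning. This check is the fix this commit makes. */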
                if (rctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
                        uint64_t va = t[i]->buf_filled_size->gpu_address +
                                      t[i]->buf_filled_size_offset;

                        /* Append. */
                        radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
                        radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                                    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
                        radeon_emit(cs, 0); /* unused */
                        radeon_emit(cs, 0); /* unused */
                        radeon_emit(cs, va); /* src address lo */
                        radeon_emit(cs, va >> 32); /* src address hi */

                        r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size,
                                        RADEON_USAGE_READ, RADEON_PRIO_MIN);
                } else {
                        /* Start from the beginning. */
                        radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
                        radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                                    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
                        radeon_emit(cs, 0); /* unused */
                        radeon_emit(cs, 0); /* unused */
                        radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
                        radeon_emit(cs, 0); /* unused */
                }
        }

        if (rctx->family > CHIP_R600 && rctx->family < CHIP_RV770) {
                radeon_emit(cs, PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0));
                radeon_emit(cs, update_flags);
        }
        rctx->streamout.begin_emitted = true;
}

void r600_emit_streamout_end(struct r600_common_context *rctx)
{
        struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
        struct r600_so_target **t = rctx->streamout.targets;
        unsigned i;
        uint64_t va;

        r600_flush_vgt_streamout(rctx);

        for (i = 0; i < rctx->streamout.num_targets; i++) {
                if (!t[i])
                        continue;

                va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
                radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
                radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
                radeon_emit(cs, va);     /* dst address lo */
                radeon_emit(cs, va >> 32); /* dst address hi */
                radeon_emit(cs, 0); /* unused */
                radeon_emit(cs, 0); /* unused */

                r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size,
                                RADEON_USAGE_WRITE, RADEON_PRIO_MIN);

                /* Zero the buffer size. The counters (primitives generated,
                 * primitives emitted) may be enabled even if there is no
                 * buffer bound. This ensures that the primitives-emitted query
                 * won't increment. */
                r600_write_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);

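                /* The saved BUFFER_FILLED_SIZE is now usable, so the next
                 * begin with an append request may source the offset from
                 * memory. */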
                t[i]->buf_filled_size_valid = true;
        }

        rctx->streamout.begin_emitted = false;
        rctx->flags |= R600_CONTEXT_STREAMOUT_FLUSH;
}

/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

static bool r600_get_strmout_en(struct r600_common_context *rctx)
{
        return rctx->streamout.streamout_enabled ||
               rctx->streamout.prims_gen_query_enabled;
}

static void r600_emit_streamout_enable(struct r600_common_context *rctx,
                                       struct r600_atom *atom)
{
        r600_write_context_reg(rctx->rings.gfx.cs,
                               rctx->chip_class >= EVERGREEN ?
                                       R_028B94_VGT_STRMOUT_CONFIG :
                                       R_028AB0_VGT_STRMOUT_EN,
                               S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx)));
}

static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
{
        bool old_strmout_en = r600_get_strmout_en(rctx);

        rctx->streamout.streamout_enabled = enable;
        if (old_strmout_en != r600_get_strmout_en(rctx))
                rctx->streamout.enable_atom.dirty = true;
}

void r600_update_prims_generated_query_state(struct r600_common_context *rctx,
                                             unsigned type, int diff)
{
        if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
                bool old_strmout_en = r600_get_strmout_en(rctx);

                rctx->streamout.num_prims_gen_queries += diff;
                assert(rctx->streamout.num_prims_gen_queries >= 0);

                rctx->streamout.prims_gen_query_enabled =
                        rctx->streamout.num_prims_gen_queries != 0;

                if (old_strmout_en != r600_get_strmout_en(rctx))
                        rctx->streamout.enable_atom.dirty = true;
        }
}

void r600_streamout_init(struct r600_common_context *rctx)
{
        rctx->b.create_stream_output_target = r600_create_so_target;
        rctx->b.stream_output_target_destroy = r600_so_target_destroy;
        rctx->streamout.begin_atom.emit = r600_emit_streamout_begin;
        rctx->streamout.enable_atom.emit = r600_emit_streamout_enable;
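        /* One SET_CONTEXT_REG packet: header + register offset + value. */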
        rctx->streamout.enable_atom.num_dw = 3;
}