vc4: Add more useful debug for the undefined-source case
[android-x86/external-mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdio.h>
#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

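/* Dumps the generated program as raw 64-bit instruction words alongside their
 * disassembly.
 */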
static void
vc4_dump_program(struct qcompile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

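/* Appends an instruction to the pending list; serialize_insts() later flushes
 * the list into the final program.
 */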
static void
queue(struct qcompile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

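/* Returns a pointer to the most recently queued instruction so callers can
 * patch in condition codes, pack modes, or the SF bit after queueing.
 */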
static uint64_t *
last_inst(struct qcompile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static void
fixup_raddr_conflict(struct qcompile *c,
                     struct qpu_reg src0, struct qpu_reg *src1)
{
        if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
            (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
            src0.addr != src1->addr) {
                queue(c, qpu_inst(qpu_a_MOV(qpu_r3(), *src1),
                                  qpu_m_NOP()));
                *src1 = qpu_r3();
        }
}

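/* Appends a single instruction to the final program array, growing it as
 * needed.
 */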
static void
serialize_one_inst(struct qcompile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

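/* Drains the pending instruction list into the final program array, inserting
 * NOPs as needed to satisfy the hardware rules quoted below (register-file
 * read-after-write and SFU/r4 latency).
 */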
static void
serialize_insts(struct qcompile *c)
{
        int last_sfu_write = -10;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_inst(qpu_a_NOP(),
                                                       qpu_m_NOP()));
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 *  be read in the following two instructions. Any other
                 *  instruction that results in r4 being written (that is, TMU
                 *  read, TLB read, SFU lookup) cannot occur in the two
                 *  instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_inst(qpu_a_NOP(),
                                                               qpu_m_NOP()));
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

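/* Translates the QIR instruction list into QPU instructions.  Temporaries are
 * assigned by a trivial first-free allocator over accumulators r0-r2 and the
 * 32 registers of each physical register file, then the result is serialized
 * and the program-end sequence is appended.
 */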
void
vc4_generate_code(struct qcompile *c)
{
        struct qpu_reg allocate_to_qpu_reg[3 + 32 + 32];
        bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)];
        int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated));
        int *reg_uses_remaining =
                calloc(c->num_temps, sizeof(*reg_uses_remaining));

        for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++)
                reg_in_use[i] = false;
        for (int i = 0; i < c->num_temps; i++)
                reg_allocated[i] = -1;
        for (int i = 0; i < 3; i++)
                allocate_to_qpu_reg[i] = qpu_rn(i);
        for (int i = 0; i < 32; i++)
                allocate_to_qpu_reg[i + 3] = qpu_ra(i);
        for (int i = 0; i < 32; i++)
                allocate_to_qpu_reg[i + 3 + 32] = qpu_rb(i);

        make_empty_list(&c->qpu_inst_list);

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

                if (qinst->dst.file == QFILE_TEMP)
                        reg_uses_remaining[qinst->dst.index]++;
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        if (qinst->src[i].file == QFILE_TEMP)
                                reg_uses_remaining[qinst->src[i].index]++;
                }
        }

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                queue(c, qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),

                        M(FMUL),
                };

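                /* Condition codes for the QOP_SEQ/SNE/SLT/SGE cases below,
                 * applied to the flags set by the preceding SUB.
                 */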
                static const uint32_t compareflags[] = {
                        [QOP_SEQ - QOP_SEQ] = QPU_COND_ZS,
                        [QOP_SNE - QOP_SEQ] = QPU_COND_ZC,
                        [QOP_SLT - QOP_SEQ] = QPU_COND_NS,
                        [QOP_SGE - QOP_SEQ] = QPU_COND_NC,
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                if (reg_allocated[index] == -1) {
                                        fprintf(stderr, "undefined reg use: ");
                                        qir_dump_inst(qinst);
                                        fprintf(stderr, "\n");

                                        src[i] = qpu_rn(0);
                                } else {
                                        src[i] = allocate_to_qpu_reg[reg_allocated[index]];
                                        reg_uses_remaining[index]--;
                                        if (reg_uses_remaining[index] == 0)
                                                reg_in_use[reg_allocated[index]] = false;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;

                case QFILE_TEMP:
                        if (reg_allocated[qinst->dst.index] == -1) {
                                int alloc;
                                for (alloc = 0;
                                     alloc < ARRAY_SIZE(reg_in_use);
                                     alloc++) {
                                        /* The pack flags require an A-file register. */
                                        if (qinst->op == QOP_PACK_SCALED &&
                                            allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) {
                                                continue;
                                        }

                                        if (!reg_in_use[alloc])
                                                break;
                                }
                                assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc");
                                reg_in_use[alloc] = true;
                                reg_allocated[qinst->dst.index] = alloc;
                        }

                        dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]];

                        reg_uses_remaining[qinst->dst.index]--;
                        if (reg_uses_remaining[qinst->dst.index] == 0) {
                                reg_in_use[reg_allocated[qinst->dst.index]] =
                                        false;
                        }
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }


                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_inst(qpu_a_MOV(dst, src[0]),
                                                  qpu_m_NOP()));
                        }
                        break;

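                /* QOP_CMP: set the flags from src[0], then select src[1] into
                 * dst if it was negative and src[2] otherwise.
                 */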
                case QOP_CMP:
                        queue(c, qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_NOP),
                                                    src[0]),
                                          qpu_m_NOP()));
                        *last_inst(c) |= QPU_SF;

                        if (dst.mux <= QPU_MUX_R3) {
                                fixup_raddr_conflict(c, src[1], &src[2]);
                                queue(c, qpu_inst(qpu_a_MOV(dst, src[1]),
                                                  qpu_m_MOV(dst, src[2])));
                                *last_inst(c) = ((*last_inst(c) & ~(QPU_COND_ADD_MASK |
                                                                    QPU_COND_MUL_MASK))
                                                 | QPU_SET_FIELD(QPU_COND_NS,
                                                                 QPU_COND_ADD)
                                                 | QPU_SET_FIELD(QPU_COND_NC,
                                                                 QPU_COND_MUL));
                        } else {
                                if (dst.mux == src[1].mux &&
                                    dst.addr == src[1].addr) {
                                        queue(c, qpu_inst(qpu_a_MOV(dst, src[1]),
                                                          qpu_m_NOP()));

                                        queue(c, qpu_inst(qpu_a_MOV(dst, src[2]),
                                                          qpu_m_NOP()));
                                        *last_inst(c) = ((*last_inst(c) & ~(QPU_COND_ADD_MASK))
                                                         | QPU_SET_FIELD(QPU_COND_NC,
                                                                         QPU_COND_ADD));
                                } else {
                                        queue(c, qpu_inst(qpu_a_MOV(dst, src[2]),
                                                          qpu_m_NOP()));

                                        queue(c, qpu_inst(qpu_a_MOV(dst, src[1]),
                                                          qpu_m_NOP()));
                                        *last_inst(c) = ((*last_inst(c) & ~(QPU_COND_ADD_MASK))
                                                         | QPU_SET_FIELD(QPU_COND_NS,
                                                                         QPU_COND_ADD));
                                }
                        }
                        break;

                case QOP_SEQ:
                case QOP_SNE:
                case QOP_SGE:
                case QOP_SLT:
                        fixup_raddr_conflict(c, src[0], &src[1]);
                        queue(c, qpu_inst(qpu_a_SUB(qpu_ra(QPU_W_NOP),
                                                    src[0], src[1]),
                                          qpu_m_NOP()));
                        *last_inst(c) |= QPU_SF;

                        queue(c, qpu_load_imm_f(dst, 0.0));
                        queue(c, qpu_load_imm_f(dst, 1.0));
                        *last_inst(c) = ((*last_inst(c) & ~QPU_COND_ADD_MASK)
                                         | QPU_SET_FIELD(compareflags[qinst->op - QOP_SEQ],
                                                         QPU_COND_ADD));

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]),
                                          qpu_m_NOP()));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_inst(qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)),
                                          qpu_m_NOP()));
                        break;

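                /* SFU lookups: write the operand to the unit's register, then
                 * read the result back from accumulator r4.
                 */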
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                            src[0]),
                                                  qpu_m_NOP()));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                            src[0]),
                                                  qpu_m_NOP()));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                            src[0]),
                                                  qpu_m_NOP()));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                            src[0]),
                                                  qpu_m_NOP()));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_inst(qpu_a_MOV(dst, qpu_r4()),
                                          qpu_m_NOP()));

                        break;

                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_inst(qpu_a_NOP(),
                                                  qpu_m_MOV(qpu_r3(), src[i])));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_inst(qpu_a_MOV(dst, qpu_r3()),
                                          qpu_m_NOP()));

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_inst(qpu_a_MOV(qpu_tlbc(),
                                                    src[0]),
                                          qpu_m_NOP()));
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_inst(qpu_a_FADD(dst,
                                                     src[0], qpu_r5()),
                                          qpu_m_NOP()));
                        break;

                case QOP_PACK_SCALED:
                        queue(c, qpu_inst(qpu_a_MOV(dst, src[0]),
                                          qpu_m_NOP()));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_A_16A,
                                                       QPU_PACK);

                        queue(c, qpu_inst(qpu_a_MOV(dst, src[1]),
                                          qpu_m_NOP()));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_A_16B,
                                                       QPU_PACK);

                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_inst(qpu_a_NOP(),
                                                  qpu_m_alu2(translate[qinst->op].op,
                                                             dst,
                                                             src[0], src[1])));
                        } else {
                                queue(c, qpu_inst(qpu_a_alu2(translate[qinst->op].op,
                                                             dst,
                                                             src[0], src[1]),
                                                  qpu_m_NOP()));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_inst(qpu_a_NOP(), qpu_m_NOP()));
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        serialize_one_inst(c, qpu_inst(qpu_a_NOP(), qpu_m_NOP()));
        serialize_one_inst(c, qpu_inst(qpu_a_NOP(), qpu_m_NOP()));

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[2] = qpu_set_sig(c->qpu_insts[2],
                                              QPU_SIG_WAIT_FOR_SCOREBOARD);
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
}