OSDN Git Service

vc4: Add disasm for A-file unpack operations.
[android-x86/external-mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29
30 static void
31 vc4_dump_program(struct vc4_compile *c)
32 {
33         fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));
34
35         for (int i = 0; i < c->qpu_inst_count; i++) {
36                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
37                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
38                 fprintf(stderr, "\n");
39         }
40 }
41
/* A single QPU instruction held on c->qpu_inst_list between emission
 * (queue()) and the final fixup/serialization pass (serialize_insts()).
 */
struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};
46
47 static void
48 queue(struct vc4_compile *c, uint64_t inst)
49 {
50         struct queued_qpu_inst *q = calloc(1, sizeof(*q));
51         q->inst = inst;
52         insert_at_tail(&c->qpu_inst_list, &q->link);
53 }
54
55 static uint64_t *
56 last_inst(struct vc4_compile *c)
57 {
58         struct queued_qpu_inst *q =
59                 (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
60         return &q->inst;
61 }
62
/* Rewrites the last queued instruction so its add-unit write is
 * predicated on the given condition code.
 */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
68
69 /**
70  * This is used to resolve the fact that we might register-allocate two
71  * different operands of an instruction to the same physical register file
72  * even though instructions have only one field for the register file source
73  * address.
74  *
75  * In that case, we need to move one to a temporary that can be used in the
76  * instruction, instead.
77  */
78 static void
79 fixup_raddr_conflict(struct vc4_compile *c,
80                struct qpu_reg src0, struct qpu_reg *src1)
81 {
82         if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
83             (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
84             src0.addr != src1->addr) {
85                 queue(c, qpu_a_MOV(qpu_r3(), *src1));
86                 *src1 = qpu_r3();
87         }
88 }
89
90 static void
91 serialize_one_inst(struct vc4_compile *c, uint64_t inst)
92 {
93         if (c->qpu_inst_count >= c->qpu_inst_size) {
94                 c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
95                 c->qpu_insts = realloc(c->qpu_insts,
96                                        c->qpu_inst_size * sizeof(uint64_t));
97         }
98         c->qpu_insts[c->qpu_inst_count++] = inst;
99 }
100
/**
 * Drains the queued instruction list into the final c->qpu_insts array,
 * inserting NOPs and signal fixups required by the QPU's scheduling rules
 * (read-after-write in the physical register files, r4/SFU latency, and
 * the scoreboard-wait placement rule for fragment shaders).
 */
static void
serialize_insts(struct vc4_compile *c)
{
        /* Initialized far enough in the past that the r4 hazard check below
         * never pads the first instructions of the program.
         */
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                /* Work out which register file (A or B) each of the previous
                 * instruction's two writes landed in: the WS bit swaps the
                 * add/mul units' write destinations.
                 */
                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                /* The four operand muxes of this instruction (add A/B,
                 * mul A/B), checked uniformly below.
                 */
                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        /* raddr < 32 means a physical register, not a
                         * special (I/O) address, so the hazard applies.
                         */
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 *  be read in the following two instructions. Any other
                 *  instruction that results in r4 being written (that is, TMU
                 *  read, TLB read, SFU lookup) cannot occur in the two
                 *  instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        /* Pad with NOPs until at least two instructions
                         * separate us from the last SFU write.
                         */
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                /* Track SFU writes (any write landing in the RECIP..LOG
                 * address range from either unit) for the r4 rule above.
                 */
                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        /* Pad until we're past instruction slot 2 and the
                         * previous instruction's SIG field is free to carry
                         * the wait-for-scoreboard signal.
                         */
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}
209
/**
 * Translates the compile's QIR instruction list into QPU instructions,
 * then serializes them (inserting hazard NOPs) and appends the program-end
 * sequence.  Results land in c->qpu_insts / c->qpu_inst_count.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        /* Set once QOP_TLB_DISCARD_SETUP is seen; later TLB Z/color writes
         * are then predicated on the Z-set condition.
         */
        bool discard = false;

        make_empty_list(&c->qpu_inst_list);

        /* Vertex/coord shaders start with VPM read/write setup; the magic
         * values configure the VPM access pattern (stride scales with the
         * number of inputs on the read side).
         */
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                         (0x00001a00 +
                                          0x00100000 * c->num_inputs)));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Table of QIR ops that map 1:1 to a QPU add- or mul-unit
                 * ALU op; everything else is handled by the switch below.
                 */
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                /* Resolve QIR source files to concrete QPU registers. */
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op.
                         *
                         * NOTE(review): as written, the skip only fires for
                         * accumulator-to-same-accumulator moves; an A/B-file
                         * dst satisfies the first clause and always emits —
                         * confirm whether that's intended.
                         */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        /* MOV to NOP with the SF bit: update condition flags
                         * from src[0] without writing a register.
                         */
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        /* Conditional select of src[0] or 0: a predicated
                         * MOV, then a XOR-with-self (== 0) under the
                         * opposite condition (op index ^ 1).
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        /* Conditional select of src[0] or src[1], as two
                         * MOVs predicated on complementary conditions.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the argument to the matching SFU
                         * address, then read the result back from r4.  The
                         * required 2-instruction latency before the r4 read
                         * is enforced later in serialize_insts().
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
                        /* Pack four float channels into one 32-bit color:
                         * each mul-unit MOV with PM set packs src[i] into
                         * byte lane i (8A..8D) of r3.
                         */
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        /* Latch the discard condition into the Z flags (SF
                         * on a self-MOV); subsequent TLB writes test it.
                         */
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                /* Suppress the write for discarded pixels. */
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        /* The color load is requested via a signal on a NOP;
                         * the data arrives in r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                /* Suppress the write for discarded pixels. */
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        /* Varying interpolation: add the C coefficient
                         * (delivered in r5) to the A*x+B*y part.
                         */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        /* Pack two values into the 16-bit halves of dst.  If
                         * dst aliases src[1], write the 16B half first so the
                         * 16A write doesn't clobber src[1] before it's read.
                         */
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* Texture coordinate writes: S/T/R/B map to
                         * consecutive TMU0 addresses starting at TMU0_S.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        /* Texture result arrives in r4, requested via a
                         * signal on a NOP.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        /* Unpack byte lane A..D of r4 via the PM unpack
                         * path while MOVing it to dst.
                         */
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* Program end: the PROG_END signal on the last real instruction,
         * followed by the two delay-slot NOPs.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        /* Fragment shaders must release the tile scoreboard; the signal
         * goes on the final delay-slot NOP.
         */
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}