vc4: Add support for 8-bit unnormalized vertex attrs.
[android-x86/external-mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

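/**
 * Dumps the compiled program to stderr: each 64-bit QPU instruction is
 * printed as raw hex followed by its disassembly.
 */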
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

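/**
 * Appends one raw 64-bit QPU instruction to the compile's pending
 * instruction list, to be scheduled and serialized at the end of
 * vc4_generate_code().
 */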
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

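/**
 * Returns a pointer to the most recently queued instruction, so callers
 * can patch condition, signal, pack, or unpack bits into it after the fact.
 */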
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

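/** Sets the ADD-pipe condition code on the most recently queued instruction. */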
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static bool
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     bool r3_live)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return false;
        }

        if (swap_file(src0) || swap_file(src1))
                return false;

        if (src0->mux == QPU_MUX_A) {
                /* If we're conflicting over the A regfile, then we can just
                 * use the reserved rb31.
                 */
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
                return false;
        } else {
                /* Otherwise, we're conflicting over the B regfile and need a
                 * temporary outside of it.  So, we spill r3 out to rb31, then
                 * store our desired value in r3, and tell the caller to put
                 * rb31 back into r3 when we're done.
                 */
                if (r3_live)
                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
                queue(c, qpu_a_MOV(qpu_r3(), *src1));

                *src1 = qpu_r3();

                return r3_live && dst.mux != QPU_MUX_R3;
        }
}

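/**
 * Translates the compile's QIR instruction list into QPU instructions:
 * sets up VPM reads/writes for the vertex pipeline stages, lowers each
 * qinst, then schedules the result and applies the thread-end rules.
 */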
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        bool written_r3 = false;
        bool needs_restore;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

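                /* Direct QIR-to-QPU opcode mappings for simple ALU ops: A()
                 * entries issue on the add pipe and M() entries on the mul
                 * pipe.  For example, a QIR FADD becomes
                 * qpu_a_alu2(QPU_A_FADD, dst, src[0], src[1]) in the default
                 * case below.
                 */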
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

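                /* SEL_X_0: write src[0] to dst under the selected condition,
                 * then zero dst (XOR of r0 with itself) under the inverted
                 * condition.  The QOP_SEL_* values are laid out in the same
                 * order as the QPU_COND_* codes, and XORing the offset with 1
                 * flips ZS<->ZC and NS<->NC.
                 */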
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

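                /* SEL_X_Y: a conditional select implemented as two MOVs
                 * predicated on complementary condition codes, so exactly one
                 * of src[0] or src[1] lands in dst.
                 */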
                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

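                /* The SFU ops are issued by moving the operand into the
                 * matching SFU register; the result then becomes readable in
                 * r4.  The write and the r4 read are queued back to back here
                 * on the assumption that the scheduler accounts for the SFU
                 * latency between them.
                 */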
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination.  So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

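                /* Discard setup: MOV src[0] onto itself with SF set, so the
                 * Z flag holds the discard condition.  The TLB Z and color
                 * writes below are then made conditional on ZS, i.e. they
                 * only happen for channels whose discard value was zero.
                 */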
                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

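                /* PACK_SCALED packs src[0] and src[1] into the low and high
                 * 16-bit halves of dst.  If dst aliases src[1], the B half is
                 * written first so the A-half MOV doesn't clobber the source
                 * before it's read.
                 */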
                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

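                /* The four texture coordinate writes map 1:1 onto the
                 * consecutive TMU0 S/T/R/B register addresses.
                 */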
                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

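                /* Direct texture fetch: the sum of the two sources is
                 * written straight to the TMU S register as the address.
                 * fixup_raddr_conflict() may have spilled r3 to rb31, in
                 * which case r3 is restored afterward.
                 */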
                case QOP_TEX_DIRECT:
                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

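                /* r4 unpack: with the PM bit set, the UNPACK field selects
                 * which byte of r4 is unpacked as the ALU input for the MOV.
                 */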
                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                             qpu_rb(31) : dst),
                                            src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A_F),
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A_I),
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));

                        break;
                }

                if (dst.mux == QPU_MUX_R3)
                        written_r3 = true;
        }

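        /* Everything is translated; schedule the queued instructions into
         * the final c->qpu_insts array, then enforce the restrictions on
         * what the last instructions of the thread may do.
         */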
        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

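        /* Set the program-end signal on the last instruction; the two
         * trailing NOPs fill the delay slots that still execute after the
         * end signal is issued.
         */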
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

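        /* Only fragment shaders take the tile-buffer scoreboard lock, so
         * only they need to release it with a scoreboard-unlock signal on
         * the final instruction.
         */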
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}