OSDN Git Service

vc4: Fix names of the 16-bit unpacks
[android-x86/external-mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34         fprintf(stderr, "%s prog %d/%d QPU:\n",
35                 qir_get_stage_name(c->stage),
36                 c->program_id, c->variant_id);
37
38         for (int i = 0; i < c->qpu_inst_count; i++) {
39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41                 fprintf(stderr, "\n");
42         }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48         struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49         q->inst = inst;
50         list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56         struct queued_qpu_inst *q =
57                 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58         return &q->inst;
59 }
60
/* Makes the most recently queued instruction's add-pipe write conditional. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
67 /**
68  * Some special registers can be read from either file, which lets us resolve
69  * raddr conflicts without extra MOVs.
70  */
71 static bool
72 swap_file(struct qpu_reg *src)
73 {
74         switch (src->addr) {
75         case QPU_R_UNIF:
76         case QPU_R_VARY:
77                 if (src->mux == QPU_MUX_SMALL_IMM) {
78                         return false;
79                 } else {
80                         if (src->mux == QPU_MUX_A)
81                                 src->mux = QPU_MUX_B;
82                         else
83                                 src->mux = QPU_MUX_A;
84                         return true;
85                 }
86
87         default:
88                 return false;
89         }
90 }
91
/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra31/rb31 for this purpose.
 *
 * NOTE(review): the dst parameter is currently unused here.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        /* Small immediates occupy the B-file raddr field, so treat them as
         * B-file reads for conflict detection.
         */
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        /* No conflict when src0 is an accumulator (r0-r5), when the two
         * sources read different files, or when both sources are literally
         * the same register (a single raddr serves both reads).
         */
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        /* Try to dodge the conflict for free by moving one source to the
         * other file (possible for uniform/varying reads).
         */
        if (swap_file(src0) || swap_file(src1))
                return;

        /* Otherwise, copy src0 into the reserved register (31) of the
         * opposite file and read it from there.
         */
        if (mux0 == QPU_MUX_A) {
                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}
127
/**
 * Translates the QIR instruction list in c into native QPU instructions:
 * applies the register allocation results, lowers each QIR op to one or
 * more QPU instructions (resolving raddr conflicts along the way), runs the
 * instruction scheduler, and appends the program-end sequence required by
 * the hardware.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A,
                QPU_UNPACK_16B,
        };

        list_inithead(&c->qpu_inst_list);

        /* Emit the VPM read/write setup for the vertex pipeline stages;
         * fragment shaders get their inputs from varyings instead.
         */
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Table of QIR ops that map 1:1 to a QPU add- or mul-pipe
                 * opcode; everything else is handled by an explicit case in
                 * the switch below.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),
                };

                /* Resolve each QIR source to a concrete QPU register or
                 * small immediate.
                 */
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads are FIFO-ordered, so indices must
                                 * be non-decreasing across the program.
                                 */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                /* Resolve the QIR destination to a QPU write address. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        /* Select between src[0] and 0 using two conditional
                         * writes: a MOV under the requested condition, then
                         * an XOR-with-self (produces 0) under its inverse.
                         * The ^1 relies on the QOP/QPU_COND enums pairing
                         * each condition with its complement.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        /* Select between src[0] and src[1] with a pair of
                         * complementary conditional MOVs.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the argument to the corresponding
                         * SFU register; the result appears in r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        /* Copy the result out of r4 unless the destination
                         * was allocated there.
                         */
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_8888_F:
                        /* Replicated float-to-8888 pack via the mul pipe's
                         * PM pack unit.
                         */
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        /* Pack a float into one byte lane; relies on the
                         * QPU_PACK_MUL_8x values being contiguous in the
                         * same order as the QOP_PACK_8x_F ops.
                         */
                        queue(c,
                              qpu_m_MOV(dst, src[0]) |
                              QPU_PM |
                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                            qinst->op - QOP_PACK_8A_F,
                                            QPU_PACK));
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        /* MOV the discard condition onto itself with SF set
                         * to update the flags; later TLB writes are then
                         * made conditional on them (see QOP_TLB_Z_WRITE and
                         * QOP_TLB_COLOR_WRITE below).
                         */
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        /* The color load is requested by a signal bit; the
                         * data lands in r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        /* Varying interpolation: add the C coefficient
                         * delivered in r5.
                         */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* Relies on QPU_W_TMU0_S/T/R/B being contiguous in
                         * the same order as the QOP_TEX_* ops.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        /* The texture result is requested by a signal bit
                         * and lands in r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        if (src[0].mux == QPU_MUX_R4) {
                                /* R4 unpack (PM set); relies on the
                                 * QPU_UNPACK_* values being contiguous in
                                 * the same order as the QOP_UNPACK_*_F ops.
                                 */
                                queue(c, qpu_a_MOV(dst, src[0]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                               (qinst->op -
                                                                QOP_UNPACK_8A_F),
                                                               QPU_UNPACK);
                        } else {
                                assert(src[0].mux == QPU_MUX_A);

                                /* Since we're setting the pack bits, if the
                                 * destination is in A it would get re-packed.
                                 */
                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                                     qpu_rb(31) : dst),
                                                    src[0], src[0]));
                                *last_inst(c) |=
                                        QPU_SET_FIELD(unpack_map[qinst->op -
                                                                 QOP_UNPACK_8A_F],
                                                      QPU_UNPACK);

                                if (dst.mux == QPU_MUX_A) {
                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                                }
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        /* Integer unpack: only legal from the A file (no R4
                         * path here).
                         */
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        /* Everything else goes through the 1:1 translate
                         * table.
                         */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                                if (qinst->dst.pack) {
                                        *last_inst(c) |= QPU_PM;
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                                if (qinst->dst.pack) {
                                        /* Add-pipe packing writes through
                                         * the A file.
                                         */
                                        assert(dst.mux == QPU_MUX_A);
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        }

                        break;
                }

                /* Propagate the QIR "set flags" bit onto the (single)
                 * emitted instruction.
                 */
                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Tag the last instruction as the program end, then pad with the
         * two delay-slot NOPs that follow it.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders must release the tile scoreboard on
                 * their final instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}