Source: OSDN Git Service mirror of android-x86/external-mesa.git

Commit: "vc4: Fold the 16-bit integer pack into the instructions generating it."
File: src/gallium/drivers/vc4/vc4_qpu_emit.c
1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34         fprintf(stderr, "%s prog %d/%d QPU:\n",
35                 qir_get_stage_name(c->stage),
36                 c->program_id, c->variant_id);
37
38         for (int i = 0; i < c->qpu_inst_count; i++) {
39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41                 fprintf(stderr, "\n");
42         }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48         struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49         q->inst = inst;
50         list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56         struct queued_qpu_inst *q =
57                 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58         return &q->inst;
59 }
60
/* Rewrites the condition field of the ADD half of the last queued
 * instruction.
 */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
67 /**
68  * Some special registers can be read from either file, which lets us resolve
69  * raddr conflicts without extra MOVs.
70  */
71 static bool
72 swap_file(struct qpu_reg *src)
73 {
74         switch (src->addr) {
75         case QPU_R_UNIF:
76         case QPU_R_VARY:
77                 if (src->mux == QPU_MUX_SMALL_IMM) {
78                         return false;
79                 } else {
80                         if (src->mux == QPU_MUX_A)
81                                 src->mux = QPU_MUX_B;
82                         else
83                                 src->mux = QPU_MUX_A;
84                         return true;
85                 }
86
87         default:
88                 return false;
89         }
90 }
91
92 /**
93  * This is used to resolve the fact that we might register-allocate two
94  * different operands of an instruction to the same physical register file
95  * even though instructions have only one field for the register file source
96  * address.
97  *
98  * In that case, we need to move one to a temporary that can be used in the
99  * instruction, instead.  We reserve ra31/rb31 for this purpose.
100  */
101 static void
102 fixup_raddr_conflict(struct vc4_compile *c,
103                      struct qpu_reg dst,
104                      struct qpu_reg *src0, struct qpu_reg *src1)
105 {
106         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
107         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
108
109         if (mux0 <= QPU_MUX_R5 ||
110             mux0 != mux1 ||
111             (src0->addr == src1->addr &&
112              src0->mux == src1->mux)) {
113                 return;
114         }
115
116         if (swap_file(src0) || swap_file(src1))
117                 return;
118
119         if (mux0 == QPU_MUX_A) {
120                 queue(c, qpu_a_MOV(qpu_rb(31), *src0));
121                 *src0 = qpu_rb(31);
122         } else {
123                 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
124                 *src0 = qpu_ra(31);
125         }
126 }
127
/**
 * Lowers the compile's QIR instruction list into native QPU instructions,
 * schedules them, and appends the mandatory program-end sequence.
 *
 * Consumes (and frees) the register allocation from vc4_register_allocate().
 * The final instructions end up in c->qpu_insts / c->qpu_inst_count.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A_TO_F32,
                QPU_UNPACK_16B_TO_F32,
        };

        list_inithead(&c->qpu_inst_list);

        /* Vertex/coord shaders read their attributes from the VPM, which
         * needs setup writes before the first read; fragment shaders need no
         * prologue.
         */
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Direct 1:1 QIR-op to QPU-opcode translations, consumed by
                 * the default case of the big switch below.  A() entries go
                 * to the add ALU, M() entries to the mul ALU.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                /* Resolve each QIR source to a physical QPU register (or
                 * small-immediate encoding).
                 */
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads pop a FIFO, so they must occur in
                                 * increasing index order.
                                 */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                /* Resolve the QIR destination to a physical register. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        /* Select between src[0] and 0: a conditional MOV of
                         * the source, then a conditional zero (XOR r0, r0)
                         * under the inverted condition.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        /* Select between two sources with a pair of MOVs
                         * under complementary conditions.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU operations: write the argument to the unit's
                         * register; the result appears in r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_8888_F:
                        /* Replicated 8-bit pack uses the mul ALU's PM pack
                         * mode.
                         */
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        /* Per-byte packs are consecutive in both enums, so
                         * the offset maps directly.
                         */
                        queue(c,
                              qpu_m_MOV(dst, src[0]) |
                              QPU_PM |
                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                            qinst->op - QOP_PACK_8A_F,
                                            QPU_PACK));
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        /* Sets the flags from the discard condition; later
                         * TLB writes are made conditional on them.
                         */
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        /* The color load signal delivers the TLB color into
                         * r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        /* Varying interpolation: add the C coefficient
                         * delivered in r5 alongside the varying read.
                         */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* TMU coordinate writes; the register addresses are
                         * consecutive, matching the QIR op order.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        /* The TMU load signal delivers the texture result
                         * into r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        if (src[0].mux == QPU_MUX_R4) {
                                /* r4 unpack needs the PM bit set. */
                                queue(c, qpu_a_MOV(dst, src[0]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                               (qinst->op -
                                                                QOP_UNPACK_8A_F),
                                                               QPU_UNPACK);
                        } else {
                                assert(src[0].mux == QPU_MUX_A);

                                /* Since we're setting the pack bits, if the
                                 * destination is in A it would get re-packed.
                                 */
                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                                     qpu_rb(31) : dst),
                                                    src[0], src[0]));
                                *last_inst(c) |=
                                        QPU_SET_FIELD(unpack_map[qinst->op -
                                                                 QOP_UNPACK_8A_F],
                                                      QPU_UNPACK);

                                if (dst.mux == QPU_MUX_A) {
                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                                }
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                                /* Fold a requested destination pack into the
                                 * instruction (mul packs need the PM bit).
                                 */
                                if (qinst->dst.pack) {
                                        *last_inst(c) |= QPU_PM;
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                                /* Add-ALU packs only apply to A-file
                                 * destinations.
                                 */
                                if (qinst->dst.pack) {
                                        assert(dst.mux == QPU_MUX_A);
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        }

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Mark program end on the last real instruction, then pad with the
         * two NOP delay slots the signal requires.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders release the TLB scoreboard on the final
                 * instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}