OSDN Git Service

vc4: Move FRAG_X/Y/REV_FLAG to a QFILE like VPM or TLB color writes.
[android-x86/external-mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34         fprintf(stderr, "%s prog %d/%d QPU:\n",
35                 qir_get_stage_name(c->stage),
36                 c->program_id, c->variant_id);
37
38         for (int i = 0; i < c->qpu_inst_count; i++) {
39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41                 fprintf(stderr, "\n");
42         }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48         struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49         q->inst = inst;
50         list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56         struct queued_qpu_inst *q =
57                 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58         return &q->inst;
59 }
60
/**
 * Rewrites the most recently queued instruction with @cond applied through
 * qpu_set_cond_add().
 */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *tail = last_inst(c);

        *tail = qpu_set_cond_add(*tail, cond);
}
66
/**
 * Rewrites the most recently queued instruction with @cond applied through
 * qpu_set_cond_mul().
 */
static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *tail = last_inst(c);

        *tail = qpu_set_cond_mul(*tail, cond);
}
72
73 /**
74  * Some special registers can be read from either file, which lets us resolve
75  * raddr conflicts without extra MOVs.
76  */
77 static bool
78 swap_file(struct qpu_reg *src)
79 {
80         switch (src->addr) {
81         case QPU_R_UNIF:
82         case QPU_R_VARY:
83                 if (src->mux == QPU_MUX_SMALL_IMM) {
84                         return false;
85                 } else {
86                         if (src->mux == QPU_MUX_A)
87                                 src->mux = QPU_MUX_B;
88                         else
89                                 src->mux = QPU_MUX_A;
90                         return true;
91                 }
92
93         default:
94                 return false;
95         }
96 }
97
98 /**
99  * This is used to resolve the fact that we might register-allocate two
100  * different operands of an instruction to the same physical register file
101  * even though instructions have only one field for the register file source
102  * address.
103  *
104  * In that case, we need to move one to a temporary that can be used in the
105  * instruction, instead.  We reserve ra31/rb31 for this purpose.
106  */
107 static void
108 fixup_raddr_conflict(struct vc4_compile *c,
109                      struct qpu_reg dst,
110                      struct qpu_reg *src0, struct qpu_reg *src1,
111                      struct qinst *inst, uint64_t *unpack)
112 {
113         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
114         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
115
116         if (mux0 <= QPU_MUX_R5 ||
117             mux0 != mux1 ||
118             (src0->addr == src1->addr &&
119              src0->mux == src1->mux)) {
120                 return;
121         }
122
123         if (swap_file(src0) || swap_file(src1))
124                 return;
125
126         if (mux0 == QPU_MUX_A) {
127                 /* Make sure we use the same type of MOV as the instruction,
128                  * in case of unpacks.
129                  */
130                 if (qir_is_float_input(inst))
131                         queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
132                 else
133                         queue(c, qpu_a_MOV(qpu_rb(31), *src0));
134
135                 /* If we had an unpack on this A-file source, we need to put
136                  * it into this MOV, not into the later move from regfile B.
137                  */
138                 if (inst->src[0].pack) {
139                         *last_inst(c) |= *unpack;
140                         *unpack = 0;
141                 }
142                 *src0 = qpu_rb(31);
143         } else {
144                 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
145                 *src0 = qpu_ra(31);
146         }
147 }
148
149 static void
150 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
151 {
152         bool had_pm = *last_inst(c) & QPU_PM;
153         bool had_ws = *last_inst(c) & QPU_WS;
154         uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
155
156         if (!inst->dst.pack)
157                 return;
158
159         *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
160
161         if (qir_is_mul(inst)) {
162                 assert(!unpack || had_pm);
163                 *last_inst(c) |= QPU_PM;
164         } else {
165                 assert(!unpack || !had_pm);
166                 assert(!had_ws); /* dst must be a-file to pack. */
167         }
168 }
169
170 static void
171 handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
172                     struct qpu_reg dst)
173 {
174         if (dst.mux != QPU_MUX_R4)
175                 queue(c, qpu_a_MOV(dst, qpu_r4()));
176         else if (qinst->sf)
177                 queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
178 }
179
180 void
181 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
182 {
183         struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
184         uint32_t inputs_remaining = c->num_inputs;
185         uint32_t vpm_read_fifo_count = 0;
186         uint32_t vpm_read_offset = 0;
187         int last_vpm_read_index = -1;
188
189         list_inithead(&c->qpu_inst_list);
190
191         switch (c->stage) {
192         case QSTAGE_VERT:
193         case QSTAGE_COORD:
194                 /* There's a 4-entry FIFO for VPMVCD reads, each of which can
195                  * load up to 16 dwords (4 vec4s) per vertex.
196                  */
197                 while (inputs_remaining) {
198                         uint32_t num_entries = MIN2(inputs_remaining, 16);
199                         queue(c, qpu_load_imm_ui(qpu_vrsetup(),
200                                                  vpm_read_offset |
201                                                  0x00001a00 |
202                                                  ((num_entries & 0xf) << 20)));
203                         inputs_remaining -= num_entries;
204                         vpm_read_offset += num_entries;
205                         vpm_read_fifo_count++;
206                 }
207                 assert(vpm_read_fifo_count <= 4);
208
209                 queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
210                 break;
211         case QSTAGE_FRAG:
212                 break;
213         }
214
215         list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
216 #if 0
217                 fprintf(stderr, "translating qinst to qpu: ");
218                 qir_dump_inst(qinst);
219                 fprintf(stderr, "\n");
220 #endif
221
222                 static const struct {
223                         uint32_t op;
224                 } translate[] = {
225 #define A(name) [QOP_##name] = {QPU_A_##name}
226 #define M(name) [QOP_##name] = {QPU_M_##name}
227                         A(FADD),
228                         A(FSUB),
229                         A(FMIN),
230                         A(FMAX),
231                         A(FMINABS),
232                         A(FMAXABS),
233                         A(FTOI),
234                         A(ITOF),
235                         A(ADD),
236                         A(SUB),
237                         A(SHL),
238                         A(SHR),
239                         A(ASR),
240                         A(MIN),
241                         A(MAX),
242                         A(AND),
243                         A(OR),
244                         A(XOR),
245                         A(NOT),
246
247                         M(FMUL),
248                         M(V8MULD),
249                         M(V8MIN),
250                         M(V8MAX),
251                         M(V8ADDS),
252                         M(V8SUBS),
253                         M(MUL24),
254
255                         /* If we replicate src[0] out to src[1], this works
256                          * out the same as a MOV.
257                          */
258                         [QOP_MOV] = { QPU_A_OR },
259                         [QOP_FMOV] = { QPU_A_FMAX },
260                         [QOP_MMOV] = { QPU_M_V8MIN },
261                 };
262
263                 uint64_t unpack = 0;
264                 struct qpu_reg src[4];
265                 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
266                         int index = qinst->src[i].index;
267                         switch (qinst->src[i].file) {
268                         case QFILE_NULL:
269                                 src[i] = qpu_rn(0);
270                                 break;
271                         case QFILE_TEMP:
272                                 src[i] = temp_registers[index];
273                                 if (qinst->src[i].pack) {
274                                         assert(!unpack ||
275                                                unpack == qinst->src[i].pack);
276                                         unpack = QPU_SET_FIELD(qinst->src[i].pack,
277                                                                QPU_UNPACK);
278                                         if (src[i].mux == QPU_MUX_R4)
279                                                 unpack |= QPU_PM;
280                                 }
281                                 break;
282                         case QFILE_UNIF:
283                                 src[i] = qpu_unif();
284                                 break;
285                         case QFILE_VARY:
286                                 src[i] = qpu_vary();
287                                 break;
288                         case QFILE_SMALL_IMM:
289                                 src[i].mux = QPU_MUX_SMALL_IMM;
290                                 src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
291                                 /* This should only have returned a valid
292                                  * small immediate field, not ~0 for failure.
293                                  */
294                                 assert(src[i].addr <= 47);
295                                 break;
296                         case QFILE_VPM:
297                                 assert((int)qinst->src[i].index >=
298                                        last_vpm_read_index);
299                                 (void)last_vpm_read_index;
300                                 last_vpm_read_index = qinst->src[i].index;
301                                 src[i] = qpu_ra(QPU_R_VPM);
302                                 break;
303
304                         case QFILE_FRAG_X:
305                                 src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
306                                 break;
307                         case QFILE_FRAG_Y:
308                                 src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
309                                 break;
310                         case QFILE_FRAG_REV_FLAG:
311                                 src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
312                                 break;
313
314                         case QFILE_TLB_COLOR_WRITE:
315                         case QFILE_TLB_COLOR_WRITE_MS:
316                         case QFILE_TLB_Z_WRITE:
317                         case QFILE_TLB_STENCIL_SETUP:
318                                 unreachable("bad qir src file");
319                         }
320                 }
321
322                 struct qpu_reg dst;
323                 switch (qinst->dst.file) {
324                 case QFILE_NULL:
325                         dst = qpu_ra(QPU_W_NOP);
326                         break;
327                 case QFILE_TEMP:
328                         dst = temp_registers[qinst->dst.index];
329                         break;
330                 case QFILE_VPM:
331                         dst = qpu_ra(QPU_W_VPM);
332                         break;
333
334                 case QFILE_TLB_COLOR_WRITE:
335                         dst = qpu_tlbc();
336                         break;
337
338                 case QFILE_TLB_COLOR_WRITE_MS:
339                         dst = qpu_tlbc_ms();
340                         break;
341
342                 case QFILE_TLB_Z_WRITE:
343                         dst = qpu_ra(QPU_W_TLB_Z);
344                         break;
345
346                 case QFILE_TLB_STENCIL_SETUP:
347                         dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
348                         break;
349
350                 case QFILE_VARY:
351                 case QFILE_UNIF:
352                 case QFILE_SMALL_IMM:
353                 case QFILE_FRAG_X:
354                 case QFILE_FRAG_Y:
355                 case QFILE_FRAG_REV_FLAG:
356                         assert(!"not reached");
357                         break;
358                 }
359
360                 bool handled_qinst_cond = false;
361
362                 switch (qinst->op) {
363                 case QOP_RCP:
364                 case QOP_RSQ:
365                 case QOP_EXP2:
366                 case QOP_LOG2:
367                         switch (qinst->op) {
368                         case QOP_RCP:
369                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
370                                                    src[0]) | unpack);
371                                 break;
372                         case QOP_RSQ:
373                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
374                                                    src[0]) | unpack);
375                                 break;
376                         case QOP_EXP2:
377                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
378                                                    src[0]) | unpack);
379                                 break;
380                         case QOP_LOG2:
381                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
382                                                    src[0]) | unpack);
383                                 break;
384                         default:
385                                 abort();
386                         }
387
388                         handle_r4_qpu_write(c, qinst, dst);
389
390                         break;
391
392                 case QOP_MS_MASK:
393                         src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
394                         fixup_raddr_conflict(c, dst, &src[0], &src[1],
395                                              qinst, &unpack);
396                         queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
397                                            src[0], src[1]) | unpack);
398                         break;
399
400                 case QOP_FRAG_Z:
401                 case QOP_FRAG_W:
402                         /* QOP_FRAG_Z/W don't emit instructions, just allocate
403                          * the register to the Z/W payload.
404                          */
405                         break;
406
407                 case QOP_TLB_COLOR_READ:
408                         queue(c, qpu_NOP());
409                         *last_inst(c) = qpu_set_sig(*last_inst(c),
410                                                     QPU_SIG_COLOR_LOAD);
411                         handle_r4_qpu_write(c, qinst, dst);
412                         break;
413
414                 case QOP_VARY_ADD_C:
415                         queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
416                         break;
417
418                 case QOP_TEX_S:
419                 case QOP_TEX_T:
420                 case QOP_TEX_R:
421                 case QOP_TEX_B:
422                         queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
423                                                   (qinst->op - QOP_TEX_S)),
424                                            src[0]) | unpack);
425                         break;
426
427                 case QOP_TEX_DIRECT:
428                         fixup_raddr_conflict(c, dst, &src[0], &src[1],
429                                              qinst, &unpack);
430                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
431                                            src[0], src[1]) | unpack);
432                         break;
433
434                 case QOP_TEX_RESULT:
435                         queue(c, qpu_NOP());
436                         *last_inst(c) = qpu_set_sig(*last_inst(c),
437                                                     QPU_SIG_LOAD_TMU0);
438                         handle_r4_qpu_write(c, qinst, dst);
439                         break;
440
441                 default:
442                         assert(qinst->op < ARRAY_SIZE(translate));
443                         assert(translate[qinst->op].op != 0); /* NOPs */
444
445                         /* Skip emitting the MOV if it's a no-op. */
446                         if (qir_is_raw_mov(qinst) &&
447                             dst.mux == src[0].mux && dst.addr == src[0].addr) {
448                                 break;
449                         }
450
451                         /* If we have only one source, put it in the second
452                          * argument slot as well so that we don't take up
453                          * another raddr just to get unused data.
454                          */
455                         if (qir_get_op_nsrc(qinst->op) == 1)
456                                 src[1] = src[0];
457
458                         fixup_raddr_conflict(c, dst, &src[0], &src[1],
459                                              qinst, &unpack);
460
461                         if (qir_is_mul(qinst)) {
462                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
463                                                     dst,
464                                                     src[0], src[1]) | unpack);
465                                 set_last_cond_mul(c, qinst->cond);
466                         } else {
467                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
468                                                     dst,
469                                                     src[0], src[1]) | unpack);
470                                 set_last_cond_add(c, qinst->cond);
471                         }
472                         handled_qinst_cond = true;
473                         set_last_dst_pack(c, qinst);
474
475                         break;
476                 }
477
478                 assert(qinst->cond == QPU_COND_ALWAYS ||
479                        handled_qinst_cond);
480
481                 if (qinst->sf)
482                         *last_inst(c) |= QPU_SF;
483         }
484
485         uint32_t cycles = qpu_schedule_instructions(c);
486         uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
487
488         /* thread end can't have VPM write or read */
489         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
490                           QPU_WADDR_ADD) == QPU_W_VPM ||
491             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
492                           QPU_WADDR_MUL) == QPU_W_VPM ||
493             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
494                           QPU_RADDR_A) == QPU_R_VPM ||
495             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
496                           QPU_RADDR_B) == QPU_R_VPM) {
497                 qpu_serialize_one_inst(c, qpu_NOP());
498         }
499
500         /* thread end can't have uniform read */
501         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
502                           QPU_RADDR_A) == QPU_R_UNIF ||
503             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
504                           QPU_RADDR_B) == QPU_R_UNIF) {
505                 qpu_serialize_one_inst(c, qpu_NOP());
506         }
507
508         /* thread end can't have TLB operations */
509         if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
510                 qpu_serialize_one_inst(c, qpu_NOP());
511
512         c->qpu_insts[c->qpu_inst_count - 1] =
513                 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
514                             QPU_SIG_PROG_END);
515         qpu_serialize_one_inst(c, qpu_NOP());
516         qpu_serialize_one_inst(c, qpu_NOP());
517
518         switch (c->stage) {
519         case QSTAGE_VERT:
520         case QSTAGE_COORD:
521                 break;
522         case QSTAGE_FRAG:
523                 c->qpu_insts[c->qpu_inst_count - 1] =
524                         qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
525                                     QPU_SIG_SCOREBOARD_UNLOCK);
526                 break;
527         }
528
529         cycles += c->qpu_inst_count - inst_count_at_schedule_time;
530
531         if (vc4_debug & VC4_DEBUG_SHADERDB) {
532                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
533                         qir_get_stage_name(c->stage),
534                         c->program_id, c->variant_id,
535                         cycles);
536         }
537
538         if (vc4_debug & VC4_DEBUG_QPU)
539                 vc4_dump_program(c);
540
541         vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
542
543         free(temp_registers);
544 }