OSDN Git Service

vc4: Handle SF on instructions that write r4.
[android-x86/external-mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34         fprintf(stderr, "%s prog %d/%d QPU:\n",
35                 qir_get_stage_name(c->stage),
36                 c->program_id, c->variant_id);
37
38         for (int i = 0; i < c->qpu_inst_count; i++) {
39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41                 fprintf(stderr, "\n");
42         }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48         struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49         q->inst = inst;
50         list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56         struct queued_qpu_inst *q =
57                 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58         return &q->inst;
59 }
60
/* Patch the add-pipeline condition field of the last queued instruction. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
/* Patch the mul-pipeline condition field of the last queued instruction. */
static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_mul(*inst, cond);
}
72
73 /**
74  * Some special registers can be read from either file, which lets us resolve
75  * raddr conflicts without extra MOVs.
76  */
77 static bool
78 swap_file(struct qpu_reg *src)
79 {
80         switch (src->addr) {
81         case QPU_R_UNIF:
82         case QPU_R_VARY:
83                 if (src->mux == QPU_MUX_SMALL_IMM) {
84                         return false;
85                 } else {
86                         if (src->mux == QPU_MUX_A)
87                                 src->mux = QPU_MUX_B;
88                         else
89                                 src->mux = QPU_MUX_A;
90                         return true;
91                 }
92
93         default:
94                 return false;
95         }
96 }
97
98 /**
99  * This is used to resolve the fact that we might register-allocate two
100  * different operands of an instruction to the same physical register file
101  * even though instructions have only one field for the register file source
102  * address.
103  *
104  * In that case, we need to move one to a temporary that can be used in the
105  * instruction, instead.  We reserve ra31/rb31 for this purpose.
106  */
107 static void
108 fixup_raddr_conflict(struct vc4_compile *c,
109                      struct qpu_reg dst,
110                      struct qpu_reg *src0, struct qpu_reg *src1,
111                      struct qinst *inst, uint64_t *unpack)
112 {
113         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
114         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
115
116         if (mux0 <= QPU_MUX_R5 ||
117             mux0 != mux1 ||
118             (src0->addr == src1->addr &&
119              src0->mux == src1->mux)) {
120                 return;
121         }
122
123         if (swap_file(src0) || swap_file(src1))
124                 return;
125
126         if (mux0 == QPU_MUX_A) {
127                 /* Make sure we use the same type of MOV as the instruction,
128                  * in case of unpacks.
129                  */
130                 if (qir_is_float_input(inst))
131                         queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
132                 else
133                         queue(c, qpu_a_MOV(qpu_rb(31), *src0));
134
135                 /* If we had an unpack on this A-file source, we need to put
136                  * it into this MOV, not into the later move from regfile B.
137                  */
138                 if (inst->src[0].pack) {
139                         *last_inst(c) |= *unpack;
140                         *unpack = 0;
141                 }
142                 *src0 = qpu_rb(31);
143         } else {
144                 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
145                 *src0 = qpu_ra(31);
146         }
147 }
148
149 static void
150 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
151 {
152         bool had_pm = *last_inst(c) & QPU_PM;
153         bool had_ws = *last_inst(c) & QPU_WS;
154         uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
155
156         if (!inst->dst.pack)
157                 return;
158
159         *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
160
161         if (qir_is_mul(inst)) {
162                 assert(!unpack || had_pm);
163                 *last_inst(c) |= QPU_PM;
164         } else {
165                 assert(!unpack || !had_pm);
166                 assert(!had_ws); /* dst must be a-file to pack. */
167         }
168 }
169
170 static void
171 handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
172                     struct qpu_reg dst)
173 {
174         if (dst.mux != QPU_MUX_R4)
175                 queue(c, qpu_a_MOV(dst, qpu_r4()));
176         else if (qinst->sf)
177                 queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
178 }
179
/**
 * Translate the compile's QIR instruction list into the final QPU
 * instruction stream: register-allocate temporaries, emit per-stage VPM
 * setup, lower each qinst to one or more QPU instructions, schedule them,
 * and append the program-end sequence.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        /* NOTE(review): 0x00001a00 is the VPM read-setup
                         * base encoding; confirm field meanings against the
                         * VPM generic block read setup documentation.
                         */
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Direct QOP -> QPU opcode mapping for the simple ALU ops
                 * handled by the default case below.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                /* Accumulated unpack mode for this instruction's sources
                 * (all packed sources must agree, see the assert below).
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        /* r4 unpacks are selected with the
                                         * PM bit set.
                                         */
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads come out of a FIFO, so they must
                                 * be consumed in nondecreasing index order.
                                 */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                /* Silences unused-variable warnings when
                                 * asserts are compiled out.
                                 */
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        /* These files are read-only sources. */
                        assert(!"not reached");
                        break;
                }

                /* Set by cases that encode qinst->cond themselves; checked
                 * at the bottom of the loop.
                 */
                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the operand to the matching SFU
                         * register; the result shows up in r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(c, qinst, dst);

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        assert(!unpack);
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]) | unpack);
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           src[0]) | unpack);
                        set_last_cond_add(c, qinst->cond);
                        handled_qinst_cond = true;
                        break;

                case QOP_TLB_COLOR_READ:
                        /* The color load is requested via a signal on a NOP;
                         * the result arrives in r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                        set_last_cond_add(c, qinst->cond);
                        handled_qinst_cond = true;
                        break;

                case QOP_TLB_COLOR_WRITE_MS:
                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
                        set_last_cond_add(c, qinst->cond);
                        handled_qinst_cond = true;
                        break;

                case QOP_VARY_ADD_C:
                        /* r5 holds the per-varying C coefficient after a
                         * varying read — NOTE(review): confirm r5 semantics
                         * against the varying interpolation docs.
                         */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* TMU coordinate writes: the QPU_W_TMU0_* registers
                         * are laid out in the same order as these QOPs.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        /* Texture results are fetched by a signal on a NOP
                         * and land in r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                /* Any op with a condition must have routed it into the
                 * emitted instruction(s) above.
                 */
                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(c) |= QPU_SF;
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Tag the last instruction with the program-end signal and pad with
         * two NOPs, since instructions after PROG_END still execute.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders release the tile scoreboard on their
                 * final instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        /* Count the post-schedule epilogue instructions in the estimate. */
        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}