src/gallium/drivers/vc4/vc4_qpu_emit.c

   1 /*
   2  * Copyright © 2014 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <inttypes.h>
  25
  26 #include "vc4_context.h"
  27 #include "vc4_qir.h"
  28 #include "vc4_qpu.h"
  29 #include "util/ralloc.h"
  30
  31 static void
  32 vc4_dump_program(struct vc4_compile *c)
  33 {
  34         fprintf(stderr, "%s prog %d/%d QPU:\n",
  35                 qir_get_stage_name(c->stage),
  36                 c->program_id, c->variant_id);
  37
  38         for (int i = 0; i < c->qpu_inst_count; i++) {
  39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
  40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
  41                 fprintf(stderr, "\n");
  42         }
  43 }
  44
  45 static void
  46 queue(struct vc4_compile *c, uint64_t inst)
  47 {
  48         struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
  49         q->inst = inst;
  50         insert_at_tail(&c->qpu_inst_list, &q->link);
  51 }
  52
  53 static uint64_t *
  54 last_inst(struct vc4_compile *c)
  55 {
  56         struct queued_qpu_inst *q =
  57                 (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
  58         return &q->inst;
  59 }
  60
  61 static void
  62 set_last_cond_add(struct vc4_compile *c, uint32_t cond)
  63 {
  64         *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
  65 }
  66
  67 /**
  68  * Some special registers can be read from either file, which lets us resolve
  69  * raddr conflicts without extra MOVs.
  70  */
  71 static bool
  72 swap_file(struct qpu_reg *src)
  73 {
  74         switch (src->addr) {
  75         case QPU_R_UNIF:
  76         case QPU_R_VARY:
  77                 if (src->mux == QPU_MUX_A)
  78                         src->mux = QPU_MUX_B;
  79                 else
  80                         src->mux = QPU_MUX_A;
  81                 return true;
  82
  83         default:
  84                 return false;
  85         }
  86 }
  87
  88 /**
  89  * This is used to resolve the fact that we might register-allocate two
  90  * different operands of an instruction to the same physical register file
  91  * even though instructions have only one field for the register file source
  92  * address.
  93  *
  94  * In that case, we need to move one to a temporary that can be used in the
  95  * instruction, instead.
  96  */
  97 static bool
  98 fixup_raddr_conflict(struct vc4_compile *c,
  99                      struct qpu_reg dst,
 100                      struct qpu_reg *src0, struct qpu_reg *src1,
 101                      bool r3_live)
 102 {
 103         if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
 104             src0->mux != src1->mux ||
 105             src0->addr == src1->addr) {
 106                 return false;
 107         }
 108
 109         if (swap_file(src0) || swap_file(src1))
 110                 return false;
 111
 112         if (src0->mux == QPU_MUX_A) {
 113                 /* If we're conflicting over the A regfile, then we can just
 114                  * use the reserved rb31.
 115                  */
 116                 queue(c, qpu_a_MOV(qpu_rb(31), *src1));
 117                 *src1 = qpu_rb(31);
 118                 return false;
 119         } else {
 120                 /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
 121                  * rb31, then store our desired value in r3, and tell the
 122                  * caller to put rb31 back into r3 when we're done.
 123                  */
 124                 if (r3_live)
 125                         queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
 126                 queue(c, qpu_a_MOV(qpu_r3(), *src1));
 127
 128                 *src1 = qpu_r3();
 129
 130                 return r3_live && dst.mux != QPU_MUX_R3;
 131         }
 132 }
 133
 134 void
 135 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 136 {
 137         struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
 138         bool discard = false;
 139         uint32_t inputs_remaining = c->num_inputs;
 140         uint32_t vpm_read_fifo_count = 0;
 141         uint32_t vpm_read_offset = 0;
 142         bool written_r3 = false;
 143         bool needs_restore;
 144
 145         make_empty_list(&c->qpu_inst_list);
 146
 147         switch (c->stage) {
 148         case QSTAGE_VERT:
 149         case QSTAGE_COORD:
 150                 /* There's a 4-entry FIFO for VPMVCD reads, each of which can
 151                  * load up to 16 dwords (4 vec4s) per vertex.
 152                  */
 153                 while (inputs_remaining) {
 154                         uint32_t num_entries = MIN2(inputs_remaining, 16);
 155                         queue(c, qpu_load_imm_ui(qpu_vrsetup(),
 156                                                  vpm_read_offset |
 157                                                  0x00001a00 |
 158                                                  ((num_entries & 0xf) << 20)));
 159                         inputs_remaining -= num_entries;
 160                         vpm_read_offset += num_entries;
 161                         vpm_read_fifo_count++;
 162                 }
 163                 assert(vpm_read_fifo_count <= 4);
 164
 165                 queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
 166                 break;
 167         case QSTAGE_FRAG:
 168                 break;
 169         }
 170
 171         struct simple_node *node;
 172         foreach(node, &c->instructions) {
 173                 struct qinst *qinst = (struct qinst *)node;
 174
 175 #if 0
 176                 fprintf(stderr, "translating qinst to qpu: ");
 177                 qir_dump_inst(qinst);
 178                 fprintf(stderr, "\n");
 179 #endif
 180
 181                 static const struct {
 182                         uint32_t op;
 183                         bool is_mul;
 184                 } translate[] = {
 185 #define A(name) [QOP_##name] = {QPU_A_##name, false}
 186 #define M(name) [QOP_##name] = {QPU_M_##name, true}
 187                         A(FADD),
 188                         A(FSUB),
 189                         A(FMIN),
 190                         A(FMAX),
 191                         A(FMINABS),
 192                         A(FMAXABS),
 193                         A(FTOI),
 194                         A(ITOF),
 195                         A(ADD),
 196                         A(SUB),
 197                         A(SHL),
 198                         A(SHR),
 199                         A(ASR),
 200                         A(MIN),
 201                         A(MAX),
 202                         A(AND),
 203                         A(OR),
 204                         A(XOR),
 205                         A(NOT),
 206
 207                         M(FMUL),
 208                         M(MUL24),
 209                 };
 210
 211                 struct qpu_reg src[4];
 212                 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
 213                         int index = qinst->src[i].index;
 214                         switch (qinst->src[i].file) {
 215                         case QFILE_NULL:
 216                                 src[i] = qpu_rn(0);
 217                                 break;
 218                         case QFILE_TEMP:
 219                                 src[i] = temp_registers[index];
 220                                 break;
 221                         case QFILE_UNIF:
 222                                 src[i] = qpu_unif();
 223                                 break;
 224                         case QFILE_VARY:
 225                                 src[i] = qpu_vary();
 226                                 break;
 227                         }
 228                 }
 229
 230                 struct qpu_reg dst;
 231                 switch (qinst->dst.file) {
 232                 case QFILE_NULL:
 233                         dst = qpu_ra(QPU_W_NOP);
 234                         break;
 235                 case QFILE_TEMP:
 236                         dst = temp_registers[qinst->dst.index];
 237                         break;
 238                 case QFILE_VARY:
 239                 case QFILE_UNIF:
 240                         assert(!"not reached");
 241                         break;
 242                 }
 243
 244                 switch (qinst->op) {
 245                 case QOP_MOV:
 246                         /* Skip emitting the MOV if it's a no-op. */
 247                         if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
 248                             dst.mux != src[0].mux || dst.addr != src[0].addr) {
 249                                 queue(c, qpu_a_MOV(dst, src[0]));
 250                         }
 251                         break;
 252
 253                 case QOP_SF:
 254                         queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
 255                         *last_inst(c) |= QPU_SF;
 256                         break;
 257
 258                 case QOP_SEL_X_0_ZS:
 259                 case QOP_SEL_X_0_ZC:
 260                 case QOP_SEL_X_0_NS:
 261                 case QOP_SEL_X_0_NC:
 262                         queue(c, qpu_a_MOV(dst, src[0]));
 263                         set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
 264                                           QPU_COND_ZS);
 265
 266                         queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
 267                         set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
 268                                               1) + QPU_COND_ZS);
 269                         break;
 270
 271                 case QOP_SEL_X_Y_ZS:
 272                 case QOP_SEL_X_Y_ZC:
 273                 case QOP_SEL_X_Y_NS:
 274                 case QOP_SEL_X_Y_NC:
 275                         queue(c, qpu_a_MOV(dst, src[0]));
 276                         set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
 277                                           QPU_COND_ZS);
 278
 279                         queue(c, qpu_a_MOV(dst, src[1]));
 280                         set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
 281                                               1) + QPU_COND_ZS);
 282
 283                         break;
 284
 285                 case QOP_VPM_WRITE:
 286                         queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
 287                         break;
 288
 289                 case QOP_VPM_READ:
 290                         queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
 291                         break;
 292
 293                 case QOP_RCP:
 294                 case QOP_RSQ:
 295                 case QOP_EXP2:
 296                 case QOP_LOG2:
 297                         switch (qinst->op) {
 298                         case QOP_RCP:
 299                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
 300                                                    src[0]));
 301                                 break;
 302                         case QOP_RSQ:
 303                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
 304                                                    src[0]));
 305                                 break;
 306                         case QOP_EXP2:
 307                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
 308                                                    src[0]));
 309                                 break;
 310                         case QOP_LOG2:
 311                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
 312                                                    src[0]));
 313                                 break;
 314                         default:
 315                                 abort();
 316                         }
 317
 318                         queue(c, qpu_a_MOV(dst, qpu_r4()));
 319
 320                         break;
 321
 322                 case QOP_PACK_COLORS: {
 323                         /* We have to be careful not to start writing over one
 324                          * of our source values when incrementally writing the
 325                          * destination.  So, if the dst is one of the srcs, we
 326                          * pack that one first (and we pack 4 channels at once
 327                          * for the first pack).
 328                          */
 329                         struct qpu_reg first_pack = src[0];
 330                         for (int i = 0; i < 4; i++) {
 331                                 if (src[i].mux == dst.mux &&
 332                                     src[i].addr == dst.addr) {
 333                                         first_pack = dst;
 334                                         break;
 335                                 }
 336                         }
 337                         queue(c, qpu_m_MOV(dst, first_pack));
 338                         *last_inst(c) |= QPU_PM;
 339                         *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
 340                                                        QPU_PACK);
 341
 342                         for (int i = 0; i < 4; i++) {
 343                                 if (src[i].mux == first_pack.mux &&
 344                                     src[i].addr == first_pack.addr) {
 345                                         continue;
 346                                 }
 347
 348                                 queue(c, qpu_m_MOV(dst, src[i]));
 349                                 *last_inst(c) |= QPU_PM;
 350                                 *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
 351                                                                QPU_PACK);
 352                         }
 353
 354                         break;
 355                 }
 356
 357                 case QOP_FRAG_X:
 358                         queue(c, qpu_a_ITOF(dst,
 359                                             qpu_ra(QPU_R_XY_PIXEL_COORD)));
 360                         break;
 361
 362                 case QOP_FRAG_Y:
 363                         queue(c, qpu_a_ITOF(dst,
 364                                             qpu_rb(QPU_R_XY_PIXEL_COORD)));
 365                         break;
 366
 367                 case QOP_FRAG_REV_FLAG:
 368                         queue(c, qpu_a_ITOF(dst,
 369                                             qpu_rb(QPU_R_MS_REV_FLAGS)));
 370                         break;
 371
 372                 case QOP_FRAG_Z:
 373                 case QOP_FRAG_W:
 374                         /* QOP_FRAG_Z/W don't emit instructions, just allocate
 375                          * the register to the Z/W payload.
 376                          */
 377                         break;
 378
 379                 case QOP_TLB_DISCARD_SETUP:
 380                         discard = true;
 381                         queue(c, qpu_a_MOV(src[0], src[0]));
 382                         *last_inst(c) |= QPU_SF;
 383                         break;
 384
 385                 case QOP_TLB_STENCIL_SETUP:
 386                         queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
 387                         break;
 388
 389                 case QOP_TLB_Z_WRITE:
 390                         queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
 391                         if (discard) {
 392                                 set_last_cond_add(c, QPU_COND_ZS);
 393                         }
 394                         break;
 395
 396                 case QOP_TLB_COLOR_READ:
 397                         queue(c, qpu_NOP());
 398                         *last_inst(c) = qpu_set_sig(*last_inst(c),
 399                                                     QPU_SIG_COLOR_LOAD);
 400
 401                         break;
 402
 403                 case QOP_TLB_COLOR_WRITE:
 404                         queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
 405                         if (discard) {
 406                                 set_last_cond_add(c, QPU_COND_ZS);
 407                         }
 408                         break;
 409
 410                 case QOP_VARY_ADD_C:
 411                         queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
 412                         break;
 413
 414                 case QOP_PACK_SCALED: {
 415                         uint64_t a = (qpu_a_MOV(dst, src[0]) |
 416                                       QPU_SET_FIELD(QPU_PACK_A_16A,
 417                                                     QPU_PACK));
 418                         uint64_t b = (qpu_a_MOV(dst, src[1]) |
 419                                       QPU_SET_FIELD(QPU_PACK_A_16B,
 420                                                     QPU_PACK));
 421
 422                         if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
 423                                 queue(c, b);
 424                                 queue(c, a);
 425                         } else {
 426                                 queue(c, a);
 427                                 queue(c, b);
 428                         }
 429                         break;
 430                 }
 431
 432                 case QOP_TEX_S:
 433                 case QOP_TEX_T:
 434                 case QOP_TEX_R:
 435                 case QOP_TEX_B:
 436                         queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
 437                                                   (qinst->op - QOP_TEX_S)),
 438                                            src[0]));
 439                         break;
 440
 441                 case QOP_TEX_DIRECT:
 442                         needs_restore = fixup_raddr_conflict(c, dst,
 443                                                              &src[0], &src[1],
 444                                                              written_r3);
 445                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
 446                         if (needs_restore)
 447                                 queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
 448                         break;
 449
 450                 case QOP_TEX_RESULT:
 451                         queue(c, qpu_NOP());
 452                         *last_inst(c) = qpu_set_sig(*last_inst(c),
 453                                                     QPU_SIG_LOAD_TMU0);
 454
 455                         break;
 456
 457                 case QOP_R4_UNPACK_A:
 458                 case QOP_R4_UNPACK_B:
 459                 case QOP_R4_UNPACK_C:
 460                 case QOP_R4_UNPACK_D:
 461                         assert(src[0].mux == QPU_MUX_R4);
 462                         queue(c, qpu_a_MOV(dst, src[0]));
 463                         *last_inst(c) |= QPU_PM;
 464                         *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
 465                                                        (qinst->op -
 466                                                         QOP_R4_UNPACK_A),
 467                                                        QPU_UNPACK);
 468
 469                         break;
 470
 471                 case QOP_UNPACK_8A_F:
 472                 case QOP_UNPACK_8B_F:
 473                 case QOP_UNPACK_8C_F:
 474                 case QOP_UNPACK_8D_F:
 475                         assert(src[0].mux == QPU_MUX_A);
 476
 477                         /* Since we're setting the pack bits, if the
 478                          * destination is in A it would get re-packed.
 479                          */
 480                         queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
 481                                              qpu_rb(31) : dst),
 482                                             src[0], src[0]));
 483                         *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
 484                                                        (qinst->op -
 485                                                         QOP_UNPACK_8A_F),
 486                                                        QPU_UNPACK);
 487
 488                         if (dst.mux == QPU_MUX_A) {
 489                                 queue(c, qpu_a_MOV(dst, qpu_rb(31)));
 490                         }
 491                         break;
 492
 493                 case QOP_UNPACK_8A_I:
 494                 case QOP_UNPACK_8B_I:
 495                 case QOP_UNPACK_8C_I:
 496                 case QOP_UNPACK_8D_I:
 497                         assert(src[0].mux == QPU_MUX_A);
 498
 499                         /* Since we're setting the pack bits, if the
 500                          * destination is in A it would get re-packed.
 501                          */
 502                         queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
 503                                             qpu_rb(31) : dst), src[0]));
 504                         *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
 505                                                        (qinst->op -
 506                                                         QOP_UNPACK_8A_I),
 507                                                        QPU_UNPACK);
 508
 509                         if (dst.mux == QPU_MUX_A) {
 510                                 queue(c, qpu_a_MOV(dst, qpu_rb(31)));
 511                         }
 512                         break;
 513
 514                 default:
 515                         assert(qinst->op < ARRAY_SIZE(translate));
 516                         assert(translate[qinst->op].op != 0); /* NOPs */
 517
 518                         /* If we have only one source, put it in the second
 519                          * argument slot as well so that we don't take up
 520                          * another raddr just to get unused data.
 521                          */
 522                         if (qir_get_op_nsrc(qinst->op) == 1)
 523                                 src[1] = src[0];
 524
 525                         needs_restore = fixup_raddr_conflict(c, dst,
 526                                                              &src[0], &src[1],
 527                                                              written_r3);
 528
 529                         if (translate[qinst->op].is_mul) {
 530                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
 531                                                     dst,
 532                                                     src[0], src[1]));
 533                         } else {
 534                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
 535                                                     dst,
 536                                                     src[0], src[1]));
 537                         }
 538                         if (needs_restore)
 539                                 queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
 540
 541                         break;
 542                 }
 543
 544                 if (dst.mux == QPU_MUX_R3)
 545                         written_r3 = true;
 546         }
 547
 548         qpu_schedule_instructions(c);
 549
 550         /* thread end can't have VPM write or read */
 551         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 552                           QPU_WADDR_ADD) == QPU_W_VPM ||
 553             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 554                           QPU_WADDR_MUL) == QPU_W_VPM ||
 555             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 556                           QPU_RADDR_A) == QPU_R_VPM ||
 557             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 558                           QPU_RADDR_B) == QPU_R_VPM) {
 559                 qpu_serialize_one_inst(c, qpu_NOP());
 560         }
 561
 562         /* thread end can't have uniform read */
 563         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 564                           QPU_RADDR_A) == QPU_R_UNIF ||
 565             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 566                           QPU_RADDR_B) == QPU_R_UNIF) {
 567                 qpu_serialize_one_inst(c, qpu_NOP());
 568         }
 569
 570         /* thread end can't have TLB operations */
 571         if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
 572                 qpu_serialize_one_inst(c, qpu_NOP());
 573
 574         c->qpu_insts[c->qpu_inst_count - 1] =
 575                 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
 576                             QPU_SIG_PROG_END);
 577         qpu_serialize_one_inst(c, qpu_NOP());
 578         qpu_serialize_one_inst(c, qpu_NOP());
 579
 580         switch (c->stage) {
 581         case QSTAGE_VERT:
 582         case QSTAGE_COORD:
 583                 break;
 584         case QSTAGE_FRAG:
 585                 c->qpu_insts[c->qpu_inst_count - 1] =
 586                         qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
 587                                     QPU_SIG_SCOREBOARD_UNLOCK);
 588                 break;
 589         }
 590
 591         if (vc4_debug & VC4_DEBUG_QPU)
 592                 vc4_dump_program(c);
 593
 594         vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
 595
 596         free(temp_registers);
 597 }