src/gallium/drivers/vc4/vc4_qpu_emit.c

   1 /*
   2  * Copyright © 2014 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <inttypes.h>
  25
  26 #include "vc4_context.h"
  27 #include "vc4_qir.h"
  28 #include "vc4_qpu.h"
  29 #include "util/ralloc.h"
  30
  31 static void
  32 vc4_dump_program(struct vc4_compile *c)
  33 {
  34         fprintf(stderr, "%s prog %d/%d QPU:\n",
  35                 qir_get_stage_name(c->stage),
  36                 c->program_id, c->variant_id);
  37
  38         for (int i = 0; i < c->qpu_inst_count; i++) {
  39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
  40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
  41                 fprintf(stderr, "\n");
  42         }
  43         fprintf(stderr, "\n");
  44 }
  45
  46 static void
  47 queue(struct qblock *block, uint64_t inst)
  48 {
  49         struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
  50         q->inst = inst;
  51         list_addtail(&q->link, &block->qpu_inst_list);
  52 }
  53
  54 static uint64_t *
  55 last_inst(struct qblock *block)
  56 {
  57         struct queued_qpu_inst *q =
  58                 (struct queued_qpu_inst *)block->qpu_inst_list.prev;
  59         return &q->inst;
  60 }
  61
  62 static void
  63 set_last_cond_add(struct qblock *block, uint32_t cond)
  64 {
  65         *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
  66 }
  67
  68 static void
  69 set_last_cond_mul(struct qblock *block, uint32_t cond)
  70 {
  71         *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
  72 }
  73
  74 /**
  75  * Some special registers can be read from either file, which lets us resolve
  76  * raddr conflicts without extra MOVs.
  77  */
  78 static bool
  79 swap_file(struct qpu_reg *src)
  80 {
  81         switch (src->addr) {
  82         case QPU_R_UNIF:
  83         case QPU_R_VARY:
  84                 if (src->mux == QPU_MUX_SMALL_IMM) {
  85                         return false;
  86                 } else {
  87                         if (src->mux == QPU_MUX_A)
  88                                 src->mux = QPU_MUX_B;
  89                         else
  90                                 src->mux = QPU_MUX_A;
  91                         return true;
  92                 }
  93
  94         default:
  95                 return false;
  96         }
  97 }
  98
  99 /**
 100  * Sets up the VPM read FIFO before we do any VPM read.
 101  *
 102  * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 103  * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 104  * VPM block.  In the VS/CS (unlike in the FS), the block starts out
 105  * uninitialized, and you need to emit setup to the block before any VPM
 106  * reads/writes.
 107  *
 108  * VRI has a FIFO in each direction, with each FIFO able to hold four
 109  * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
 110  * writes go through the write FIFO.  The read/write setup values from QPU go
 111  * through the write FIFO as well, with a sideband signal indicating that
 112  * they're setup values.  Once a read setup reaches the other side of the
 113  * FIFO, the VPM block will start asynchronously reading vertex attributes and
 114  * filling the read FIFO -- that way hopefully the QPU doesn't have to block
 115  * on reads later.
 116  *
 117  * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 118  * time, which is 4 vec4s.  If more than that is being read (since we support
 119  * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
 120  *
 121  * The existence of the FIFO makes it seem like you should be able to emit
 122  * both setups for the 5-8 attribute cases and then do all the attribute
 123  * reads.  However, once the setup value makes it to the other end of the
 124  * write FIFO, it will immediately update the VPM block's setup register.
 125  * That updated setup register would be used for read FIFO fills from then on,
 126  * breaking whatever remaining VPM values were supposed to be read into the
 127  * read FIFO from the previous attribute set.
 128  *
 129  * As a result, we need to emit the read setup, pull every VPM read value from
 130  * that setup, and only then emit the second setup if applicable.
 131  */
 132 static void
 133 setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
 134 {
 135         if (c->num_inputs_in_fifo) {
 136                 c->num_inputs_in_fifo--;
 137                 return;
 138         }
 139
 140         c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
 141
 142         queue(block,
 143               qpu_load_imm_ui(qpu_vrsetup(),
 144                               c->vpm_read_offset |
 145                               0x00001a00 |
 146                               ((c->num_inputs_in_fifo & 0xf) << 20)));
 147         c->num_inputs_remaining -= c->num_inputs_in_fifo;
 148         c->vpm_read_offset += c->num_inputs_in_fifo;
 149
 150         c->num_inputs_in_fifo--;
 151 }
 152
 153 /**
 154  * This is used to resolve the fact that we might register-allocate two
 155  * different operands of an instruction to the same physical register file
 156  * even though instructions have only one field for the register file source
 157  * address.
 158  *
 159  * In that case, we need to move one to a temporary that can be used in the
 160  * instruction, instead.  We reserve ra31/rb31 for this purpose.
 161  */
 162 static void
 163 fixup_raddr_conflict(struct qblock *block,
 164                      struct qpu_reg dst,
 165                      struct qpu_reg *src0, struct qpu_reg *src1,
 166                      struct qinst *inst, uint64_t *unpack)
 167 {
 168         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
 169         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
 170
 171         if (mux0 <= QPU_MUX_R5 ||
 172             mux0 != mux1 ||
 173             (src0->addr == src1->addr &&
 174              src0->mux == src1->mux)) {
 175                 return;
 176         }
 177
 178         if (swap_file(src0) || swap_file(src1))
 179                 return;
 180
 181         if (mux0 == QPU_MUX_A) {
 182                 /* Make sure we use the same type of MOV as the instruction,
 183                  * in case of unpacks.
 184                  */
 185                 if (qir_is_float_input(inst))
 186                         queue(block, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
 187                 else
 188                         queue(block, qpu_a_MOV(qpu_rb(31), *src0));
 189
 190                 /* If we had an unpack on this A-file source, we need to put
 191                  * it into this MOV, not into the later move from regfile B.
 192                  */
 193                 if (inst->src[0].pack) {
 194                         *last_inst(block) |= *unpack;
 195                         *unpack = 0;
 196                 }
 197                 *src0 = qpu_rb(31);
 198         } else {
 199                 queue(block, qpu_a_MOV(qpu_ra(31), *src0));
 200                 *src0 = qpu_ra(31);
 201         }
 202 }
 203
 204 static void
 205 set_last_dst_pack(struct qblock *block, struct qinst *inst)
 206 {
 207         bool had_pm = *last_inst(block) & QPU_PM;
 208         bool had_ws = *last_inst(block) & QPU_WS;
 209         uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
 210
 211         if (!inst->dst.pack)
 212                 return;
 213
 214         *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
 215
 216         if (qir_is_mul(inst)) {
 217                 assert(!unpack || had_pm);
 218                 *last_inst(block) |= QPU_PM;
 219         } else {
 220                 assert(!unpack || !had_pm);
 221                 assert(!had_ws); /* dst must be a-file to pack. */
 222         }
 223 }
 224
 225 static void
 226 handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
 227                     struct qpu_reg dst)
 228 {
 229         if (dst.mux != QPU_MUX_R4)
 230                 queue(block, qpu_a_MOV(dst, qpu_r4()));
 231         else if (qinst->sf)
 232                 queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
 233 }
 234
 235 static void
 236 vc4_generate_code_block(struct vc4_compile *c,
 237                         struct qblock *block,
 238                         struct qpu_reg *temp_registers)
 239 {
 240         int last_vpm_read_index = -1;
 241
 242         qir_for_each_inst(qinst, block) {
 243 #if 0
 244                 fprintf(stderr, "translating qinst to qpu: ");
 245                 qir_dump_inst(qinst);
 246                 fprintf(stderr, "\n");
 247 #endif
 248
 249                 static const struct {
 250                         uint32_t op;
 251                 } translate[] = {
 252 #define A(name) [QOP_##name] = {QPU_A_##name}
 253 #define M(name) [QOP_##name] = {QPU_M_##name}
 254                         A(FADD),
 255                         A(FSUB),
 256                         A(FMIN),
 257                         A(FMAX),
 258                         A(FMINABS),
 259                         A(FMAXABS),
 260                         A(FTOI),
 261                         A(ITOF),
 262                         A(ADD),
 263                         A(SUB),
 264                         A(SHL),
 265                         A(SHR),
 266                         A(ASR),
 267                         A(MIN),
 268                         A(MAX),
 269                         A(AND),
 270                         A(OR),
 271                         A(XOR),
 272                         A(NOT),
 273
 274                         M(FMUL),
 275                         M(V8MULD),
 276                         M(V8MIN),
 277                         M(V8MAX),
 278                         M(V8ADDS),
 279                         M(V8SUBS),
 280                         M(MUL24),
 281
 282                         /* If we replicate src[0] out to src[1], this works
 283                          * out the same as a MOV.
 284                          */
 285                         [QOP_MOV] = { QPU_A_OR },
 286                         [QOP_FMOV] = { QPU_A_FMAX },
 287                         [QOP_MMOV] = { QPU_M_V8MIN },
 288                 };
 289
 290                 uint64_t unpack = 0;
 291                 struct qpu_reg src[4];
 292                 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
 293                         int index = qinst->src[i].index;
 294                         switch (qinst->src[i].file) {
 295                         case QFILE_NULL:
 296                         case QFILE_LOAD_IMM:
 297                                 src[i] = qpu_rn(0);
 298                                 break;
 299                         case QFILE_TEMP:
 300                                 src[i] = temp_registers[index];
 301                                 if (qinst->src[i].pack) {
 302                                         assert(!unpack ||
 303                                                unpack == qinst->src[i].pack);
 304                                         unpack = QPU_SET_FIELD(qinst->src[i].pack,
 305                                                                QPU_UNPACK);
 306                                         if (src[i].mux == QPU_MUX_R4)
 307                                                 unpack |= QPU_PM;
 308                                 }
 309                                 break;
 310                         case QFILE_UNIF:
 311                                 src[i] = qpu_unif();
 312                                 break;
 313                         case QFILE_VARY:
 314                                 src[i] = qpu_vary();
 315                                 break;
 316                         case QFILE_SMALL_IMM:
 317                                 src[i].mux = QPU_MUX_SMALL_IMM;
 318                                 src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
 319                                 /* This should only have returned a valid
 320                                  * small immediate field, not ~0 for failure.
 321                                  */
 322                                 assert(src[i].addr <= 47);
 323                                 break;
 324                         case QFILE_VPM:
 325                                 setup_for_vpm_read(c, block);
 326                                 assert((int)qinst->src[i].index >=
 327                                        last_vpm_read_index);
 328                                 (void)last_vpm_read_index;
 329                                 last_vpm_read_index = qinst->src[i].index;
 330                                 src[i] = qpu_ra(QPU_R_VPM);
 331                                 break;
 332
 333                         case QFILE_FRAG_X:
 334                                 src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
 335                                 break;
 336                         case QFILE_FRAG_Y:
 337                                 src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
 338                                 break;
 339                         case QFILE_FRAG_REV_FLAG:
 340                                 src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
 341                                 break;
 342                         case QFILE_QPU_ELEMENT:
 343                                 src[i] = qpu_ra(QPU_R_ELEM_QPU);
 344                                 break;
 345
 346                         case QFILE_TLB_COLOR_WRITE:
 347                         case QFILE_TLB_COLOR_WRITE_MS:
 348                         case QFILE_TLB_Z_WRITE:
 349                         case QFILE_TLB_STENCIL_SETUP:
 350                                 unreachable("bad qir src file");
 351                         }
 352                 }
 353
 354                 struct qpu_reg dst;
 355                 switch (qinst->dst.file) {
 356                 case QFILE_NULL:
 357                         dst = qpu_ra(QPU_W_NOP);
 358                         break;
 359                 case QFILE_TEMP:
 360                         dst = temp_registers[qinst->dst.index];
 361                         break;
 362                 case QFILE_VPM:
 363                         dst = qpu_ra(QPU_W_VPM);
 364                         break;
 365
 366                 case QFILE_TLB_COLOR_WRITE:
 367                         dst = qpu_tlbc();
 368                         break;
 369
 370                 case QFILE_TLB_COLOR_WRITE_MS:
 371                         dst = qpu_tlbc_ms();
 372                         break;
 373
 374                 case QFILE_TLB_Z_WRITE:
 375                         dst = qpu_ra(QPU_W_TLB_Z);
 376                         break;
 377
 378                 case QFILE_TLB_STENCIL_SETUP:
 379                         dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
 380                         break;
 381
 382                 case QFILE_VARY:
 383                 case QFILE_UNIF:
 384                 case QFILE_SMALL_IMM:
 385                 case QFILE_LOAD_IMM:
 386                 case QFILE_FRAG_X:
 387                 case QFILE_FRAG_Y:
 388                 case QFILE_FRAG_REV_FLAG:
 389                 case QFILE_QPU_ELEMENT:
 390                         assert(!"not reached");
 391                         break;
 392                 }
 393
 394                 bool handled_qinst_cond = false;
 395
 396                 switch (qinst->op) {
 397                 case QOP_RCP:
 398                 case QOP_RSQ:
 399                 case QOP_EXP2:
 400                 case QOP_LOG2:
 401                         switch (qinst->op) {
 402                         case QOP_RCP:
 403                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
 404                                                        src[0]) | unpack);
 405                                 break;
 406                         case QOP_RSQ:
 407                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
 408                                                        src[0]) | unpack);
 409                                 break;
 410                         case QOP_EXP2:
 411                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
 412                                                        src[0]) | unpack);
 413                                 break;
 414                         case QOP_LOG2:
 415                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
 416                                                        src[0]) | unpack);
 417                                 break;
 418                         default:
 419                                 abort();
 420                         }
 421
 422                         handle_r4_qpu_write(block, qinst, dst);
 423
 424                         break;
 425
 426                 case QOP_LOAD_IMM:
 427                         assert(qinst->src[0].file == QFILE_LOAD_IMM);
 428                         queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
 429                         break;
 430
 431                 case QOP_LOAD_IMM_U2:
 432                         queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
 433                         break;
 434
 435                 case QOP_LOAD_IMM_I2:
 436                         queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
 437                         break;
 438
 439                 case QOP_ROT_MUL:
 440                         /* Rotation at the hardware level occurs on the inputs
 441                          * to the MUL unit, and they must be accumulators in
 442                          * order to have the time necessary to move things.
 443                          */
 444                         assert(src[0].mux <= QPU_MUX_R3);
 445
 446                         queue(block,
 447                               qpu_m_rot(dst, src[0], qinst->src[1].index -
 448                                         QPU_SMALL_IMM_MUL_ROT) | unpack);
 449                         set_last_cond_mul(block, qinst->cond);
 450                         handled_qinst_cond = true;
 451                         set_last_dst_pack(block, qinst);
 452                         break;
 453
 454                 case QOP_MS_MASK:
 455                         src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
 456                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 457                                              qinst, &unpack);
 458                         queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
 459                                                src[0], src[1]) | unpack);
 460                         break;
 461
 462                 case QOP_FRAG_Z:
 463                 case QOP_FRAG_W:
 464                         /* QOP_FRAG_Z/W don't emit instructions, just allocate
 465                          * the register to the Z/W payload.
 466                          */
 467                         break;
 468
 469                 case QOP_TLB_COLOR_READ:
 470                         queue(block, qpu_NOP());
 471                         *last_inst(block) = qpu_set_sig(*last_inst(block),
 472                                                         QPU_SIG_COLOR_LOAD);
 473                         handle_r4_qpu_write(block, qinst, dst);
 474                         break;
 475
 476                 case QOP_VARY_ADD_C:
 477                         queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
 478                         break;
 479
 480                 case QOP_TEX_S:
 481                 case QOP_TEX_T:
 482                 case QOP_TEX_R:
 483                 case QOP_TEX_B:
 484                         queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
 485                                                       (qinst->op - QOP_TEX_S)),
 486                                                src[0]) | unpack);
 487                         break;
 488
 489                 case QOP_TEX_DIRECT:
 490                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 491                                              qinst, &unpack);
 492                         queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
 493                                                src[0], src[1]) | unpack);
 494                         break;
 495
 496                 case QOP_TEX_RESULT:
 497                         queue(block, qpu_NOP());
 498                         *last_inst(block) = qpu_set_sig(*last_inst(block),
 499                                                         QPU_SIG_LOAD_TMU0);
 500                         handle_r4_qpu_write(block, qinst, dst);
 501                         break;
 502
 503                 case QOP_THRSW:
 504                         queue(block, qpu_NOP());
 505                         *last_inst(block) = qpu_set_sig(*last_inst(block),
 506                                                         QPU_SIG_THREAD_SWITCH);
 507                         break;
 508
 509                 case QOP_BRANCH:
 510                         /* The branch target will be updated at QPU scheduling
 511                          * time.
 512                          */
 513                         queue(block, (qpu_branch(qinst->cond, 0) |
 514                                       QPU_BRANCH_REL));
 515                         handled_qinst_cond = true;
 516                         break;
 517
 518                 case QOP_UNIFORMS_RESET:
 519                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 520                                              qinst, &unpack);
 521
 522                         queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
 523                                                src[0], src[1]));
 524                         break;
 525
 526                 default:
 527                         assert(qinst->op < ARRAY_SIZE(translate));
 528                         assert(translate[qinst->op].op != 0); /* NOPs */
 529
 530                         /* Skip emitting the MOV if it's a no-op. */
 531                         if (qir_is_raw_mov(qinst) &&
 532                             dst.mux == src[0].mux && dst.addr == src[0].addr) {
 533                                 break;
 534                         }
 535
 536                         /* If we have only one source, put it in the second
 537                          * argument slot as well so that we don't take up
 538                          * another raddr just to get unused data.
 539                          */
 540                         if (qir_get_op_nsrc(qinst->op) == 1)
 541                                 src[1] = src[0];
 542
 543                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 544                                              qinst, &unpack);
 545
 546                         if (qir_is_mul(qinst)) {
 547                                 queue(block, qpu_m_alu2(translate[qinst->op].op,
 548                                                         dst,
 549                                                         src[0], src[1]) | unpack);
 550                                 set_last_cond_mul(block, qinst->cond);
 551                         } else {
 552                                 queue(block, qpu_a_alu2(translate[qinst->op].op,
 553                                                         dst,
 554                                                         src[0], src[1]) | unpack);
 555                                 set_last_cond_add(block, qinst->cond);
 556                         }
 557                         handled_qinst_cond = true;
 558                         set_last_dst_pack(block, qinst);
 559
 560                         break;
 561                 }
 562
 563                 assert(qinst->cond == QPU_COND_ALWAYS ||
 564                        handled_qinst_cond);
 565
 566                 if (qinst->sf)
 567                         *last_inst(block) |= QPU_SF;
 568         }
 569 }
 570
 571 void
 572 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 573 {
 574         struct qblock *start_block = list_first_entry(&c->blocks,
 575                                                       struct qblock, link);
 576
 577         struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
 578         if (!temp_registers)
 579                 return;
 580
 581         switch (c->stage) {
 582         case QSTAGE_VERT:
 583         case QSTAGE_COORD:
 584                 c->num_inputs_remaining = c->num_inputs;
 585                 queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
 586                 break;
 587         case QSTAGE_FRAG:
 588                 break;
 589         }
 590
 591         qir_for_each_block(block, c)
 592                 vc4_generate_code_block(c, block, temp_registers);
 593
 594         uint32_t cycles = qpu_schedule_instructions(c);
 595         uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
 596
 597         /* thread end can't have VPM write or read */
 598         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 599                           QPU_WADDR_ADD) == QPU_W_VPM ||
 600             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 601                           QPU_WADDR_MUL) == QPU_W_VPM ||
 602             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 603                           QPU_RADDR_A) == QPU_R_VPM ||
 604             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 605                           QPU_RADDR_B) == QPU_R_VPM) {
 606                 qpu_serialize_one_inst(c, qpu_NOP());
 607         }
 608
 609         /* thread end can't have uniform read */
 610         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 611                           QPU_RADDR_A) == QPU_R_UNIF ||
 612             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 613                           QPU_RADDR_B) == QPU_R_UNIF) {
 614                 qpu_serialize_one_inst(c, qpu_NOP());
 615         }
 616
 617         /* thread end can't have TLB operations */
 618         if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
 619                 qpu_serialize_one_inst(c, qpu_NOP());
 620
 621         /* Make sure there's no existing signal set (like for a small
 622          * immediate)
 623          */
 624         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 625                           QPU_SIG) != QPU_SIG_NONE) {
 626                 qpu_serialize_one_inst(c, qpu_NOP());
 627         }
 628
 629         c->qpu_insts[c->qpu_inst_count - 1] =
 630                 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
 631                             QPU_SIG_PROG_END);
 632         qpu_serialize_one_inst(c, qpu_NOP());
 633         qpu_serialize_one_inst(c, qpu_NOP());
 634
 635         switch (c->stage) {
 636         case QSTAGE_VERT:
 637         case QSTAGE_COORD:
 638                 break;
 639         case QSTAGE_FRAG:
 640                 c->qpu_insts[c->qpu_inst_count - 1] =
 641                         qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
 642                                     QPU_SIG_SCOREBOARD_UNLOCK);
 643                 break;
 644         }
 645
 646         cycles += c->qpu_inst_count - inst_count_at_schedule_time;
 647
 648         if (vc4_debug & VC4_DEBUG_SHADERDB) {
 649                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
 650                         qir_get_stage_name(c->stage),
 651                         c->program_id, c->variant_id,
 652                         cycles);
 653         }
 654
 655         if (vc4_debug & VC4_DEBUG_QPU)
 656                 vc4_dump_program(c);
 657
 658         vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
 659
 660         free(temp_registers);
 661 }