/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

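/* Set to true to dump the dependency DAG and each scheduling decision to
 * stderr.
 */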
static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct list_head link;
        struct queued_qpu_inst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /* Longest cycles + n->latency of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

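/* Adds a dependency edge so that "before" is scheduled before "after".  When
 * the state is walking the instruction list in reverse, the two arguments
 * arrive swapped and are exchanged here.  Write-after-read edges are flagged
 * so they can later be released without charging the writer's latency.
 */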
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

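/* Returns whether the instruction's signal implicitly writes the r4
 * accumulator (color, TMU, and alpha-mask loads all return their data in r4).
 */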
static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

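/* Tracks the dependencies implied by one of the two raddr fields.  Varying
 * reads are ordered as writes of r5, VPM reads are kept in their original
 * order, and plain regfile reads depend on the last writer of that register.
 */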
static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}


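/* Tracks the dependencies implied by one of the two waddr fields.  The WS bit
 * swaps which physical regfile the add and mul results land in, TMU and TLB
 * writes are serialized against their respective queues, and the accumulator,
 * VPM, and SFU destinations are ordered against their other users.
 */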
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and the TLB_STENCIL writes have
                         * to be scheduled in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
        case QPU_SIG_BRANCH:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if (inst & QPU_SF)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
{
        struct list_head *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

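/* State tracked across already-emitted instructions so the chooser can honor
 * the QPU's hazard rules: the current tick, the tick of the last SFU write
 * (r4 can't be read for the following two instructions), and the last
 * registers written in physical regfiles A and B (which the very next
 * instruction must not read).
 */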
struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        uint32_t last_waddr_a, last_waddr_b;
};

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

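/* Picks the highest-priority DAG head that can legally issue now.  Hardware
 * hazards (regfile read-after-write, early TLB access) disqualify candidates;
 * when prev_inst is set, only instructions that can merge with it into one
 * ADD+MUL pair (and that don't both need a uniform read) are considered.
 * Ties on priority are broken by the longer critical-path delay.
 */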
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                uint64_t inst = n->inst->inst;

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

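/* Records the chosen instruction's effects on the scoreboard: which physical
 * registers it just wrote (accounting for the WS swap) and whether it kicked
 * off an SFU operation.
 */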
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }
}

static void
dump_state(struct list_head *schedule_list)
{
        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ? 'w' : 'r');
                }
        }
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay + n->latency);
                }
        }
}

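/* After a node has been emitted at "time", drops its outgoing DAG edges.
 * Children whose parent_count reaches zero become new DAG heads and are added
 * to the ready list.  The war_only pass runs before pairing and only releases
 * write-after-read edges, since a WAR-dependent instruction may be scheduled
 * right after (or paired with) its reader without paying any latency.
 */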
static void
mark_instruction_scheduled(struct list_head *schedule_list,
                           uint32_t time,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                /* If the requirement is only that the node not appear before
                 * the last read of its destination, then it can be scheduled
                 * immediately after (or paired with!) the thing reading the
                 * destination.
                 */
                int latency_from_previous = war_only ? 0 : node->latency;
                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency_from_previous);
                child->parent_count--;
                if (child->parent_count == 0)
                        list_add(&child->link, schedule_list);

                node->children[i].node = NULL;
        }
}

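/* Core list-scheduling loop.  Each iteration picks a DAG head (or a NOP when
 * nothing can legally issue), tries to merge a second compatible instruction
 * into the same ADD+MUL instruction, emits the result, and unblocks the
 * children of whatever was scheduled.  Uniforms are re-emitted into
 * c->uniform_data/c->uniform_contents in the order the scheduled instructions
 * will consume them.  Returns the estimated cycle count of the schedule.
 */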
static uint32_t
schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
{
        struct choose_scoreboard scoreboard;
        uint32_t time = 0;

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;

        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(schedule_list);
                fprintf(stderr, "\n");
        }

        /* Remove non-DAG heads from the list. */
        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
                if (n->parent_count != 0)
                        list_del(&n->link);
        }

        while (!list_empty(schedule_list)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(&scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(schedule_list);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        list_del(&chosen->link);
                        mark_instruction_scheduled(schedule_list, time,
                                                   chosen, true);
                        if (chosen->uniform != -1) {
                                c->uniform_data[next_uniform] =
                                        uniform_data[chosen->uniform];
                                c->uniform_contents[next_uniform] =
                                        uniform_contents[chosen->uniform];
                                next_uniform++;
                        }

                        merge = choose_instruction_to_schedule(&scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                list_del(&merge->link);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[next_uniform] =
                                                uniform_data[merge->uniform];
                                        c->uniform_contents[next_uniform] =
                                                uniform_contents[merge->uniform];
                                        next_uniform++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "            resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                qpu_serialize_one_inst(c, inst);

                update_scoreboard_for_chosen(&scoreboard, inst);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(schedule_list, time, chosen, false);
                mark_instruction_scheduled(schedule_list, time, merge, false);

                scoreboard.tick++;
                time++;
        }

        assert(next_uniform == c->num_uniforms);

        return time;
}

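/* Rough estimate, in instruction slots, of how long a value written to waddr
 * takes before a dependent instruction can consume it.  TMU results are given
 * an effectively unbounded latency so the delay-based heuristic hoists
 * texture fetches as early as possible.
 */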
static uint32_t waddr_latency(uint32_t waddr)
{
        if (waddr < 32)
                return 2;

        /* Some huge number, really. */
        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
                return 100;

        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(uint64_t inst)
{
        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

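/* Entry point for the pass: wraps every queued QPU instruction in a
 * schedule_node, builds the forward and reverse dependency edges, computes
 * each node's critical-path delay, and then list-schedules the block into
 * c->qpu_insts.  Returns the estimated number of cycles the program takes.
 */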
uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        void *mem_ctx = ralloc_context(NULL);
        struct list_head schedule_list;

        list_inithead(&schedule_list);

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                list_for_each_entry(struct queued_qpu_inst, q,
                                    &c->qpu_inst_list, link) {
                        vc4_qpu_disasm(&q->inst, 1);
                        fprintf(stderr, "\n");
                }
                fprintf(stderr, "\n");
        }

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_uniform = 0;
        while (!list_empty(&c->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)c->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);

                n->inst = inst;
                n->latency = instruction_latency(inst->inst);

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &schedule_list);
        }
        assert(next_uniform == c->num_uniforms);

        calculate_forward_deps(c, &schedule_list);
        calculate_reverse_deps(c, &schedule_list);

        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
                compute_delay(n);
        }

        uint32_t cycles = schedule_instructions(c, &schedule_list);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        ralloc_free(mem_ctx);

        return cycles;
}