2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 #include <linux/module.h>
36 #include <trace/events/sched.h>
41 * Targeted preemption latency for CPU-bound tasks:
42 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
44 * NOTE: this latency value is not the same as the concept of
45 * 'timeslice length' - timeslices in CFS are of variable length
46 * and have no persistent notion like in traditional, time-slice
47 * based scheduling concepts.
49 * (to see the precise effective timeslice length of your workload,
50 * run vmstat and monitor the context-switches (cs) field)
52 unsigned int sysctl_sched_latency = 6000000ULL;
53 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
55 unsigned int sysctl_sched_sync_hint_enable = 1;
56 unsigned int sysctl_sched_cstate_aware = 1;
59 * The initial- and re-scaling of tunables is configurable
60 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
64 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
65 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
67 enum sched_tunable_scaling sysctl_sched_tunable_scaling
68 = SCHED_TUNABLESCALING_LOG;
71 * Minimal preemption granularity for CPU-bound tasks:
72 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
74 unsigned int sysctl_sched_min_granularity = 750000ULL;
75 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
78 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
80 static unsigned int sched_nr_latency = 8;
83 * After fork, child runs first. If set to 0 (default) then
84 * parent will (try to) run first.
86 unsigned int sysctl_sched_child_runs_first __read_mostly;
89 * SCHED_OTHER wake-up granularity.
90 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
92 * This option delays the preemption effects of decoupled workloads
93 * and reduces their over-scheduling. Synchronous workloads will still
94 * have immediate wakeup/sleep latencies.
96 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
97 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
99 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
102 * The exponential sliding window over which load is averaged for shares
103 * distribution. (default: 10 msec, units: nanoseconds)
106 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
108 #ifdef CONFIG_CFS_BANDWIDTH
110 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
111 * each time a cfs_rq requests quota.
113 * Note: in the case that the slice exceeds the runtime remaining (either due
114 * to consumption or the quota being specified to be smaller than the slice)
115 * we will always only issue the remaining available time.
117 * default: 5 msec, units: microseconds
119 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
123 * The margin used when comparing utilization with CPU capacity:
124 * util * margin < capacity * 1024
126 unsigned int capacity_margin = 1280; /* ~20% */
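/*
 * Worked example (illustrative only, not part of the original source): with
 * capacity_margin = 1280 the test "util * margin < capacity * 1024" requires
 * util to stay below roughly 80% of the CPU's capacity. E.g. util = 410 on a
 * CPU of capacity 512 fails, since 410 * 1280 = 524800 is not less than
 * 512 * 1024 = 524288, so a bigger CPU would be preferred for that task.
 */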
128 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
134 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
140 static inline void update_load_set(struct load_weight *lw, unsigned long w)
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the number of CPUs.
153 * This idea comes from the SD scheduler of Con Kolivas:
155 static unsigned int get_update_sysctl_factor(void)
157 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
160 switch (sysctl_sched_tunable_scaling) {
161 case SCHED_TUNABLESCALING_NONE:
164 case SCHED_TUNABLESCALING_LINEAR:
167 case SCHED_TUNABLESCALING_LOG:
169 factor = 1 + ilog2(cpus);
176 static void update_sysctl(void)
178 unsigned int factor = get_update_sysctl_factor();
180 #define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity);
183 SET_SYSCTL(sched_latency);
184 SET_SYSCTL(sched_wakeup_granularity);
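/*
 * Illustrative example of the scaling above (assumes 8 or more online CPUs
 * and the default SCHED_TUNABLESCALING_LOG): get_update_sysctl_factor()
 * clamps cpus to 8, so factor = 1 + ilog2(8) = 4 and the effective values
 * become:
 *   sysctl_sched_latency            = 6ms    * 4 = 24ms
 *   sysctl_sched_min_granularity    = 0.75ms * 4 = 3ms
 *   sysctl_sched_wakeup_granularity = 1ms    * 4 = 4ms
 */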
188 void sched_init_granularity(void)
193 #define WMULT_CONST (~0U)
194 #define WMULT_SHIFT 32
196 static void __update_inv_weight(struct load_weight *lw)
200 if (likely(lw->inv_weight))
203 w = scale_load_down(lw->weight);
205 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
207 else if (unlikely(!w))
208 lw->inv_weight = WMULT_CONST;
210 lw->inv_weight = WMULT_CONST / w;
214 * delta_exec * weight / lw.weight
216 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
218 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
219 * we're guaranteed shift stays positive because inv_weight is guaranteed to
220 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
222 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
223 * weight/lw.weight <= 1, and therefore our shift will also be positive.
225 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
227 u64 fact = scale_load_down(weight);
228 int shift = WMULT_SHIFT;
230 __update_inv_weight(lw);
232 if (unlikely(fact >> 32)) {
239 /* hint to use a 32x32->64 mul */
240 fact = (u64)(u32)fact * lw->inv_weight;
247 return mul_u64_u32_shr(delta_exec, fact, shift);
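/*
 * Worked example (illustrative only): a nice-0 entity (weight 1024, ignoring
 * scale_load) on a queue with lw.weight = 2048 gets
 *   lw->inv_weight = WMULT_CONST / 2048   (~2^21)
 *   fact           = 1024 * inv_weight    (~2^31, fits in 64 bits, shift = 32)
 * so for delta_exec = 1000000ns the result is
 *   (1000000 * fact) >> 32  ~=  500000ns,
 * i.e. delta_exec * weight / lw.weight up to fixed-point rounding.
 */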
251 static int active_load_balance_cpu_stop(void *data);
254 const struct sched_class fair_sched_class;
256 /**************************************************************
257 * CFS operations on generic schedulable entities:
260 #ifdef CONFIG_FAIR_GROUP_SCHED
262 /* cpu runqueue to which this cfs_rq is attached */
263 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
268 /* An entity is a task if it doesn't "own" a runqueue */
269 #define entity_is_task(se) (!se->my_q)
271 static inline struct task_struct *task_of(struct sched_entity *se)
273 #ifdef CONFIG_SCHED_DEBUG
274 WARN_ON_ONCE(!entity_is_task(se));
276 return container_of(se, struct task_struct, se);
279 /* Walk up scheduling entities hierarchy */
280 #define for_each_sched_entity(se) \
281 for (; se; se = se->parent)
283 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
288 /* runqueue on which this entity is (to be) queued */
289 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
294 /* runqueue "owned" by this group */
295 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
300 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
302 if (!cfs_rq->on_list) {
303 struct rq *rq = rq_of(cfs_rq);
304 int cpu = cpu_of(rq);
306 * Ensure we either appear before our parent (if already
307 * enqueued) or force our parent to appear after us when it is
308 * enqueued. The fact that we always enqueue bottom-up
309 * reduces this to two cases and a special case for the root
310 * cfs_rq. Furthermore, it also means that we will always reset
311 * tmp_alone_branch either when the branch is connected
312 * to a tree or when we reach the top of the tree.
314 if (cfs_rq->tg->parent &&
315 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
317 * If parent is already on the list, we add the child
318 * just before it. Thanks to the circular nature of
319 * the list, this means putting the child at the tail
320 * of the list that starts at the parent.
322 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
323 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
325 * The branch is now connected to its tree so we can
326 * reset tmp_alone_branch to the beginning of the list.
329 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
330 } else if (!cfs_rq->tg->parent) {
332 * A cfs_rq without a parent should be put
333 * at the tail of the list.
335 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
336 &rq->leaf_cfs_rq_list);
338 * We have reached the top of a tree so we can reset
339 * tmp_alone_branch to the beginning of the list.
341 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
344 * The parent has not already been added so we want to
345 * make sure that it will be put after us.
346 * tmp_alone_branch points to the beginning of the branch
347 * where we will add the parent.
349 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
350 rq->tmp_alone_branch);
352 * Update tmp_alone_branch to point to the new beginning of the branch.
355 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
362 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
364 if (cfs_rq->on_list) {
365 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
370 /* Iterate through all leaf cfs_rq's on a runqueue */
371 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
372 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
374 /* Do the two (enqueued) entities belong to the same group ? */
375 static inline struct cfs_rq *
376 is_same_group(struct sched_entity *se, struct sched_entity *pse)
378 if (se->cfs_rq == pse->cfs_rq)
384 static inline struct sched_entity *parent_entity(struct sched_entity *se)
390 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
392 int se_depth, pse_depth;
395 * preemption test can be made between sibling entities who are in the
396 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
397 * both tasks until we find their ancestors who are siblings of a common parent.
401 /* First walk up until both entities are at same depth */
402 se_depth = (*se)->depth;
403 pse_depth = (*pse)->depth;
405 while (se_depth > pse_depth) {
407 *se = parent_entity(*se);
410 while (pse_depth > se_depth) {
412 *pse = parent_entity(*pse);
415 while (!is_same_group(*se, *pse)) {
416 *se = parent_entity(*se);
417 *pse = parent_entity(*pse);
421 #else /* !CONFIG_FAIR_GROUP_SCHED */
423 static inline struct task_struct *task_of(struct sched_entity *se)
425 return container_of(se, struct task_struct, se);
428 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
430 return container_of(cfs_rq, struct rq, cfs);
433 #define entity_is_task(se) 1
435 #define for_each_sched_entity(se) \
436 for (; se; se = NULL)
438 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
440 return &task_rq(p)->cfs;
443 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
445 struct task_struct *p = task_of(se);
446 struct rq *rq = task_rq(p);
451 /* runqueue "owned" by this group */
452 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
461 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
465 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
466 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
468 static inline struct sched_entity *parent_entity(struct sched_entity *se)
474 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
478 #endif /* CONFIG_FAIR_GROUP_SCHED */
480 static __always_inline
481 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
483 /**************************************************************
484 * Scheduling class tree data structure manipulation methods:
487 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
489 s64 delta = (s64)(vruntime - max_vruntime);
491 max_vruntime = vruntime;
496 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
498 s64 delta = (s64)(vruntime - min_vruntime);
500 min_vruntime = vruntime;
505 static inline int entity_before(struct sched_entity *a,
506 struct sched_entity *b)
508 return (s64)(a->vruntime - b->vruntime) < 0;
511 static void update_min_vruntime(struct cfs_rq *cfs_rq)
513 u64 vruntime = cfs_rq->min_vruntime;
516 vruntime = cfs_rq->curr->vruntime;
518 if (cfs_rq->rb_leftmost) {
519 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
524 vruntime = se->vruntime;
526 vruntime = min_vruntime(vruntime, se->vruntime);
529 /* ensure we never gain time by being placed backwards. */
530 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
533 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
538 * Enqueue an entity into the rb-tree:
540 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
543 struct rb_node *parent = NULL;
544 struct sched_entity *entry;
548 * Find the right place in the rbtree:
552 entry = rb_entry(parent, struct sched_entity, run_node);
554 * We don't care about collisions. Nodes with
555 * the same key stay together.
557 if (entity_before(se, entry)) {
558 link = &parent->rb_left;
560 link = &parent->rb_right;
566 * Maintain a cache of leftmost tree entries (it is frequently used):
570 cfs_rq->rb_leftmost = &se->run_node;
572 rb_link_node(&se->run_node, parent, link);
573 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
576 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
578 if (cfs_rq->rb_leftmost == &se->run_node) {
579 struct rb_node *next_node;
581 next_node = rb_next(&se->run_node);
582 cfs_rq->rb_leftmost = next_node;
585 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
588 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
590 struct rb_node *left = cfs_rq->rb_leftmost;
595 return rb_entry(left, struct sched_entity, run_node);
598 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
600 struct rb_node *next = rb_next(&se->run_node);
605 return rb_entry(next, struct sched_entity, run_node);
608 #ifdef CONFIG_SCHED_DEBUG
609 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
611 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
616 return rb_entry(last, struct sched_entity, run_node);
619 /**************************************************************
620 * Scheduling class statistics methods:
623 int sched_proc_update_handler(struct ctl_table *table, int write,
624 void __user *buffer, size_t *lenp,
627 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
628 unsigned int factor = get_update_sysctl_factor();
633 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
634 sysctl_sched_min_granularity);
636 #define WRT_SYSCTL(name) \
637 (normalized_sysctl_##name = sysctl_##name / (factor))
638 WRT_SYSCTL(sched_min_granularity);
639 WRT_SYSCTL(sched_latency);
640 WRT_SYSCTL(sched_wakeup_granularity);
650 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
652 if (unlikely(se->load.weight != NICE_0_LOAD))
653 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
659 * The idea is to set a period in which each task runs once.
661 * When there are too many tasks (sched_nr_latency) we have to stretch
662 * this period because otherwise the slices get too small.
664 * p = (nr <= nl) ? l : l*nr/nl
666 static u64 __sched_period(unsigned long nr_running)
668 if (unlikely(nr_running > sched_nr_latency))
669 return nr_running * sysctl_sched_min_granularity;
671 return sysctl_sched_latency;
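/*
 * Worked example (illustrative only), using the defaults above
 * (sysctl_sched_latency = 6ms, sysctl_sched_min_granularity = 0.75ms,
 * sched_nr_latency = 8):
 *   5 runnable tasks  -> period = 6ms (every task still runs within the latency)
 *   12 runnable tasks -> period = 12 * 0.75ms = 9ms (stretched so no slice
 *                        drops below the minimum granularity)
 */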
675 * We calculate the wall-time slice from the period by taking a part
676 * proportional to the weight.
680 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
682 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
684 for_each_sched_entity(se) {
685 struct load_weight *load;
686 struct load_weight lw;
688 cfs_rq = cfs_rq_of(se);
689 load = &cfs_rq->load;
691 if (unlikely(!se->on_rq)) {
694 update_load_add(&lw, se->load.weight);
697 slice = __calc_delta(slice, se->load.weight, load);
703 * We calculate the vruntime slice of a to-be-inserted task.
707 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
709 return calc_delta_fair(sched_slice(cfs_rq, se), se);
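/*
 * Worked example (illustrative only, ignoring scale_load): with a 6ms period
 * and three runnable entities of weights 2048, 1024 and 1024 (total 4096),
 * sched_slice() gives 6ms * 2048/4096 = 3ms to the heavy entity and 1.5ms to
 * each of the others. sched_vslice() then converts the heavy entity's 3ms
 * wall-time slice back into virtual time: 3ms * NICE_0_LOAD/2048 = 1.5ms, so
 * every entity advances its vruntime by the same amount per period.
 */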
713 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
714 static unsigned long task_h_load(struct task_struct *p);
717 * We choose a half-life close to 1 scheduling period.
718 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
719 * dependent on this value.
721 #define LOAD_AVG_PERIOD 32
722 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
723 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
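/*
 * Illustrative note on the constants above: the per-period decay factor y is
 * chosen so that y^32 = 1/2, i.e. a contribution loses half its weight after
 * LOAD_AVG_PERIOD (32) periods of 1024us each. Summing the geometric series
 * 1024 * (1 + y + y^2 + ...) with that y saturates near LOAD_AVG_MAX (47742),
 * and it takes about LOAD_AVG_MAX_N (345) fully busy periods to get there.
 */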
725 /* Give a new sched_entity starting runnable values so it is seen as a heavy task until its load stabilizes */
726 void init_entity_runnable_average(struct sched_entity *se)
728 struct sched_avg *sa = &se->avg;
730 sa->last_update_time = 0;
732 * sched_avg's period_contrib should be strictly less than 1024, so
733 * we give it 1023 to make sure it is almost a full period (1024us), and
734 * will definitely be updated (after enqueue).
736 sa->period_contrib = 1023;
738 * Tasks are initialized with full load to be seen as heavy tasks until
739 * they get a chance to stabilize to their real load level.
740 * Group entities are initialized with zero load to reflect the fact that
741 * nothing has been attached to the task group yet.
743 if (entity_is_task(se))
744 sa->load_avg = scale_load_down(se->load.weight);
745 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
747 * In previous Android versions, we used to have:
748 * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
749 * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
750 * However, that functionality has been moved to enqueue.
751 * It is unclear if we should restore this in enqueue.
754 * At this point, util_avg won't be used in select_task_rq_fair anyway
758 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
761 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
762 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
763 static void attach_entity_cfs_rq(struct sched_entity *se);
764 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
767 * With new tasks being created, their initial util_avgs are extrapolated
768 * based on the cfs_rq's current util_avg:
770 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
772 * However, in many cases, the above util_avg does not give a desired
773 * value. Moreover, the sum of the util_avgs may be divergent, such
774 * as when the series is a harmonic series.
776 * To solve this problem, we also cap the util_avg of successive tasks to
777 * only 1/2 of the remaining utilization budget:
779 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
781 * where n denotes the nth task.
783 * For example, a simplest series from the beginning would be like:
785 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
786 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
788 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
789 * if util_avg > util_avg_cap.
791 void post_init_entity_util_avg(struct sched_entity *se)
793 struct cfs_rq *cfs_rq = cfs_rq_of(se);
794 struct sched_avg *sa = &se->avg;
795 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
798 if (cfs_rq->avg.util_avg != 0) {
799 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
800 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
802 if (sa->util_avg > cap)
808 * If we wish to restore tuning via setting initial util,
809 * this is where we should do it.
811 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
814 if (entity_is_task(se)) {
815 struct task_struct *p = task_of(se);
816 if (p->sched_class != &fair_sched_class) {
818 * For !fair tasks do:
820 update_cfs_rq_load_avg(now, cfs_rq, false);
821 attach_entity_load_avg(cfs_rq, se);
822 switched_from_fair(rq, p);
824 * such that the next switched_to_fair() has the expected state.
827 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
832 attach_entity_cfs_rq(se);
835 #else /* !CONFIG_SMP */
836 void init_entity_runnable_average(struct sched_entity *se)
839 void post_init_entity_util_avg(struct sched_entity *se)
842 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
845 #endif /* CONFIG_SMP */
848 * Update the current task's runtime statistics.
850 static void update_curr(struct cfs_rq *cfs_rq)
852 struct sched_entity *curr = cfs_rq->curr;
853 u64 now = rq_clock_task(rq_of(cfs_rq));
859 delta_exec = now - curr->exec_start;
860 if (unlikely((s64)delta_exec <= 0))
863 curr->exec_start = now;
865 schedstat_set(curr->statistics.exec_max,
866 max(delta_exec, curr->statistics.exec_max));
868 curr->sum_exec_runtime += delta_exec;
869 schedstat_add(cfs_rq, exec_clock, delta_exec);
871 curr->vruntime += calc_delta_fair(delta_exec, curr);
872 update_min_vruntime(cfs_rq);
874 if (entity_is_task(curr)) {
875 struct task_struct *curtask = task_of(curr);
877 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
878 cpuacct_charge(curtask, delta_exec);
879 account_group_exec_runtime(curtask, delta_exec);
882 account_cfs_rq_runtime(cfs_rq, delta_exec);
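/*
 * Worked example (illustrative only): if a nice-0 task (weight NICE_0_LOAD)
 * runs for delta_exec = 2ms, calc_delta_fair() leaves the delta untouched and
 * its vruntime advances by 2ms. A task with twice that weight would only
 * advance by 1ms for the same 2ms of CPU time, so heavier tasks fall behind
 * in virtual time more slowly and therefore get picked more often.
 */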
885 static void update_curr_fair(struct rq *rq)
887 update_curr(cfs_rq_of(&rq->curr->se));
890 #ifdef CONFIG_SCHEDSTATS
892 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
894 u64 wait_start = rq_clock(rq_of(cfs_rq));
896 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
897 likely(wait_start > se->statistics.wait_start))
898 wait_start -= se->statistics.wait_start;
900 se->statistics.wait_start = wait_start;
904 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
906 struct task_struct *p;
907 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
909 if (entity_is_task(se)) {
911 if (task_on_rq_migrating(p)) {
913 * Preserve migrating task's wait time so wait_start
914 * time stamp can be adjusted to accumulate wait time
915 * prior to migration.
917 se->statistics.wait_start = delta;
920 trace_sched_stat_wait(p, delta);
923 se->statistics.wait_max = max(se->statistics.wait_max, delta);
924 se->statistics.wait_count++;
925 se->statistics.wait_sum += delta;
926 se->statistics.wait_start = 0;
930 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
935 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
941 * Task is being enqueued - update stats:
943 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
946 * Are we enqueueing a waiting task? (for current tasks
947 * a dequeue/enqueue event is a NOP)
949 if (se != cfs_rq->curr)
950 update_stats_wait_start(cfs_rq, se);
954 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
957 * Mark the end of the wait period if dequeueing a waiting task:
960 if (se != cfs_rq->curr)
961 update_stats_wait_end(cfs_rq, se);
965 * We are picking a new current task - update its stats:
968 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
971 * We are starting a new run period:
973 se->exec_start = rq_clock_task(rq_of(cfs_rq));
976 /**************************************************
977 * Scheduling class queueing methods:
980 #ifdef CONFIG_NUMA_BALANCING
982 * Approximate time to scan a full NUMA task in ms. The task scan period is
983 * calculated based on the task's virtual memory size and
984 * numa_balancing_scan_size.
986 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
987 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
989 /* Portion of address space to scan in MB */
990 unsigned int sysctl_numa_balancing_scan_size = 256;
992 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
993 unsigned int sysctl_numa_balancing_scan_delay = 1000;
995 static unsigned int task_nr_scan_windows(struct task_struct *p)
997 unsigned long rss = 0;
998 unsigned long nr_scan_pages;
1001 * Calculations based on RSS as non-present and empty pages are skipped
1002 * by the PTE scanner, and NUMA hinting faults should be trapped based on resident pages.
1005 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1006 rss = get_mm_rss(p->mm);
1008 rss = nr_scan_pages;
1010 rss = round_up(rss, nr_scan_pages);
1011 return rss / nr_scan_pages;
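/*
 * Worked example (illustrative only, assuming 4KB pages): with the default
 * sysctl_numa_balancing_scan_size of 256MB, nr_scan_pages is 65536. A task
 * with 1GB of RSS (262144 pages) therefore gets 262144 / 65536 = 4 scan
 * windows; a tiny task is rounded up to a single window.
 */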
1014 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1015 #define MAX_SCAN_WINDOW 2560
1017 static unsigned int task_scan_min(struct task_struct *p)
1019 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1020 unsigned int scan, floor;
1021 unsigned int windows = 1;
1023 if (scan_size < MAX_SCAN_WINDOW)
1024 windows = MAX_SCAN_WINDOW / scan_size;
1025 floor = 1000 / windows;
1027 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1028 return max_t(unsigned int, floor, scan);
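/*
 * Worked example (illustrative only, continuing the 1GB-RSS case above with
 * the default 256MB scan size): scan_size (256) < MAX_SCAN_WINDOW (2560), so
 * windows = 10 and floor = 1000/10 = 100ms. The task-size based value is
 * scan_period_min / nr_scan_windows = 1000ms / 4 = 250ms, so the effective
 * minimum scan period is max(100ms, 250ms) = 250ms.
 */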
1031 static unsigned int task_scan_max(struct task_struct *p)
1033 unsigned int smin = task_scan_min(p);
1036 /* Watch for min being lower than max due to floor calculations */
1037 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1038 return max(smin, smax);
1041 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1043 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1044 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1047 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1049 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1050 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1056 spinlock_t lock; /* nr_tasks, tasks */
1060 struct rcu_head rcu;
1061 nodemask_t active_nodes;
1062 unsigned long total_faults;
1064 * Faults_cpu is used to decide whether memory should move
1065 * towards the CPU. As a consequence, these stats are weighted
1066 * more by CPU use than by memory faults.
1068 unsigned long *faults_cpu;
1069 unsigned long faults[0];
1072 /* Shared or private faults. */
1073 #define NR_NUMA_HINT_FAULT_TYPES 2
1075 /* Memory and CPU locality */
1076 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1078 /* Averaged statistics, and temporary buffers. */
1079 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1081 pid_t task_numa_group_id(struct task_struct *p)
1083 return p->numa_group ? p->numa_group->gid : 0;
1087 * The averaged statistics, shared & private, memory & cpu,
1088 * occupy the first half of the array. The second half of the
1089 * array is for current counters, which are averaged into the
1090 * first set by task_numa_placement.
1092 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1094 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1097 static inline unsigned long task_faults(struct task_struct *p, int nid)
1099 if (!p->numa_faults)
1102 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1103 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1106 static inline unsigned long group_faults(struct task_struct *p, int nid)
1111 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1112 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1115 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1117 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1118 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1121 /* Handle placement on systems where not all nodes are directly connected. */
1122 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1123 int maxdist, bool task)
1125 unsigned long score = 0;
1129 * All nodes are directly connected, and the same distance
1130 * from each other. No need for fancy placement algorithms.
1132 if (sched_numa_topology_type == NUMA_DIRECT)
1136 * This code is called for each node, introducing N^2 complexity,
1137 * which should be ok given the number of nodes rarely exceeds 8.
1139 for_each_online_node(node) {
1140 unsigned long faults;
1141 int dist = node_distance(nid, node);
1144 * The furthest away nodes in the system are not interesting
1145 * for placement; nid was already counted.
1147 if (dist == sched_max_numa_distance || node == nid)
1151 * On systems with a backplane NUMA topology, compare groups
1152 * of nodes, and move tasks towards the group with the most
1153 * memory accesses. When comparing two nodes at distance
1154 * "hoplimit", only nodes closer by than "hoplimit" are part
1155 * of each group. Skip other nodes.
1157 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1161 /* Add up the faults from nearby nodes. */
1163 faults = task_faults(p, node);
1165 faults = group_faults(p, node);
1168 * On systems with a glueless mesh NUMA topology, there are
1169 * no fixed "groups of nodes". Instead, nodes that are not
1170 * directly connected bounce traffic through intermediate
1171 * nodes; a numa_group can occupy any set of nodes.
1172 * The further away a node is, the less the faults count.
1173 * This seems to result in good task placement.
1175 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1176 faults *= (sched_max_numa_distance - dist);
1177 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1187 * These return the fraction of accesses done by a particular task, or
1188 * task group, on a particular numa node. The group weight is given a
1189 * larger multiplier, in order to group tasks together that are almost
1190 * evenly spread out between numa nodes.
1192 static inline unsigned long task_weight(struct task_struct *p, int nid,
1195 unsigned long faults, total_faults;
1197 if (!p->numa_faults)
1200 total_faults = p->total_numa_faults;
1205 faults = task_faults(p, nid);
1206 faults += score_nearby_nodes(p, nid, dist, true);
1208 return 1000 * faults / total_faults;
1211 static inline unsigned long group_weight(struct task_struct *p, int nid,
1214 unsigned long faults, total_faults;
1219 total_faults = p->numa_group->total_faults;
1224 faults = group_faults(p, nid);
1225 faults += score_nearby_nodes(p, nid, dist, false);
1227 return 1000 * faults / total_faults;
1230 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1231 int src_nid, int dst_cpu)
1233 struct numa_group *ng = p->numa_group;
1234 int dst_nid = cpu_to_node(dst_cpu);
1235 int last_cpupid, this_cpupid;
1237 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1240 * Multi-stage node selection is used in conjunction with a periodic
1241 * migration fault to build a temporal task<->page relation. By using
1242 * a two-stage filter we remove short/unlikely relations.
1244 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1245 * a task's usage of a particular page (n_p) per total usage of this
1246 * page (n_t) (in a given time-span) to a probability.
1248 * Our periodic faults will sample this probability and getting the
1249 * same result twice in a row, given these samples are fully
1250 * independent, is then given by P(n)^2, provided our sample period
1251 * is sufficiently short compared to the usage pattern.
1253 * This quadratic squishes small probabilities, making it less likely we
1254 * act on an unlikely task<->page relation.
1256 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1257 if (!cpupid_pid_unset(last_cpupid) &&
1258 cpupid_to_nid(last_cpupid) != dst_nid)
1261 /* Always allow migrate on private faults */
1262 if (cpupid_match_pid(p, last_cpupid))
1265 /* A shared fault, but p->numa_group has not been set up yet. */
1270 * Do not migrate if the destination is not a node that
1271 * is actively used by this numa group.
1273 if (!node_isset(dst_nid, ng->active_nodes))
1277 * Source is a node that is not actively used by this
1278 * numa group, while the destination is. Migrate.
1280 if (!node_isset(src_nid, ng->active_nodes))
1284 * Both source and destination are nodes in active
1285 * use by this numa group. Maximize memory bandwidth
1286 * by migrating from more heavily used groups, to less
1287 * heavily used ones, spreading the load around.
1288 * Use a 1/4 hysteresis to avoid spurious page movement.
1290 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
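/*
 * Worked example of the 1/4 hysteresis above (illustrative only): if the
 * numa_group has 400 faults on the source node and 280 on the destination,
 * 280 < 400 * 3/4 = 300 holds and the page may migrate towards the less
 * heavily used destination. With 320 faults on the destination the test
 * fails, so near-equal nodes do not ping-pong pages back and forth.
 */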
1293 static unsigned long weighted_cpuload(const int cpu);
1294 static unsigned long source_load(int cpu, int type);
1295 static unsigned long target_load(int cpu, int type);
1296 static unsigned long capacity_of(int cpu);
1297 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1299 /* Cached statistics for all CPUs within a node */
1301 unsigned long nr_running;
1304 /* Total compute capacity of CPUs on a node */
1305 unsigned long compute_capacity;
1307 /* Approximate capacity in terms of runnable tasks on a node */
1308 unsigned long task_capacity;
1309 int has_free_capacity;
1313 * XXX borrowed from update_sg_lb_stats
1315 static void update_numa_stats(struct numa_stats *ns, int nid)
1317 int smt, cpu, cpus = 0;
1318 unsigned long capacity;
1320 memset(ns, 0, sizeof(*ns));
1321 for_each_cpu(cpu, cpumask_of_node(nid)) {
1322 struct rq *rq = cpu_rq(cpu);
1324 ns->nr_running += rq->nr_running;
1325 ns->load += weighted_cpuload(cpu);
1326 ns->compute_capacity += capacity_of(cpu);
1332 * If we raced with hotplug and there are no CPUs left in our mask
1333 * the @ns structure is NULL'ed and task_numa_compare() will
1334 * not find this node attractive.
1336 * We'll either bail at !has_free_capacity, or we'll detect a huge
1337 * imbalance and bail there.
1342 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1343 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1344 capacity = cpus / smt; /* cores */
1346 ns->task_capacity = min_t(unsigned, capacity,
1347 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1348 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
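/*
 * Worked example (illustrative only, capacities assumed): a node with 8
 * logical CPUs whose SMT-scaled capacities sum to compute_capacity = 4712
 * gives smt = DIV_ROUND_UP(1024 * 8, 4712) = 2, hence capacity = 8 / 2 = 4
 * cores. task_capacity = min(4, DIV_ROUND_CLOSEST(4712, 1024) = 5) = 4, so
 * the node reports free capacity while fewer than 4 tasks are running.
 */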
1351 struct task_numa_env {
1352 struct task_struct *p;
1354 int src_cpu, src_nid;
1355 int dst_cpu, dst_nid;
1357 struct numa_stats src_stats, dst_stats;
1362 struct task_struct *best_task;
1367 static void task_numa_assign(struct task_numa_env *env,
1368 struct task_struct *p, long imp)
1371 put_task_struct(env->best_task);
1374 env->best_imp = imp;
1375 env->best_cpu = env->dst_cpu;
1378 static bool load_too_imbalanced(long src_load, long dst_load,
1379 struct task_numa_env *env)
1382 long orig_src_load, orig_dst_load;
1383 long src_capacity, dst_capacity;
1386 * The load is corrected for the CPU capacity available on each node.
1389 * src_load / src_capacity    vs    dst_load / dst_capacity
1392 src_capacity = env->src_stats.compute_capacity;
1393 dst_capacity = env->dst_stats.compute_capacity;
1395 /* We care about the slope of the imbalance, not the direction. */
1396 if (dst_load < src_load)
1397 swap(dst_load, src_load);
1399 /* Is the difference below the threshold? */
1400 imb = dst_load * src_capacity * 100 -
1401 src_load * dst_capacity * env->imbalance_pct;
1406 * The imbalance is above the allowed threshold.
1407 * Compare it with the old imbalance.
1409 orig_src_load = env->src_stats.load;
1410 orig_dst_load = env->dst_stats.load;
1412 if (orig_dst_load < orig_src_load)
1413 swap(orig_dst_load, orig_src_load);
1415 old_imb = orig_dst_load * src_capacity * 100 -
1416 orig_src_load * dst_capacity * env->imbalance_pct;
1418 /* Would this change make things worse? */
1419 return (imb > old_imb);
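/*
 * Worked example (illustrative only, equal node capacities assumed): with
 * imbalance_pct = 112, src_load = 1000 and dst_load = 1100 we get
 * imb = 1100 * 100 - 1000 * 112 = -2000 <= 0, so the move is allowed. With
 * dst_load = 1200, imb = 8000 > 0 and the move only proceeds if the
 * pre-existing imbalance (old_imb) was already at least as bad.
 */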
1423 * This checks if the overall compute and NUMA accesses of the system would
1424 * be improved if the source task was migrated to the target dst_cpu, taking
1425 * into account that it might be best if the task running on dst_cpu is
1426 * exchanged with the source task.
1428 static void task_numa_compare(struct task_numa_env *env,
1429 long taskimp, long groupimp)
1431 struct rq *src_rq = cpu_rq(env->src_cpu);
1432 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1433 struct task_struct *cur;
1434 long src_load, dst_load;
1436 long imp = env->p->numa_group ? groupimp : taskimp;
1438 int dist = env->dist;
1439 bool assigned = false;
1443 raw_spin_lock_irq(&dst_rq->lock);
1446 * No need to move the exiting task or idle task.
1448 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1452 * The task_struct must be protected here to protect the
1453 * p->numa_faults access in the task_weight since the
1454 * numa_faults could already be freed in the following path:
1455 * finish_task_switch()
1456 * --> put_task_struct()
1457 * --> __put_task_struct()
1458 * --> task_numa_free()
1460 get_task_struct(cur);
1463 raw_spin_unlock_irq(&dst_rq->lock);
1466 * Because we have preemption enabled we can get migrated around and
1467 * end up trying to select ourselves (current == env->p) as a swap candidate.
1473 * "imp" is the fault differential for the source task between the
1474 * source and destination node. Calculate the total differential for
1475 * the source task and potential destination task. The more negative
1476 * the value is, the more remote accesses would be expected to
1477 * be incurred if the tasks were swapped.
1480 /* Skip this swap candidate if it cannot be moved to the source cpu */
1481 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1485 * If dst and source tasks are in the same NUMA group, or not
1486 * in any group then look only at task weights.
1488 if (cur->numa_group == env->p->numa_group) {
1489 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1490 task_weight(cur, env->dst_nid, dist);
1492 * Add some hysteresis to prevent swapping the
1493 * tasks within a group over tiny differences.
1495 if (cur->numa_group)
1499 * Compare the group weights. If a task is all by
1500 * itself (not part of a group), use the task weight instead.
1503 if (cur->numa_group)
1504 imp += group_weight(cur, env->src_nid, dist) -
1505 group_weight(cur, env->dst_nid, dist);
1507 imp += task_weight(cur, env->src_nid, dist) -
1508 task_weight(cur, env->dst_nid, dist);
1512 if (imp <= env->best_imp && moveimp <= env->best_imp)
1516 /* Is there capacity at our destination? */
1517 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1518 !env->dst_stats.has_free_capacity)
1524 /* Balance doesn't matter much if we're running a task per cpu */
1525 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1526 dst_rq->nr_running == 1)
1530 * In the overloaded case, try and keep the load balanced.
1533 load = task_h_load(env->p);
1534 dst_load = env->dst_stats.load + load;
1535 src_load = env->src_stats.load - load;
1537 if (moveimp > imp && moveimp > env->best_imp) {
1539 * If the improvement from just moving env->p (without swapping) is
1540 * better than swapping tasks around, check if a move is
1541 * possible. Store a slightly smaller score than moveimp,
1542 * so an actually idle CPU will win.
1544 if (!load_too_imbalanced(src_load, dst_load, env)) {
1546 put_task_struct(cur);
1552 if (imp <= env->best_imp)
1556 load = task_h_load(cur);
1561 if (load_too_imbalanced(src_load, dst_load, env))
1565 * One idle CPU per node is evaluated for a task numa move.
1566 * Call select_idle_sibling to maybe find a better one.
1569 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1574 task_numa_assign(env, cur, imp);
1578 * The dst_rq->curr isn't assigned. The protection for task_struct is
1581 if (cur && !assigned)
1582 put_task_struct(cur);
1585 static void task_numa_find_cpu(struct task_numa_env *env,
1586 long taskimp, long groupimp)
1590 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1591 /* Skip this CPU if the source task cannot migrate */
1592 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1596 task_numa_compare(env, taskimp, groupimp);
1600 /* Only move tasks to a NUMA node less busy than the current node. */
1601 static bool numa_has_capacity(struct task_numa_env *env)
1603 struct numa_stats *src = &env->src_stats;
1604 struct numa_stats *dst = &env->dst_stats;
1606 if (src->has_free_capacity && !dst->has_free_capacity)
1610 * Only consider a task move if the source has a higher load
1611 * than the destination, corrected for CPU capacity on each node.
1613 * src->load dst->load
1614 * --------------------- vs ---------------------
1615 * src->compute_capacity dst->compute_capacity
1617 if (src->load * dst->compute_capacity * env->imbalance_pct >
1619 dst->load * src->compute_capacity * 100)
1625 static int task_numa_migrate(struct task_struct *p)
1627 struct task_numa_env env = {
1630 .src_cpu = task_cpu(p),
1631 .src_nid = task_node(p),
1633 .imbalance_pct = 112,
1639 struct sched_domain *sd;
1640 unsigned long taskweight, groupweight;
1642 long taskimp, groupimp;
1645 * Pick the lowest SD_NUMA domain, as that would have the smallest
1646 * imbalance and would be the first to start moving tasks about.
1648 * And we want to avoid any moving of tasks about, as that would create
1649 * random movement of tasks -- countering the numa conditions we're trying to satisfy.
1653 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1655 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1659 * Cpusets can break the scheduler domain tree into smaller
1660 * balance domains, some of which do not cross NUMA boundaries.
1661 * Tasks that are "trapped" in such domains cannot be migrated
1662 * elsewhere, so there is no point in (re)trying.
1664 if (unlikely(!sd)) {
1665 p->numa_preferred_nid = task_node(p);
1669 env.dst_nid = p->numa_preferred_nid;
1670 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1671 taskweight = task_weight(p, env.src_nid, dist);
1672 groupweight = group_weight(p, env.src_nid, dist);
1673 update_numa_stats(&env.src_stats, env.src_nid);
1674 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1675 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1676 update_numa_stats(&env.dst_stats, env.dst_nid);
1678 /* Try to find a spot on the preferred nid. */
1679 if (numa_has_capacity(&env))
1680 task_numa_find_cpu(&env, taskimp, groupimp);
1683 * Look at other nodes in these cases:
1684 * - there is no space available on the preferred_nid
1685 * - the task is part of a numa_group that is interleaved across
1686 * multiple NUMA nodes; in order to better consolidate the group,
1687 * we need to check other locations.
1689 if (env.best_cpu == -1 || (p->numa_group &&
1690 nodes_weight(p->numa_group->active_nodes) > 1)) {
1691 for_each_online_node(nid) {
1692 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1695 dist = node_distance(env.src_nid, env.dst_nid);
1696 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1698 taskweight = task_weight(p, env.src_nid, dist);
1699 groupweight = group_weight(p, env.src_nid, dist);
1702 /* Only consider nodes where both task and groups benefit */
1703 taskimp = task_weight(p, nid, dist) - taskweight;
1704 groupimp = group_weight(p, nid, dist) - groupweight;
1705 if (taskimp < 0 && groupimp < 0)
1710 update_numa_stats(&env.dst_stats, env.dst_nid);
1711 if (numa_has_capacity(&env))
1712 task_numa_find_cpu(&env, taskimp, groupimp);
1717 * If the task is part of a workload that spans multiple NUMA nodes,
1718 * and is migrating into one of the workload's active nodes, remember
1719 * this node as the task's preferred numa node, so the workload can settle down.
1721 * A task that migrated to a second choice node will be better off
1722 * trying for a better one later. Do not set the preferred node here.
1724 if (p->numa_group) {
1725 if (env.best_cpu == -1)
1730 if (node_isset(nid, p->numa_group->active_nodes))
1731 sched_setnuma(p, env.dst_nid);
1734 /* No better CPU than the current one was found. */
1735 if (env.best_cpu == -1)
1739 * Reset the scan period if the task is being rescheduled on an
1740 * alternative node to recheck if the task is now properly placed.
1742 p->numa_scan_period = task_scan_min(p);
1744 if (env.best_task == NULL) {
1745 ret = migrate_task_to(p, env.best_cpu);
1747 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1751 ret = migrate_swap(p, env.best_task);
1753 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1754 put_task_struct(env.best_task);
1758 /* Attempt to migrate a task to a CPU on the preferred node. */
1759 static void numa_migrate_preferred(struct task_struct *p)
1761 unsigned long interval = HZ;
1763 /* This task has no NUMA fault statistics yet */
1764 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1767 /* Periodically retry migrating the task to the preferred node */
1768 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1769 p->numa_migrate_retry = jiffies + interval;
1771 /* Success if task is already running on preferred CPU */
1772 if (task_node(p) == p->numa_preferred_nid)
1775 /* Otherwise, try migrate to a CPU on the preferred node */
1776 task_numa_migrate(p);
1780 * Find the nodes on which the workload is actively running. We do this by
1781 * tracking the nodes from which NUMA hinting faults are triggered. This can
1782 * be different from the set of nodes where the workload's memory is currently located.
1785 * The bitmask is used to make smarter decisions on when to do NUMA page
1786 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1787 * are added when they cause over 6/16 of the maximum number of faults, but
1788 * only removed when they drop below 3/16.
1790 static void update_numa_active_node_mask(struct numa_group *numa_group)
1792 unsigned long faults, max_faults = 0;
1795 for_each_online_node(nid) {
1796 faults = group_faults_cpu(numa_group, nid);
1797 if (faults > max_faults)
1798 max_faults = faults;
1801 for_each_online_node(nid) {
1802 faults = group_faults_cpu(numa_group, nid);
1803 if (!node_isset(nid, numa_group->active_nodes)) {
1804 if (faults > max_faults * 6 / 16)
1805 node_set(nid, numa_group->active_nodes);
1806 } else if (faults < max_faults * 3 / 16)
1807 node_clear(nid, numa_group->active_nodes);
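/*
 * Worked example of the 6/16 and 3/16 thresholds above (illustrative only):
 * if the busiest node shows max_faults = 1600, a node joins active_nodes
 * once its own faults exceed 600 (6/16 of the maximum) and is only dropped
 * again when they fall below 300 (3/16), so nodes hovering around the
 * threshold do not flap in and out of the mask.
 */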
1812 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1813 * increments. The more local the fault statistics are, the higher the scan
1814 * period will be for the next scan window. If local/(local+remote) ratio is
1815 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1816 * the scan period will decrease. Aim for 70% local accesses.
1818 #define NUMA_PERIOD_SLOTS 10
1819 #define NUMA_PERIOD_THRESHOLD 7
1822 * Increase the scan period (slow down scanning) if the majority of
1823 * our memory is already on our local node, or if the majority of
1824 * the page accesses are shared with other processes.
1825 * Otherwise, decrease the scan period.
1827 static void update_task_scan_period(struct task_struct *p,
1828 unsigned long shared, unsigned long private)
1830 unsigned int period_slot;
1834 unsigned long remote = p->numa_faults_locality[0];
1835 unsigned long local = p->numa_faults_locality[1];
1838 * If there were no recorded hinting faults then either the task is
1839 * completely idle or all activity is in areas that are not of interest
1840 * to automatic numa balancing. Related to that, if there were failed
1841 * migrations then it implies we are migrating too quickly or the local
1842 * node is overloaded. In either case, scan slower.
1844 if (local + shared == 0 || p->numa_faults_locality[2]) {
1845 p->numa_scan_period = min(p->numa_scan_period_max,
1846 p->numa_scan_period << 1);
1848 p->mm->numa_next_scan = jiffies +
1849 msecs_to_jiffies(p->numa_scan_period);
1855 * Prepare to scale scan period relative to the current period.
1856 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1857 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1858 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1860 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1861 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1862 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1863 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1866 diff = slot * period_slot;
1868 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1871 * Scale scan rate increases based on sharing. There is an
1872 * inverse relationship between the degree of sharing and
1873 * the adjustment made to the scanning period. Broadly
1874 * speaking the intent is that there is little point
1875 * scanning faster if shared accesses dominate as it may
1876 * simply bounce migrations uselessly
1878 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1879 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1882 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1883 task_scan_min(p), task_scan_max(p));
1884 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
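/*
 * Worked example (illustrative only): assume numa_scan_period = 1000ms, so
 * period_slot = 100ms. With local = 900 and remote = 100 the ratio is 9,
 * which is >= NUMA_PERIOD_THRESHOLD (7), giving slot = 2 and diff = +200ms;
 * if the faults are also overwhelmingly private, the sharing scale leaves
 * diff unchanged and the scan period grows to 1200ms (scan slower), subject
 * to the min/max clamp. A ratio of 5 would instead give
 * diff = -(7 - 5) * 100ms = -200ms (scan faster).
 */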
1888 * Get the fraction of time the task has been running since the last
1889 * NUMA placement cycle. The scheduler keeps similar statistics, but
1890 * decays those on a 32ms period, which is orders of magnitude off
1891 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1892 * stats only if the task is so new there are no NUMA statistics yet.
1894 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1896 u64 runtime, delta, now;
1897 /* Use the start of this time slice to avoid calculations. */
1898 now = p->se.exec_start;
1899 runtime = p->se.sum_exec_runtime;
1901 if (p->last_task_numa_placement) {
1902 delta = runtime - p->last_sum_exec_runtime;
1903 *period = now - p->last_task_numa_placement;
1905 delta = p->se.avg.load_sum / p->se.load.weight;
1906 *period = LOAD_AVG_MAX;
1909 p->last_sum_exec_runtime = runtime;
1910 p->last_task_numa_placement = now;
1916 * Determine the preferred nid for a task in a numa_group. This needs to
1917 * be done in a way that produces consistent results with group_weight,
1918 * otherwise workloads might not converge.
1920 static int preferred_group_nid(struct task_struct *p, int nid)
1925 /* Direct connections between all NUMA nodes. */
1926 if (sched_numa_topology_type == NUMA_DIRECT)
1930 * On a system with glueless mesh NUMA topology, group_weight
1931 * scores nodes according to the number of NUMA hinting faults on
1932 * both the node itself, and on nearby nodes.
1934 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1935 unsigned long score, max_score = 0;
1936 int node, max_node = nid;
1938 dist = sched_max_numa_distance;
1940 for_each_online_node(node) {
1941 score = group_weight(p, node, dist);
1942 if (score > max_score) {
1951 * Finding the preferred nid in a system with NUMA backplane
1952 * interconnect topology is more involved. The goal is to locate
1953 * tasks from numa_groups near each other in the system, and
1954 * untangle workloads from different sides of the system. This requires
1955 * searching down the hierarchy of node groups, recursively searching
1956 * inside the highest scoring group of nodes. The nodemask tricks
1957 * keep the complexity of the search down.
1959 nodes = node_online_map;
1960 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1961 unsigned long max_faults = 0;
1962 nodemask_t max_group = NODE_MASK_NONE;
1965 /* Are there nodes at this distance from each other? */
1966 if (!find_numa_distance(dist))
1969 for_each_node_mask(a, nodes) {
1970 unsigned long faults = 0;
1971 nodemask_t this_group;
1972 nodes_clear(this_group);
1974 /* Sum group's NUMA faults; includes a==b case. */
1975 for_each_node_mask(b, nodes) {
1976 if (node_distance(a, b) < dist) {
1977 faults += group_faults(p, b);
1978 node_set(b, this_group);
1979 node_clear(b, nodes);
1983 /* Remember the top group. */
1984 if (faults > max_faults) {
1985 max_faults = faults;
1986 max_group = this_group;
1988 * subtle: at the smallest distance there is
1989 * just one node left in each "group", the
1990 * winner is the preferred nid.
1995 /* Next round, evaluate the nodes within max_group. */
2003 static void task_numa_placement(struct task_struct *p)
2005 int seq, nid, max_nid = -1, max_group_nid = -1;
2006 unsigned long max_faults = 0, max_group_faults = 0;
2007 unsigned long fault_types[2] = { 0, 0 };
2008 unsigned long total_faults;
2009 u64 runtime, period;
2010 spinlock_t *group_lock = NULL;
2013 * The p->mm->numa_scan_seq field gets updated without
2014 * exclusive access. Use READ_ONCE() here to ensure
2015 * that the field is read in a single access:
2017 seq = READ_ONCE(p->mm->numa_scan_seq);
2018 if (p->numa_scan_seq == seq)
2020 p->numa_scan_seq = seq;
2021 p->numa_scan_period_max = task_scan_max(p);
2023 total_faults = p->numa_faults_locality[0] +
2024 p->numa_faults_locality[1];
2025 runtime = numa_get_avg_runtime(p, &period);
2027 /* If the task is part of a group prevent parallel updates to group stats */
2028 if (p->numa_group) {
2029 group_lock = &p->numa_group->lock;
2030 spin_lock_irq(group_lock);
2033 /* Find the node with the highest number of faults */
2034 for_each_online_node(nid) {
2035 /* Keep track of the offsets in numa_faults array */
2036 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2037 unsigned long faults = 0, group_faults = 0;
2040 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2041 long diff, f_diff, f_weight;
2043 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2044 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2045 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2046 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2048 /* Decay existing window, copy faults since last scan */
2049 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2050 fault_types[priv] += p->numa_faults[membuf_idx];
2051 p->numa_faults[membuf_idx] = 0;
2054 * Normalize the faults_from, so all tasks in a group
2055 * count according to CPU use, instead of by the raw
2056 * number of faults. Tasks with little runtime have
2057 * little overall impact on throughput, and thus their
2058 * faults are less important.
2060 f_weight = div64_u64(runtime << 16, period + 1);
2061 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2063 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2064 p->numa_faults[cpubuf_idx] = 0;
2066 p->numa_faults[mem_idx] += diff;
2067 p->numa_faults[cpu_idx] += f_diff;
2068 faults += p->numa_faults[mem_idx];
2069 p->total_numa_faults += diff;
2070 if (p->numa_group) {
2072 * safe because we can only change our own group
2074 * mem_idx represents the offset for a given
2075 * nid and priv in a specific region because it
2076 * is at the beginning of the numa_faults array.
2078 p->numa_group->faults[mem_idx] += diff;
2079 p->numa_group->faults_cpu[mem_idx] += f_diff;
2080 p->numa_group->total_faults += diff;
2081 group_faults += p->numa_group->faults[mem_idx];
2085 if (faults > max_faults) {
2086 max_faults = faults;
2090 if (group_faults > max_group_faults) {
2091 max_group_faults = group_faults;
2092 max_group_nid = nid;
2096 update_task_scan_period(p, fault_types[0], fault_types[1]);
2098 if (p->numa_group) {
2099 update_numa_active_node_mask(p->numa_group);
2100 spin_unlock_irq(group_lock);
2101 max_nid = preferred_group_nid(p, max_group_nid);
2105 /* Set the new preferred node */
2106 if (max_nid != p->numa_preferred_nid)
2107 sched_setnuma(p, max_nid);
2109 if (task_node(p) != p->numa_preferred_nid)
2110 numa_migrate_preferred(p);
2114 static inline int get_numa_group(struct numa_group *grp)
2116 return atomic_inc_not_zero(&grp->refcount);
2119 static inline void put_numa_group(struct numa_group *grp)
2121 if (atomic_dec_and_test(&grp->refcount))
2122 kfree_rcu(grp, rcu);
2125 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2128 struct numa_group *grp, *my_grp;
2129 struct task_struct *tsk;
2131 int cpu = cpupid_to_cpu(cpupid);
2134 if (unlikely(!p->numa_group)) {
2135 unsigned int size = sizeof(struct numa_group) +
2136 4*nr_node_ids*sizeof(unsigned long);
2138 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2142 atomic_set(&grp->refcount, 1);
2143 spin_lock_init(&grp->lock);
2145 /* Second half of the array tracks nids where faults happen */
2146 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2149 node_set(task_node(current), grp->active_nodes);
2151 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2152 grp->faults[i] = p->numa_faults[i];
2154 grp->total_faults = p->total_numa_faults;
2157 rcu_assign_pointer(p->numa_group, grp);
2161 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2163 if (!cpupid_match_pid(tsk, cpupid))
2166 grp = rcu_dereference(tsk->numa_group);
2170 my_grp = p->numa_group;
2175 * Only join the other group if it's bigger; if we're the bigger group,
2176 * the other task will join us.
2178 if (my_grp->nr_tasks > grp->nr_tasks)
2182 * Tie-break on the grp address.
2184 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2187 /* Always join threads in the same process. */
2188 if (tsk->mm == current->mm)
2191 /* Simple filter to avoid false positives due to PID collisions */
2192 if (flags & TNF_SHARED)
2195 /* Update priv based on whether false sharing was detected */
2198 if (join && !get_numa_group(grp))
2206 BUG_ON(irqs_disabled());
2207 double_lock_irq(&my_grp->lock, &grp->lock);
2209 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2210 my_grp->faults[i] -= p->numa_faults[i];
2211 grp->faults[i] += p->numa_faults[i];
2213 my_grp->total_faults -= p->total_numa_faults;
2214 grp->total_faults += p->total_numa_faults;
2219 spin_unlock(&my_grp->lock);
2220 spin_unlock_irq(&grp->lock);
2222 rcu_assign_pointer(p->numa_group, grp);
2224 put_numa_group(my_grp);
2232 void task_numa_free(struct task_struct *p)
2234 struct numa_group *grp = p->numa_group;
2235 void *numa_faults = p->numa_faults;
2236 unsigned long flags;
2240 spin_lock_irqsave(&grp->lock, flags);
2241 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2242 grp->faults[i] -= p->numa_faults[i];
2243 grp->total_faults -= p->total_numa_faults;
2246 spin_unlock_irqrestore(&grp->lock, flags);
2247 RCU_INIT_POINTER(p->numa_group, NULL);
2248 put_numa_group(grp);
2251 p->numa_faults = NULL;
2256 * Got a PROT_NONE fault for a page on @node.
2258 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2260 struct task_struct *p = current;
2261 bool migrated = flags & TNF_MIGRATED;
2262 int cpu_node = task_node(current);
2263 int local = !!(flags & TNF_FAULT_LOCAL);
2266 if (!static_branch_likely(&sched_numa_balancing))
2269 /* for example, ksmd faulting in a user's mm */
2273 /* Allocate buffer to track faults on a per-node basis */
2274 if (unlikely(!p->numa_faults)) {
2275 int size = sizeof(*p->numa_faults) *
2276 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2278 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2279 if (!p->numa_faults)
2282 p->total_numa_faults = 0;
2283 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2287 * First accesses are treated as private, otherwise consider accesses
2288 * to be private if the accessing pid has not changed
2290 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2293 priv = cpupid_match_pid(p, last_cpupid);
2294 if (!priv && !(flags & TNF_NO_GROUP))
2295 task_numa_group(p, last_cpupid, flags, &priv);
2299 * If a workload spans multiple NUMA nodes, a shared fault that
2300 * occurs wholly within the set of nodes that the workload is
2301 * actively using should be counted as local. This allows the
2302 * scan rate to slow down when a workload has settled down.
2304 if (!priv && !local && p->numa_group &&
2305 node_isset(cpu_node, p->numa_group->active_nodes) &&
2306 node_isset(mem_node, p->numa_group->active_nodes))
2309 task_numa_placement(p);
2312 * Retry task to preferred node migration periodically, in case it
2313 * previously failed, or the scheduler moved us.
2315 if (time_after(jiffies, p->numa_migrate_retry))
2316 numa_migrate_preferred(p);
2319 p->numa_pages_migrated += pages;
2320 if (flags & TNF_MIGRATE_FAIL)
2321 p->numa_faults_locality[2] += pages;
2323 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2324 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2325 p->numa_faults_locality[local] += pages;
2328 static void reset_ptenuma_scan(struct task_struct *p)
2331 * We only did a read acquisition of the mmap sem, so
2332 * p->mm->numa_scan_seq is written to without exclusive access
2333 * and the update is not guaranteed to be atomic. That's not
2334 * much of an issue though, since this is just used for
2335 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2336 * expensive, to avoid any form of compiler optimizations:
2338 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2339 p->mm->numa_scan_offset = 0;
2343 * The expensive part of numa migration is done from task_work context.
2344 * Triggered from task_tick_numa().
2346 void task_numa_work(struct callback_head *work)
2348 unsigned long migrate, next_scan, now = jiffies;
2349 struct task_struct *p = current;
2350 struct mm_struct *mm = p->mm;
2351 struct vm_area_struct *vma;
2352 unsigned long start, end;
2353 unsigned long nr_pte_updates = 0;
2354 long pages, virtpages;
2356 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2358 work->next = work; /* protect against double add */
2360 * Who cares about NUMA placement when they're dying.
2362 * NOTE: make sure not to dereference p->mm before this check,
2363 * exit_task_work() happens _after_ exit_mm() so we could be called
2364 * without p->mm even though we still had it when we enqueued this work.
2367 if (p->flags & PF_EXITING)
2370 if (!mm->numa_next_scan) {
2371 mm->numa_next_scan = now +
2372 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2376 * Enforce maximal scan/migration frequency..
2378 migrate = mm->numa_next_scan;
2379 if (time_before(now, migrate))
2382 if (p->numa_scan_period == 0) {
2383 p->numa_scan_period_max = task_scan_max(p);
2384 p->numa_scan_period = task_scan_min(p);
2387 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2388 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2392 * Delay this task enough that another task of this mm will likely win
2393 * the next time around.
2395 p->node_stamp += 2 * TICK_NSEC;
2397 start = mm->numa_scan_offset;
2398 pages = sysctl_numa_balancing_scan_size;
2399 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2400 virtpages = pages * 8; /* Scan up to this much virtual space */
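/*
 * E.g., assuming the default 256MB scan size and 4K pages:
 * pages = 256 << 8 = 65536 and virtpages = 524288, i.e. up to 2GB of
 * virtual address space may be walked in one scan pass.
 */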
2405 if (!down_read_trylock(&mm->mmap_sem))
2407 vma = find_vma(mm, start);
2409 reset_ptenuma_scan(p);
2413 for (; vma; vma = vma->vm_next) {
2414 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2415 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2420 * Shared library pages mapped by multiple processes are not
2421 * migrated as it is expected they are cache replicated. Avoid
2422 * hinting faults in read-only file-backed mappings or the vdso
2423 * as migrating the pages will be of marginal benefit.
2426 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2430 * Skip inaccessible VMAs to avoid any confusion between
2431 * PROT_NONE and NUMA hinting ptes
2433 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2437 start = max(start, vma->vm_start);
2438 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2439 end = min(end, vma->vm_end);
2440 nr_pte_updates = change_prot_numa(vma, start, end);
2443 * Try to scan sysctl_numa_balancing_scan_size worth of
2444 * hpages that have at least one present PTE that
2445 * is not already pte-numa. If the VMA contains
2446 * areas that are unused or already full of prot_numa
2447 * PTEs, scan up to virtpages, to skip through those areas faster.
2451 pages -= (end - start) >> PAGE_SHIFT;
2452 virtpages -= (end - start) >> PAGE_SHIFT;
2455 if (pages <= 0 || virtpages <= 0)
2459 } while (end != vma->vm_end);
2464 * It is possible to reach the end of the VMA list but the last few
2465 * VMAs are not guaranteed to be migratable. If they are not, we
2466 * would find the !migratable VMA on the next scan but not reset the
2467 * scanner to the start so check it now.
2470 mm->numa_scan_offset = start;
2472 reset_ptenuma_scan(p);
2473 up_read(&mm->mmap_sem);
2477 * Drive the periodic memory faults..
2479 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2481 struct callback_head *work = &curr->numa_work;
2485 * We don't care about NUMA placement if we don't have memory.
2487 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2491 * Using runtime rather than walltime has the dual advantage that
2492 * we (mostly) drive the selection from busy threads and that the
2493 * task needs to have done some actual work before we bother with NUMA placement.
2496 now = curr->se.sum_exec_runtime;
2497 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2499 if (now > curr->node_stamp + period) {
2500 if (!curr->node_stamp)
2501 curr->numa_scan_period = task_scan_min(curr);
2502 curr->node_stamp += period;
2504 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2505 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2506 task_work_add(curr, work, true);
2511 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2515 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2519 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2522 #endif /* CONFIG_NUMA_BALANCING */
2525 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2527 update_load_add(&cfs_rq->load, se->load.weight);
2528 if (!parent_entity(se))
2529 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2531 if (entity_is_task(se)) {
2532 struct rq *rq = rq_of(cfs_rq);
2534 account_numa_enqueue(rq, task_of(se));
2535 list_add(&se->group_node, &rq->cfs_tasks);
2538 cfs_rq->nr_running++;
2542 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2544 update_load_sub(&cfs_rq->load, se->load.weight);
2545 if (!parent_entity(se))
2546 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2547 if (entity_is_task(se)) {
2548 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2549 list_del_init(&se->group_node);
2551 cfs_rq->nr_running--;
2554 #ifdef CONFIG_FAIR_GROUP_SCHED
2556 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2558 long tg_weight, load, shares;
2561 * This really should be: cfs_rq->avg.load_avg, but instead we use
2562 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2563 * the shares for small weight interactive tasks.
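/*
 * Rough example of the computation below: if this cfs_rq carries half of
 * the group's total weight, shares ends up at about tg->shares / 2,
 * clamped to the [MIN_SHARES, tg->shares] range.
 */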
2565 load = scale_load_down(cfs_rq->load.weight);
2567 tg_weight = atomic_long_read(&tg->load_avg);
2569 /* Ensure tg_weight >= load */
2570 tg_weight -= cfs_rq->tg_load_avg_contrib;
2573 shares = (tg->shares * load);
2575 shares /= tg_weight;
2577 if (shares < MIN_SHARES)
2578 shares = MIN_SHARES;
2579 if (shares > tg->shares)
2580 shares = tg->shares;
2584 # else /* CONFIG_SMP */
2585 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2589 # endif /* CONFIG_SMP */
2591 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2592 unsigned long weight)
2595 /* commit outstanding execution time */
2596 if (cfs_rq->curr == se)
2597 update_curr(cfs_rq);
2598 account_entity_dequeue(cfs_rq, se);
2601 update_load_set(&se->load, weight);
2604 account_entity_enqueue(cfs_rq, se);
2607 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2609 static void update_cfs_shares(struct sched_entity *se)
2611 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2612 struct task_group *tg;
2618 if (throttled_hierarchy(cfs_rq))
2624 if (likely(se->load.weight == tg->shares))
2627 shares = calc_cfs_shares(cfs_rq, tg);
2629 reweight_entity(cfs_rq_of(se), se, shares);
2632 #else /* CONFIG_FAIR_GROUP_SCHED */
2633 static inline void update_cfs_shares(struct sched_entity *se)
2636 #endif /* CONFIG_FAIR_GROUP_SCHED */
2639 u32 sched_get_wake_up_idle(struct task_struct *p)
2641 u32 enabled = p->flags & PF_WAKE_UP_IDLE;
2645 EXPORT_SYMBOL(sched_get_wake_up_idle);
2647 int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
2649 int enable = !!wake_up_idle;
2652 p->flags |= PF_WAKE_UP_IDLE;
2654 p->flags &= ~PF_WAKE_UP_IDLE;
2658 EXPORT_SYMBOL(sched_set_wake_up_idle);
2660 static const u32 runnable_avg_yN_inv[] = {
2661 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2662 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2663 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2664 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2665 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2666 0x85aac367, 0x82cd8698,
2670 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2671 * over-estimates when re-combining.
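/*
 * For reference: y = 0.5^(1/32) ~= 0.97857, so runnable_avg_yN_inv[1]
 * (0xfa83b2da) is approximately y * 2^32 and runnable_avg_yN_sum[1]
 * (1002) is approximately 1024 * y.
 */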
2673 static const u32 runnable_avg_yN_sum[] = {
2674 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2675 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2676 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2681 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2683 static __always_inline u64 decay_load(u64 val, u64 n)
2685 unsigned int local_n;
2689 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2692 /* after bounds checking we can collapse to 32-bit */
2696 * As y^PERIOD = 1/2, we can combine
2697 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2698 * With a look-up table which covers y^n (n<PERIOD)
2700 * To achieve constant time decay_load.
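/*
 * Worked example with LOAD_AVG_PERIOD = 32: for n = 100, val is shifted
 * right by 100/32 = 3 (halved three times) and then multiplied by
 * runnable_avg_yN_inv[100 % 32 = 4] via mul_u64_u32_shr().
 */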
2702 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2703 val >>= local_n / LOAD_AVG_PERIOD;
2704 local_n %= LOAD_AVG_PERIOD;
2707 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2712 * For updates fully spanning n periods, the contribution to runnable
2713 * average will be: \Sum 1024*y^n
2715 * We can compute this reasonably efficiently by combining:
2716 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
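/*
 * Sketch of the flow below for n = 40 (with LOAD_AVG_PERIOD = 32): one
 * loop iteration leaves contrib = runnable_avg_yN_sum[32] and n = 8; the
 * result is then decay_load(contrib, 8) + runnable_avg_yN_sum[8].
 */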
2718 static u32 __compute_runnable_contrib(u64 n)
2722 if (likely(n <= LOAD_AVG_PERIOD))
2723 return runnable_avg_yN_sum[n];
2724 else if (unlikely(n >= LOAD_AVG_MAX_N))
2725 return LOAD_AVG_MAX;
2727 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2729 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2730 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2732 n -= LOAD_AVG_PERIOD;
2733 } while (n > LOAD_AVG_PERIOD);
2735 contrib = decay_load(contrib, n);
2736 return contrib + runnable_avg_yN_sum[n];
2739 #ifdef CONFIG_SCHED_HMP
2741 /* CPU selection flag */
2742 #define SBC_FLAG_PREV_CPU 0x1
2743 #define SBC_FLAG_BEST_CAP_CPU 0x2
2744 #define SBC_FLAG_CPU_COST 0x4
2745 #define SBC_FLAG_MIN_COST 0x8
2746 #define SBC_FLAG_IDLE_LEAST_LOADED 0x10
2747 #define SBC_FLAG_IDLE_CSTATE 0x20
2748 #define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
2749 #define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
2750 #define SBC_FLAG_CSTATE_LOAD 0x100
2751 #define SBC_FLAG_BEST_SIBLING 0x200
2752 #define SBC_FLAG_WAKER_CPU 0x400
2753 #define SBC_FLAG_PACK_TASK 0x800
2755 /* Cluster selection flag */
2756 #define SBC_FLAG_COLOC_CLUSTER 0x10000
2757 #define SBC_FLAG_WAKER_CLUSTER 0x20000
2758 #define SBC_FLAG_BACKUP_CLUSTER 0x40000
2759 #define SBC_FLAG_BOOST_CLUSTER 0x80000
2761 struct cpu_select_env {
2762 struct task_struct *p;
2763 struct related_thread_group *rtg;
2766 u8 need_waker_cluster:1;
2768 enum sched_boost_policy boost_policy;
2771 DECLARE_BITMAP(candidate_list, NR_CPUS);
2772 DECLARE_BITMAP(backup_list, NR_CPUS);
2776 u32 sbc_best_cluster_flag;
2777 struct cpumask search_cpus;
2780 struct cluster_cpu_stats {
2781 int best_idle_cpu, least_loaded_cpu;
2782 int best_capacity_cpu, best_cpu, best_sibling_cpu;
2783 int min_cost, best_sibling_cpu_cost;
2784 int best_cpu_wakeup_latency;
2785 u64 min_load, best_load, best_sibling_cpu_load;
2786 s64 highest_spare_capacity;
2790 * Should the task be woken to any available idle cpu?
2792 * Waking tasks to an idle cpu has mixed implications for both performance and
2793 * power. In many cases, the scheduler can't correctly estimate the impact of using
2794 * idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an external kernel
2795 * module to pass a strong hint to the scheduler that the task in question should be
2796 * woken to an idle cpu, generally to improve performance.
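/*
 * Such hints are typically set with sched_set_wake_up_idle() above, which
 * toggles PF_WAKE_UP_IDLE on the task.
 */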
2798 static inline int wake_to_idle(struct task_struct *p)
2800 return (current->flags & PF_WAKE_UP_IDLE) ||
2801 (p->flags & PF_WAKE_UP_IDLE);
2804 static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
2808 total_load = env->task_load + env->cpu_load;
2810 if (total_load > sched_spill_load ||
2811 (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
2817 static int skip_cpu(int cpu, struct cpu_select_env *env)
2819 int tcpu = task_cpu(env->p);
2825 if (is_reserved(cpu))
2828 switch (env->reason) {
2830 skip = !idle_cpu(cpu);
2832 case IRQLOAD_MIGRATION:
2833 /* Purposely fall through */
2835 skip = (cpu == tcpu);
2843 acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
2850 tcpu = task_cpu(env->p);
2851 switch (env->reason) {
2853 return cluster->capacity > cpu_capacity(tcpu);
2855 case DOWN_MIGRATION:
2856 return cluster->capacity < cpu_capacity(tcpu);
2866 skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
2868 if (!test_bit(cluster->id, env->candidate_list))
2871 if (!acceptable_capacity(cluster, env)) {
2872 __clear_bit(cluster->id, env->candidate_list);
2879 static struct sched_cluster *
2880 select_least_power_cluster(struct cpu_select_env *env)
2882 struct sched_cluster *cluster;
2885 int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
2887 env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
2889 if (task_load_will_fit(env->p, env->task_load,
2890 cpu, env->boost_policy)) {
2891 env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
2893 if (env->boost_policy == SCHED_BOOST_NONE)
2894 return env->rtg->preferred_cluster;
2896 for_each_sched_cluster(cluster) {
2897 if (cluster != env->rtg->preferred_cluster) {
2898 __set_bit(cluster->id,
2900 __clear_bit(cluster->id,
2901 env->candidate_list);
2905 return env->rtg->preferred_cluster;
2909 * Since the task load does not fit on the preferred
2910 * cluster anymore, pretend that the task does not
2911 * have any preferred cluster. This allows the waking
2912 * task to get the appropriate CPU it needs as per the
2913 * non co-location placement policy without having to
2914 * wait until the preferred cluster is updated.
2919 for_each_sched_cluster(cluster) {
2920 if (!skip_cluster(cluster, env)) {
2921 int cpu = cluster_first_cpu(cluster);
2923 env->task_load = scale_load_to_cpu(task_load(env->p),
2925 if (task_load_will_fit(env->p, env->task_load, cpu,
2929 __set_bit(cluster->id, env->backup_list);
2930 __clear_bit(cluster->id, env->candidate_list);
2937 static struct sched_cluster *
2938 next_candidate(const unsigned long *list, int start, int end)
2942 cluster_id = find_next_bit(list, end, start - 1 + 1);
2943 if (cluster_id >= end)
2946 return sched_cluster[cluster_id];
2950 update_spare_capacity(struct cluster_cpu_stats *stats,
2951 struct cpu_select_env *env, int cpu, int capacity,
2954 s64 spare_capacity = sched_ravg_window - cpu_load;
2956 if (spare_capacity > 0 &&
2957 (spare_capacity > stats->highest_spare_capacity ||
2958 (spare_capacity == stats->highest_spare_capacity &&
2959 ((!env->need_waker_cluster &&
2960 capacity > cpu_capacity(stats->best_capacity_cpu)) ||
2961 (env->need_waker_cluster &&
2962 cpu_rq(cpu)->nr_running <
2963 cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
2965 * If the sync waker is the only runnable task on the CPU, the CPU's
2966 * cr_avg is 0, so there is a high chance of placing the wakee on the
2967 * waker's CPU, which would likely cause preemption of the waker.
2968 * That can lead to migration of the preempted waker. Place the
2969 * wakee on a genuinely idle CPU when possible by checking
2970 * nr_running to avoid such preemption.
2972 stats->highest_spare_capacity = spare_capacity;
2973 stats->best_capacity_cpu = cpu;
2977 static inline void find_backup_cluster(
2978 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
2980 struct sched_cluster *next = NULL;
2982 struct cpumask search_cpus;
2984 while (!bitmap_empty(env->backup_list, num_clusters)) {
2985 next = next_candidate(env->backup_list, 0, num_clusters);
2986 __clear_bit(next->id, env->backup_list);
2988 cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
2989 for_each_cpu(i, &search_cpus) {
2990 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
2991 sched_irqload(i), power_cost(i, task_load(env->p) +
2992 cpu_cravg_sync(i, env->sync)), 0);
2994 update_spare_capacity(stats, env, i, next->capacity,
2995 cpu_load_sync(i, env->sync));
2997 env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
3001 struct sched_cluster *
3002 next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
3003 struct cluster_cpu_stats *stats)
3005 struct sched_cluster *next = NULL;
3007 __clear_bit(cluster->id, env->candidate_list);
3009 if (env->rtg && preferred_cluster(cluster, env->p))
3013 if (bitmap_empty(env->candidate_list, num_clusters))
3016 next = next_candidate(env->candidate_list, 0, num_clusters);
3018 if (next->min_power_cost > stats->min_cost) {
3019 clear_bit(next->id, env->candidate_list);
3024 if (skip_cluster(next, env))
3029 env->task_load = scale_load_to_cpu(task_load(env->p),
3030 cluster_first_cpu(next));
3034 #ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
3035 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3036 struct cpu_select_env *env, int cpu_cost)
3039 int prev_cpu = env->prev_cpu;
3041 wakeup_latency = cpu_rq(cpu)->wakeup_latency;
3043 if (env->need_idle) {
3044 stats->min_cost = cpu_cost;
3045 if (idle_cpu(cpu)) {
3046 if (wakeup_latency < stats->best_cpu_wakeup_latency ||
3047 (wakeup_latency == stats->best_cpu_wakeup_latency &&
3049 stats->best_idle_cpu = cpu;
3050 stats->best_cpu_wakeup_latency = wakeup_latency;
3053 if (env->cpu_load < stats->min_load ||
3054 (env->cpu_load == stats->min_load &&
3056 stats->least_loaded_cpu = cpu;
3057 stats->min_load = env->cpu_load;
3064 if (cpu_cost < stats->min_cost) {
3065 stats->min_cost = cpu_cost;
3066 stats->best_cpu_wakeup_latency = wakeup_latency;
3067 stats->best_load = env->cpu_load;
3068 stats->best_cpu = cpu;
3069 env->sbc_best_flag = SBC_FLAG_CPU_COST;
3073 /* CPU cost is the same. Start breaking the tie by C-state */
3075 if (wakeup_latency > stats->best_cpu_wakeup_latency)
3078 if (wakeup_latency < stats->best_cpu_wakeup_latency) {
3079 stats->best_cpu_wakeup_latency = wakeup_latency;
3080 stats->best_load = env->cpu_load;
3081 stats->best_cpu = cpu;
3082 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
3086 /* C-state is the same. Use prev CPU to break the tie */
3087 if (cpu == prev_cpu) {
3088 stats->best_cpu = cpu;
3089 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
3093 if (stats->best_cpu != prev_cpu &&
3094 ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
3095 (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
3096 stats->best_load = env->cpu_load;
3097 stats->best_cpu = cpu;
3098 env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
3101 #else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3102 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3103 struct cpu_select_env *env, int cpu_cost)
3105 int prev_cpu = env->prev_cpu;
3107 if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
3108 if (stats->best_sibling_cpu_cost > cpu_cost ||
3109 (stats->best_sibling_cpu_cost == cpu_cost &&
3110 stats->best_sibling_cpu_load > env->cpu_load)) {
3111 stats->best_sibling_cpu_cost = cpu_cost;
3112 stats->best_sibling_cpu_load = env->cpu_load;
3113 stats->best_sibling_cpu = cpu;
3117 if ((cpu_cost < stats->min_cost) ||
3118 ((stats->best_cpu != prev_cpu &&
3119 stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
3120 if (env->need_idle) {
3121 if (idle_cpu(cpu)) {
3122 stats->min_cost = cpu_cost;
3123 stats->best_idle_cpu = cpu;
3126 stats->min_cost = cpu_cost;
3127 stats->min_load = env->cpu_load;
3128 stats->best_cpu = cpu;
3129 env->sbc_best_flag = SBC_FLAG_MIN_COST;
3133 #endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3135 static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3136 struct cpu_select_env *env)
3141 * We try to find the least loaded *busy* CPU irrespective
3142 * of the power cost.
3145 cpu_cost = cpu_min_power_cost(cpu);
3148 cpu_cost = power_cost(cpu, task_load(env->p) +
3149 cpu_cravg_sync(cpu, env->sync));
3151 if (cpu_cost <= stats->min_cost)
3152 __update_cluster_stats(cpu, stats, env, cpu_cost);
3155 static void find_best_cpu_in_cluster(struct sched_cluster *c,
3156 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3159 struct cpumask search_cpus;
3161 cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
3163 env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
3165 for_each_cpu(i, &search_cpus) {
3166 env->cpu_load = cpu_load_sync(i, env->sync);
3168 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3170 power_cost(i, task_load(env->p) +
3171 cpu_cravg_sync(i, env->sync)), 0);
3173 if (skip_cpu(i, env))
3176 update_spare_capacity(stats, env, i, c->capacity,
3180 * need_idle takes precedence over sched boost, but when both
3181 * are set, the idlest CPU within all the clusters is selected
3182 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the
3183 * big cluster is selected when boost_policy = BOOST_ON_BIG.
3185 if ((!env->need_idle &&
3186 env->boost_policy != SCHED_BOOST_NONE) ||
3187 env->need_waker_cluster ||
3188 sched_cpu_high_irqload(i) ||
3189 spill_threshold_crossed(env, cpu_rq(i)))
3192 update_cluster_stats(i, stats, env);
3196 static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
3198 stats->best_cpu = stats->best_idle_cpu = -1;
3199 stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
3200 stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
3201 stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
3202 stats->highest_spare_capacity = 0;
3203 stats->least_loaded_cpu = -1;
3204 stats->best_cpu_wakeup_latency = INT_MAX;
3205 /* No need to initialize stats->best_load */
3208 static inline bool env_has_special_flags(struct cpu_select_env *env)
3210 if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
3218 bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3221 struct task_struct *task = env->p;
3222 struct sched_cluster *cluster;
3224 if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
3227 prev_cpu = env->prev_cpu;
3228 if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
3231 if (task->ravg.mark_start - task->last_cpu_selected_ts >=
3232 sched_long_cpu_selection_threshold)
3236 * This function should be used by the task wakeup path only, as it
3237 * assumes p->last_switch_out_ts to be the last sleep time.
3238 * p->last_switch_out_ts can denote the last preemption time as well as the last sleep time.
3241 if (task->ravg.mark_start - task->last_switch_out_ts >=
3242 sched_short_sleep_task_threshold)
3245 env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
3246 cluster = cpu_rq(prev_cpu)->cluster;
3248 if (!task_load_will_fit(task, env->task_load, prev_cpu,
3249 sched_boost_policy())) {
3251 __set_bit(cluster->id, env->backup_list);
3252 __clear_bit(cluster->id, env->candidate_list);
3256 env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
3257 if (sched_cpu_high_irqload(prev_cpu) ||
3258 spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
3259 update_spare_capacity(stats, env, prev_cpu,
3260 cluster->capacity, env->cpu_load);
3261 cpumask_clear_cpu(prev_cpu, &env->search_cpus);
3269 wake_to_waker_cluster(struct cpu_select_env *env)
3272 task_load(current) > sched_big_waker_task_load &&
3273 task_load(env->p) < sched_small_wakee_task_load;
3277 bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
3279 return sysctl_sched_prefer_sync_wakee_to_waker &&
3280 cpu_rq(cpu)->nr_running == 1 &&
3281 cpumask_test_cpu(cpu, &env->search_cpus);
3285 cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
3287 return cpumask_intersects(&env->search_cpus, &cluster->cpus);
3290 /* return cheapest cpu that can fit this task */
3291 static int select_best_cpu(struct task_struct *p, int target, int reason,
3294 struct sched_cluster *cluster, *pref_cluster = NULL;
3295 struct cluster_cpu_stats stats;
3296 struct related_thread_group *grp;
3297 unsigned int sbc_flag = 0;
3298 int cpu = raw_smp_processor_id();
3301 struct cpu_select_env env = {
3304 .need_idle = wake_to_idle(p),
3305 .need_waker_cluster = 0,
3310 .sbc_best_cluster_flag = 0,
3314 env.boost_policy = task_sched_boost(p) ?
3315 sched_boost_policy() : SCHED_BOOST_NONE;
3317 bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
3318 bitmap_zero(env.backup_list, NR_CPUS);
3320 cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
3321 cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
3323 init_cluster_cpu_stats(&stats);
3324 special = env_has_special_flags(&env);
3328 grp = task_related_thread_group(p);
3330 if (grp && grp->preferred_cluster) {
3331 pref_cluster = grp->preferred_cluster;
3332 if (!cluster_allowed(&env, pref_cluster))
3333 clear_bit(pref_cluster->id, env.candidate_list);
3336 } else if (!special) {
3337 cluster = cpu_rq(cpu)->cluster;
3338 if (wake_to_waker_cluster(&env)) {
3339 if (bias_to_waker_cpu(&env, cpu)) {
3341 sbc_flag = SBC_FLAG_WAKER_CLUSTER |
3344 } else if (cluster_allowed(&env, cluster)) {
3345 env.need_waker_cluster = 1;
3346 bitmap_zero(env.candidate_list, NR_CPUS);
3347 __set_bit(cluster->id, env.candidate_list);
3348 env.sbc_best_cluster_flag =
3349 SBC_FLAG_WAKER_CLUSTER;
3351 } else if (bias_to_prev_cpu(&env, &stats)) {
3352 sbc_flag = SBC_FLAG_PREV_CPU;
3357 if (!special && is_short_burst_task(p)) {
3358 env.pack_task = true;
3359 sbc_flag = SBC_FLAG_PACK_TASK;
3362 cluster = select_least_power_cluster(&env);
3368 * 'cluster' now points to the minimum power cluster which can satisfy
3369 * task's perf goals. Walk down the cluster list starting with that
3370 * cluster. For non-small tasks, skip clusters that don't have
3371 * mostly_idle/idle cpus
3375 find_best_cpu_in_cluster(cluster, &env, &stats);
3377 } while ((cluster = next_best_cluster(cluster, &env, &stats)));
3379 if (env.need_idle) {
3380 if (stats.best_idle_cpu >= 0) {
3381 target = stats.best_idle_cpu;
3382 sbc_flag |= SBC_FLAG_IDLE_CSTATE;
3383 } else if (stats.least_loaded_cpu >= 0) {
3384 target = stats.least_loaded_cpu;
3385 sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
3387 } else if (stats.best_cpu >= 0) {
3388 if (stats.best_sibling_cpu >= 0 &&
3389 stats.best_cpu != task_cpu(p) &&
3390 stats.min_cost == stats.best_sibling_cpu_cost) {
3391 stats.best_cpu = stats.best_sibling_cpu;
3392 sbc_flag |= SBC_FLAG_BEST_SIBLING;
3394 sbc_flag |= env.sbc_best_flag;
3395 target = stats.best_cpu;
3397 if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
3403 * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
3404 * backup_list = little cluster, candidate_list = none and
3405 * stats->best_capacity_cpu points the best spare capacity
3406 * CPU among the CPUs in the big cluster.
3408 if (env.boost_policy == SCHED_BOOST_ON_BIG &&
3409 stats.best_capacity_cpu >= 0)
3410 sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
3412 find_backup_cluster(&env, &stats);
3414 if (stats.best_capacity_cpu >= 0) {
3415 target = stats.best_capacity_cpu;
3416 sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
3419 p->last_cpu_selected_ts = sched_ktime_clock();
3421 sbc_flag |= env.sbc_best_cluster_flag;
3423 trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
3424 env.reason, env.sync, env.need_idle, sbc_flag, target);
3428 #ifdef CONFIG_CFS_BANDWIDTH
3430 static inline struct task_group *next_task_group(struct task_group *tg)
3432 tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
3434 return (&tg->list == &task_groups) ? NULL : tg;
3437 /* Iterate over all cfs_rq in a cpu */
3438 #define for_each_cfs_rq(cfs_rq, tg, cpu) \
3439 for (tg = container_of(&task_groups, struct task_group, list); \
3440 ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
3442 void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
3444 struct task_group *tg;
3445 struct cfs_rq *cfs_rq;
3449 for_each_cfs_rq(cfs_rq, tg, cpu)
3450 reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
3455 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
3457 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3458 struct task_struct *p, int change_cra);
3459 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3460 struct task_struct *p, int change_cra);
3462 /* Add task's contribution to a cpu' HMP statistics */
3463 void _inc_hmp_sched_stats_fair(struct rq *rq,
3464 struct task_struct *p, int change_cra)
3466 struct cfs_rq *cfs_rq;
3467 struct sched_entity *se = &p->se;
3470 * Although the check below is not strictly required (as
3471 * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg, called
3472 * from inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
3473 * efficiency by short-circuiting the for_each_sched_entity() loop when
3474 * sched_disable_window_stats is set.
3476 if (sched_disable_window_stats)
3479 for_each_sched_entity(se) {
3480 cfs_rq = cfs_rq_of(se);
3481 inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3482 if (cfs_rq_throttled(cfs_rq))
3486 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3488 inc_rq_hmp_stats(rq, p, change_cra);
3491 /* Remove task's contribution from a cpu' HMP statistics */
3493 _dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
3495 struct cfs_rq *cfs_rq;
3496 struct sched_entity *se = &p->se;
3498 /* See comment on efficiency in _inc_hmp_sched_stats_fair */
3499 if (sched_disable_window_stats)
3502 for_each_sched_entity(se) {
3503 cfs_rq = cfs_rq_of(se);
3504 dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3505 if (cfs_rq_throttled(cfs_rq))
3509 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3511 dec_rq_hmp_stats(rq, p, change_cra);
3514 static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3516 _inc_hmp_sched_stats_fair(rq, p, 1);
3519 static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3521 _dec_hmp_sched_stats_fair(rq, p, 1);
3524 static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3525 u32 new_task_load, u32 new_pred_demand)
3527 struct cfs_rq *cfs_rq;
3528 struct sched_entity *se = &p->se;
3529 s64 task_load_delta = (s64)new_task_load - task_load(p);
3530 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3532 for_each_sched_entity(se) {
3533 cfs_rq = cfs_rq_of(se);
3535 fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
3538 fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
3539 if (cfs_rq_throttled(cfs_rq))
3543 /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
3545 fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
3548 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3552 static int task_will_be_throttled(struct task_struct *p);
3554 #else /* CONFIG_CFS_BANDWIDTH */
3556 inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
3559 inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3561 inc_nr_big_task(&rq->hmp_stats, p);
3562 inc_cumulative_runnable_avg(&rq->hmp_stats, p);
3566 dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3568 dec_nr_big_task(&rq->hmp_stats, p);
3569 dec_cumulative_runnable_avg(&rq->hmp_stats, p);
3572 fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3573 u32 new_task_load, u32 new_pred_demand)
3575 s64 task_load_delta = (s64)new_task_load - task_load(p);
3576 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3578 fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
3580 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3583 static inline int task_will_be_throttled(struct task_struct *p)
3588 void _inc_hmp_sched_stats_fair(struct rq *rq,
3589 struct task_struct *p, int change_cra)
3591 inc_nr_big_task(&rq->hmp_stats, p);
3594 #endif /* CONFIG_CFS_BANDWIDTH */
3597 * Reset balance_interval at all sched_domain levels of the given cpu, so that it honors kick.
3600 static inline void reset_balance_interval(int cpu)
3602 struct sched_domain *sd;
3604 if (cpu >= nr_cpu_ids)
3608 for_each_domain(cpu, sd)
3609 sd->balance_interval = 0;
3614 * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal
3615 * cpu as per its demand or priority)
3617 * Returns reason why task needs to be migrated
3619 static inline int migration_needed(struct task_struct *p, int cpu)
3622 struct related_thread_group *grp;
3624 if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
3627 /* No need to migrate task that is about to be throttled */
3628 if (task_will_be_throttled(p))
3631 if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
3632 cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
3633 return UP_MIGRATION;
3635 if (sched_cpu_high_irqload(cpu))
3636 return IRQLOAD_MIGRATION;
3638 nice = task_nice(p);
3640 grp = task_related_thread_group(p);
3642 * Don't assume higher capacity means higher power. If the task
3643 * is running on the power efficient CPU, avoid migrating it
3644 * to a lower capacity cluster.
3646 if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
3647 upmigrate_discouraged(p)) &&
3648 cpu_capacity(cpu) > min_capacity &&
3649 cpu_max_power_cost(cpu) == max_power_cost) {
3651 return DOWN_MIGRATION;
3654 if (!task_will_fit(p, cpu)) {
3656 return UP_MIGRATION;
3664 kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
3666 unsigned long flags;
3669 /* Invoke active balance to force migrate currently running task */
3670 raw_spin_lock_irqsave(&rq->lock, flags);
3671 if (!rq->active_balance) {
3672 rq->active_balance = 1;
3673 rq->push_cpu = new_cpu;
3678 raw_spin_unlock_irqrestore(&rq->lock, flags);
3683 static DEFINE_RAW_SPINLOCK(migration_lock);
3685 static bool do_migration(int reason, int new_cpu, int cpu)
3687 if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
3688 && same_cluster(new_cpu, cpu))
3691 /* Inter cluster high irqload migrations are OK */
3692 return new_cpu != cpu;
3696 * Check if currently running task should be migrated to a better cpu.
3698 * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
3700 void check_for_migration(struct rq *rq, struct task_struct *p)
3702 int cpu = cpu_of(rq), new_cpu;
3703 int active_balance = 0, reason;
3705 reason = migration_needed(p, cpu);
3709 raw_spin_lock(&migration_lock);
3710 new_cpu = select_best_cpu(p, cpu, reason, 0);
3712 if (do_migration(reason, new_cpu, cpu)) {
3713 active_balance = kick_active_balance(rq, p, new_cpu);
3715 mark_reserved(new_cpu);
3718 raw_spin_unlock(&migration_lock);
3721 stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
3722 &rq->active_balance_work);
3725 #ifdef CONFIG_CFS_BANDWIDTH
3727 static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
3729 cfs_rq->hmp_stats.nr_big_tasks = 0;
3730 cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
3731 cfs_rq->hmp_stats.pred_demands_sum = 0;
3734 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3735 struct task_struct *p, int change_cra)
3737 inc_nr_big_task(&cfs_rq->hmp_stats, p);
3739 inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3742 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3743 struct task_struct *p, int change_cra)
3745 dec_nr_big_task(&cfs_rq->hmp_stats, p);
3747 dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3750 static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3751 struct cfs_rq *cfs_rq)
3753 stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
3754 stats->cumulative_runnable_avg +=
3755 cfs_rq->hmp_stats.cumulative_runnable_avg;
3756 stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
3759 static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3760 struct cfs_rq *cfs_rq)
3762 stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
3763 stats->cumulative_runnable_avg -=
3764 cfs_rq->hmp_stats.cumulative_runnable_avg;
3765 stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
3767 BUG_ON(stats->nr_big_tasks < 0 ||
3768 (s64)stats->cumulative_runnable_avg < 0);
3769 BUG_ON((s64)stats->pred_demands_sum < 0);
3772 #else /* CONFIG_CFS_BANDWIDTH */
3774 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3775 struct task_struct *p, int change_cra) { }
3777 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3778 struct task_struct *p, int change_cra) { }
3780 #endif /* CONFIG_CFS_BANDWIDTH */
3782 #else /* CONFIG_SCHED_HMP */
3784 static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
3786 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3787 struct task_struct *p, int change_cra) { }
3789 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3790 struct task_struct *p, int change_cra) { }
3792 #define dec_throttled_cfs_rq_hmp_stats(...)
3793 #define inc_throttled_cfs_rq_hmp_stats(...)
3795 #endif /* CONFIG_SCHED_HMP */
3797 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
3798 #error "load tracking assumes 2^10 as unit"
3801 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
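/*
 * E.g. cap_scale(1024, 512) = 512: scaling a full contribution (1024) by a
 * capacity/frequency factor of one half (512/1024).
 */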
3804 * We can represent the historical contribution to runnable average as the
3805 * coefficients of a geometric series. To do this we sub-divide our runnable
3806 * history into segments of approximately 1ms (1024us); label the segment that
3807 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3809 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3811 * (now) (~1ms ago) (~2ms ago)
3813 * Let u_i denote the fraction of p_i that the entity was runnable.
3815 * We then designate the fractions u_i as our co-efficients, yielding the
3816 * following representation of historical load:
3817 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3819 * We choose y based on the width of a reasonable scheduling period, fixing y^32 = 0.5.
3822 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3823 * approximately half as much as the contribution to load within the last ms (u_0).
3826 * When a period "rolls over" and we have new u_0`, multiplying the previous
3827 * sum again by y is sufficient to update:
3828 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3829 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
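/*
 * Rough numeric example: an entity runnable for the whole of the last two
 * 1024us periods accumulates roughly (1024 + 1024*y) ~= 2026, scaled by its
 * weight, in load_sum; load_avg is then that sum divided by LOAD_AVG_MAX.
 */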
3831 static __always_inline int
3832 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
3833 unsigned long weight, int running, struct cfs_rq *cfs_rq)
3835 u64 delta, scaled_delta, periods;
3837 unsigned int delta_w, scaled_delta_w, decayed = 0;
3838 unsigned long scale_freq, scale_cpu;
3840 delta = now - sa->last_update_time;
3842 * This should only happen when time goes backwards, which it
3843 * unfortunately does during sched clock init when we swap over to TSC.
3845 if ((s64)delta < 0) {
3846 sa->last_update_time = now;
3851 * Use 1024ns as the unit of measurement since it's a reasonable
3852 * approximation of 1us and fast to compute.
3857 sa->last_update_time = now;
3859 scale_freq = arch_scale_freq_capacity(NULL, cpu);
3860 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3861 trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
3863 /* delta_w is the amount already accumulated against our next period */
3864 delta_w = sa->period_contrib;
3865 if (delta + delta_w >= 1024) {
3868 /* how much left for next period will start over, we don't know yet */
3869 sa->period_contrib = 0;
3872 * Now that we know we're crossing a period boundary, figure
3873 * out how much from delta we need to complete the current
3874 * period and accrue it.
3876 delta_w = 1024 - delta_w;
3877 scaled_delta_w = cap_scale(delta_w, scale_freq);
3879 sa->load_sum += weight * scaled_delta_w;
3881 cfs_rq->runnable_load_sum +=
3882 weight * scaled_delta_w;
3886 sa->util_sum += scaled_delta_w * scale_cpu;
3890 /* Figure out how many additional periods this update spans */
3891 periods = delta / 1024;
3894 sa->load_sum = decay_load(sa->load_sum, periods + 1);
3896 cfs_rq->runnable_load_sum =
3897 decay_load(cfs_rq->runnable_load_sum, periods + 1);
3899 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
3901 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
3902 contrib = __compute_runnable_contrib(periods);
3903 contrib = cap_scale(contrib, scale_freq);
3905 sa->load_sum += weight * contrib;
3907 cfs_rq->runnable_load_sum += weight * contrib;
3910 sa->util_sum += contrib * scale_cpu;
3913 /* Remainder of delta accrued against u_0` */
3914 scaled_delta = cap_scale(delta, scale_freq);
3916 sa->load_sum += weight * scaled_delta;
3918 cfs_rq->runnable_load_sum += weight * scaled_delta;
3922 sa->util_sum += scaled_delta * scale_cpu;
3924 sa->period_contrib += delta;
3927 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
3929 cfs_rq->runnable_load_avg =
3930 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
3932 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
3939 * Signed add and clamp on underflow.
3941 * Explicitly do a load-store to ensure the intermediate value never hits
3942 * memory. This allows lockless observations without ever seeing the negative
3945 #define add_positive(_ptr, _val) do { \
3946 typeof(_ptr) ptr = (_ptr); \
3947 typeof(_val) val = (_val); \
3948 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3952 if (val < 0 && res > var) \
3955 WRITE_ONCE(*ptr, res); \
3958 #ifdef CONFIG_FAIR_GROUP_SCHED
3960 * update_tg_load_avg - update the tg's load avg
3961 * @cfs_rq: the cfs_rq whose avg changed
3962 * @force: update regardless of how small the difference
3964 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3965 * However, because tg->load_avg is a global value there are performance
3968 * In order to avoid having to look at the other cfs_rq's, we use a
3969 * differential update where we store the last value we propagated. This in
3970 * turn allows skipping updates if the differential is 'small'.
3972 * Updating tg's load_avg is necessary before update_cfs_share() (which is
3973 * done) and effective_load() (which is not done because it is too costly).
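/*
 * For example, with tg_load_avg_contrib = 6400 the global tg->load_avg is
 * only touched once this cfs_rq's load_avg has drifted by more than
 * 6400/64 = 100 from the last propagated value (unless forced).
 */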
3975 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3977 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3980 * No need to update load_avg for root_task_group as it is not used.
3982 if (cfs_rq->tg == &root_task_group)
3985 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3986 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3987 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3992 * Called within set_task_rq() right before setting a task's cpu. The
3993 * caller only guarantees p->pi_lock is held; no other assumptions,
3994 * including the state of rq->lock, should be made.
3996 void set_task_rq_fair(struct sched_entity *se,
3997 struct cfs_rq *prev, struct cfs_rq *next)
3999 if (!sched_feat(ATTACH_AGE_LOAD))
4003 * We are supposed to update the task to "current" time, then it's up to
4004 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
4005 * getting what the current time is, so simply throw away the out-of-date
4006 * time. This will result in the wakee task being less decayed, but giving
4007 * the wakee more load is not a bad thing.
4009 if (se->avg.last_update_time && prev) {
4010 u64 p_last_update_time;
4011 u64 n_last_update_time;
4013 #ifndef CONFIG_64BIT
4014 u64 p_last_update_time_copy;
4015 u64 n_last_update_time_copy;
4018 p_last_update_time_copy = prev->load_last_update_time_copy;
4019 n_last_update_time_copy = next->load_last_update_time_copy;
4023 p_last_update_time = prev->avg.last_update_time;
4024 n_last_update_time = next->avg.last_update_time;
4026 } while (p_last_update_time != p_last_update_time_copy ||
4027 n_last_update_time != n_last_update_time_copy);
4029 p_last_update_time = prev->avg.last_update_time;
4030 n_last_update_time = next->avg.last_update_time;
4032 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
4033 &se->avg, 0, 0, NULL);
4034 se->avg.last_update_time = n_last_update_time;
4038 /* Take into account change of utilization of a child task group */
4040 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
4042 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4043 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
4045 /* Nothing to update */
4049 /* Set new sched_entity's utilization */
4050 se->avg.util_avg = gcfs_rq->avg.util_avg;
4051 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
4053 /* Update parent cfs_rq utilization */
4054 add_positive(&cfs_rq->avg.util_avg, delta);
4055 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
4058 /* Take into account change of load of a child task group */
4060 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
4062 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4063 long delta, load = gcfs_rq->avg.load_avg;
4066 * If the load of group cfs_rq is null, the load of the
4067 * sched_entity will also be null so we can skip the formula
4072 /* Get tg's load and ensure tg_load > 0 */
4073 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
4075 /* Ensure tg_load >= load and updated with current load*/
4076 tg_load -= gcfs_rq->tg_load_avg_contrib;
4080 * We need to compute a correction term in the case that the
4081 * task group is consuming more CPU than a task of equal
4082 * weight. A task with a weight equal to tg->shares will have
4083 * a load less than or equal to scale_load_down(tg->shares).
4084 * Similarly, the sched_entities that represent the task group
4085 * at parent level, can't have a load higher than
4086 * scale_load_down(tg->shares). And the Sum of sched_entities'
4087 * load must be <= scale_load_down(tg->shares).
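/*
 * Illustration of the clamp below: if this group cfs_rq carries the whole
 * group load (tg_load == load), the scaled se load becomes exactly
 * scale_load_down(tg->shares), never more.
 */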
4089 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
4090 /* scale gcfs_rq's load into tg's shares*/
4091 load *= scale_load_down(gcfs_rq->tg->shares);
4096 delta = load - se->avg.load_avg;
4098 /* Nothing to update */
4102 /* Set new sched_entity's load */
4103 se->avg.load_avg = load;
4104 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
4106 /* Update parent cfs_rq load */
4107 add_positive(&cfs_rq->avg.load_avg, delta);
4108 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
4111 * If the sched_entity is already enqueued, we also have to update the
4112 * runnable load avg.
4115 /* Update parent cfs_rq runnable_load_avg */
4116 add_positive(&cfs_rq->runnable_load_avg, delta);
4117 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
4121 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
4123 cfs_rq->propagate_avg = 1;
4126 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
4128 struct cfs_rq *cfs_rq = group_cfs_rq(se);
4130 if (!cfs_rq->propagate_avg)
4133 cfs_rq->propagate_avg = 0;
4137 /* Update task and its cfs_rq load average */
4138 static inline int propagate_entity_load_avg(struct sched_entity *se)
4140 struct cfs_rq *cfs_rq;
4142 if (entity_is_task(se))
4145 if (!test_and_clear_tg_cfs_propagate(se))
4148 cfs_rq = cfs_rq_of(se);
4150 set_tg_cfs_propagate(cfs_rq);
4152 update_tg_cfs_util(cfs_rq, se);
4153 update_tg_cfs_load(cfs_rq, se);
4158 #else /* CONFIG_FAIR_GROUP_SCHED */
4160 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
4162 static inline int propagate_entity_load_avg(struct sched_entity *se)
4167 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
4169 #endif /* CONFIG_FAIR_GROUP_SCHED */
4171 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
4173 if (&this_rq()->cfs == cfs_rq) {
4175 * There are a few boundary cases this might miss but it should
4176 * get called often enough that this should (hopefully) not be
4177 * a real problem -- added to that it only calls on the local
4178 * CPU, so if we enqueue remotely we'll miss an update, but
4179 * the next tick/schedule should update.
4181 * It will not get called when we go idle, because the idle
4182 * thread is a different class (!fair), nor will the utilization
4183 * number include things like RT tasks.
4185 * As is, the util number is not freq-invariant (we'd have to
4186 * implement arch_scale_freq_capacity() for that).
4190 cpufreq_update_util(rq_of(cfs_rq), 0);
4194 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
4197 * Unsigned subtract and clamp on underflow.
4199 * Explicitly do a load-store to ensure the intermediate value never hits
4200 * memory. This allows lockless observations without ever seeing the negative
4203 #define sub_positive(_ptr, _val) do { \
4204 typeof(_ptr) ptr = (_ptr); \
4205 typeof(*ptr) val = (_val); \
4206 typeof(*ptr) res, var = READ_ONCE(*ptr); \
4210 WRITE_ONCE(*ptr, res); \
4214 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4215 * @now: current time, as per cfs_rq_clock_task()
4216 * @cfs_rq: cfs_rq to update
4217 * @update_freq: should we call cfs_rq_util_change() or will the call do so
4219 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4220 * avg. The immediate corollary is that all (fair) tasks must be attached, see
4221 * post_init_entity_util_avg().
4223 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4225 * Returns true if the load decayed or we removed load.
4227 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4228 * call update_tg_load_avg() when this function returns true.
4231 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4233 struct sched_avg *sa = &cfs_rq->avg;
4234 int decayed, removed = 0, removed_util = 0;
4236 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
4237 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
4238 sub_positive(&sa->load_avg, r);
4239 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
4241 set_tg_cfs_propagate(cfs_rq);
4244 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
4245 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
4246 sub_positive(&sa->util_avg, r);
4247 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
4249 set_tg_cfs_propagate(cfs_rq);
4252 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
4253 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
4255 #ifndef CONFIG_64BIT
4257 cfs_rq->load_last_update_time_copy = sa->last_update_time;
4260 /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
4261 if (cfs_rq == &rq_of(cfs_rq)->cfs)
4262 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
4264 if (update_freq && (decayed || removed_util))
4265 cfs_rq_util_change(cfs_rq);
4267 return decayed || removed;
4271 * Optional action to be done while updating the load average
4273 #define UPDATE_TG 0x1
4274 #define SKIP_AGE_LOAD 0x2
4276 /* Update task and its cfs_rq load average */
4277 static inline void update_load_avg(struct sched_entity *se, int flags)
4279 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4280 u64 now = cfs_rq_clock_task(cfs_rq);
4281 int cpu = cpu_of(rq_of(cfs_rq));
4286 * Track the task load average for carrying it to the new CPU after it is migrated,
4287 * and track the group sched_entity load average for task_h_load calculation in migration
4289 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
4290 __update_load_avg(now, cpu, &se->avg,
4291 se->on_rq * scale_load_down(se->load.weight),
4292 cfs_rq->curr == se, NULL);
4295 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
4296 decayed |= propagate_entity_load_avg(se);
4298 if (decayed && (flags & UPDATE_TG))
4299 update_tg_load_avg(cfs_rq, 0);
4301 if (entity_is_task(se)) {
4302 #ifdef CONFIG_SCHED_WALT
4303 ptr = (void *)&(task_of(se)->ravg);
4305 trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
4310 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4311 * @cfs_rq: cfs_rq to attach to
4312 * @se: sched_entity to attach
4314 * Must call update_cfs_rq_load_avg() before this, since we rely on
4315 * cfs_rq->avg.last_update_time being current.
4317 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4319 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4320 cfs_rq->avg.load_avg += se->avg.load_avg;
4321 cfs_rq->avg.load_sum += se->avg.load_sum;
4322 cfs_rq->avg.util_avg += se->avg.util_avg;
4323 cfs_rq->avg.util_sum += se->avg.util_sum;
4324 set_tg_cfs_propagate(cfs_rq);
4326 cfs_rq_util_change(cfs_rq);
4330 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4331 * @cfs_rq: cfs_rq to detach from
4332 * @se: sched_entity to detach
4334 * Must call update_cfs_rq_load_avg() before this, since we rely on
4335 * cfs_rq->avg.last_update_time being current.
4337 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4340 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
4341 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
4342 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4343 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4344 set_tg_cfs_propagate(cfs_rq);
4346 cfs_rq_util_change(cfs_rq);
4349 /* Add the load generated by se into cfs_rq's load average */
4351 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4353 struct sched_avg *sa = &se->avg;
4355 cfs_rq->runnable_load_avg += sa->load_avg;
4356 cfs_rq->runnable_load_sum += sa->load_sum;
4358 if (!sa->last_update_time) {
4359 attach_entity_load_avg(cfs_rq, se);
4360 update_tg_load_avg(cfs_rq, 0);
4364 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
4366 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4368 cfs_rq->runnable_load_avg =
4369 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
4370 cfs_rq->runnable_load_sum =
4371 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
4374 #ifndef CONFIG_64BIT
4375 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4377 u64 last_update_time_copy;
4378 u64 last_update_time;
4381 last_update_time_copy = cfs_rq->load_last_update_time_copy;
4383 last_update_time = cfs_rq->avg.last_update_time;
4384 } while (last_update_time != last_update_time_copy);
4386 return last_update_time;
4389 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4391 return cfs_rq->avg.last_update_time;
4396 * Synchronize entity load avg of dequeued entity without locking the previous rq.
4399 void sync_entity_load_avg(struct sched_entity *se)
4401 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4402 u64 last_update_time;
4404 last_update_time = cfs_rq_last_update_time(cfs_rq);
4405 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
4409 * Task first catches up with cfs_rq, and then subtracts
4410 * itself from the cfs_rq (task must be off the queue now).
4412 void remove_entity_load_avg(struct sched_entity *se)
4414 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4417 * tasks cannot exit without having gone through wake_up_new_task() ->
4418 * post_init_entity_util_avg() which will have added things to the
4419 * cfs_rq, so we can remove unconditionally.
4421 * Similarly for groups, they will have passed through
4422 * post_init_entity_util_avg() before unregister_sched_fair_group()
4426 sync_entity_load_avg(se);
4427 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
4428 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
4432 * Update the rq's load with the elapsed running time before entering
4433 * idle. If the last scheduled task is not a CFS task, idle_enter will
4434 * be the only way to update the runnable statistic.
4436 void idle_enter_fair(struct rq *this_rq)
4441 * Update the rq's load with the elapsed idle time before a task is
4442 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
4443 * be the only way to update the runnable statistic.
4445 void idle_exit_fair(struct rq *this_rq)
4449 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
4451 return cfs_rq->runnable_load_avg;
4454 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4456 return cfs_rq->avg.load_avg;
4459 static int idle_balance(struct rq *this_rq);
4461 #else /* CONFIG_SMP */
4464 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4469 #define UPDATE_TG 0x0
4470 #define SKIP_AGE_LOAD 0x0
4472 static inline void update_load_avg(struct sched_entity *se, int not_used1){}
4474 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4476 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4477 static inline void remove_entity_load_avg(struct sched_entity *se) {}
4480 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4482 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4484 static inline int idle_balance(struct rq *rq)
4489 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4490 struct task_struct *p, int change_cra) { }
4492 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4493 struct task_struct *p, int change_cra) { }
4495 #endif /* CONFIG_SMP */
4497 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
4499 #ifdef CONFIG_SCHEDSTATS
4500 struct task_struct *tsk = NULL;
4502 if (entity_is_task(se))
4505 if (se->statistics.sleep_start) {
4506 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
4511 if (unlikely(delta > se->statistics.sleep_max))
4512 se->statistics.sleep_max = delta;
4514 se->statistics.sleep_start = 0;
4515 se->statistics.sum_sleep_runtime += delta;
4518 account_scheduler_latency(tsk, delta >> 10, 1);
4519 trace_sched_stat_sleep(tsk, delta);
4522 if (se->statistics.block_start) {
4523 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
4528 if (unlikely(delta > se->statistics.block_max))
4529 se->statistics.block_max = delta;
4531 se->statistics.block_start = 0;
4532 se->statistics.sum_sleep_runtime += delta;
4535 if (tsk->in_iowait) {
4536 se->statistics.iowait_sum += delta;
4537 se->statistics.iowait_count++;
4538 trace_sched_stat_iowait(tsk, delta);
4541 trace_sched_stat_blocked(tsk, delta);
4542 trace_sched_blocked_reason(tsk);
4545 * Blocking time is in units of nanosecs, so shift by
4546 * 20 to get a milliseconds-range estimation of the
4547 * amount of time that the task spent sleeping:
4549 if (unlikely(prof_on == SLEEP_PROFILING)) {
4550 profile_hits(SLEEP_PROFILING,
4551 (void *)get_wchan(tsk),
4554 account_scheduler_latency(tsk, delta >> 10, 0);
4560 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4562 #ifdef CONFIG_SCHED_DEBUG
4563 s64 d = se->vruntime - cfs_rq->min_vruntime;
4568 if (d > 3*sysctl_sched_latency)
4569 schedstat_inc(cfs_rq, nr_spread_over);
4574 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4576 u64 vruntime = cfs_rq->min_vruntime;
4579 * The 'current' period is already promised to the current tasks,
4580 * however the extra weight of the new task will slow them down a
4581 * little; place the new task so that it fits in the slot that
4582 * stays open at the end.
4584 if (initial && sched_feat(START_DEBIT))
4585 vruntime += sched_vslice(cfs_rq, se);
4587 /* sleeps up to a single latency don't count. */
4589 unsigned long thresh = sysctl_sched_latency;
4592 * Halve their sleep time's effect, to allow
4593 * for a gentler effect of sleepers:
4595 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4601 /* ensure we never gain time by being placed backwards. */
4602 se->vruntime = max_vruntime(se->vruntime, vruntime);
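/*
 * Worked example (illustrative sketch, not part of the original code):
 * assuming the default 6ms sysctl_sched_latency and GENTLE_FAIR_SLEEPERS,
 * a task waking from a long sleep has the 6ms threshold halved to 3ms and
 * is placed at roughly cfs_rq->min_vruntime - 3ms. The max_vruntime()
 * clamp above means a task that slept only briefly keeps its own (larger)
 * vruntime instead, so sleeping can never accumulate more than about half
 * a latency period of credit.
 */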
4605 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4608 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4611 * Update the normalized vruntime before updating min_vruntime
4612 * through calling update_curr().
4614 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
4615 se->vruntime += cfs_rq->min_vruntime;
4618 * Update run-time statistics of the 'current'.
4620 update_curr(cfs_rq);
4621 update_load_avg(se, UPDATE_TG);
4622 enqueue_entity_load_avg(cfs_rq, se);
4623 update_cfs_shares(se);
4624 account_entity_enqueue(cfs_rq, se);
4626 if (flags & ENQUEUE_WAKEUP) {
4627 place_entity(cfs_rq, se, 0);
4628 enqueue_sleeper(cfs_rq, se);
4631 update_stats_enqueue(cfs_rq, se);
4632 check_spread(cfs_rq, se);
4633 if (se != cfs_rq->curr)
4634 __enqueue_entity(cfs_rq, se);
4637 if (cfs_rq->nr_running == 1) {
4638 list_add_leaf_cfs_rq(cfs_rq);
4639 check_enqueue_throttle(cfs_rq);
4643 static void __clear_buddies_last(struct sched_entity *se)
4645 for_each_sched_entity(se) {
4646 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4647 if (cfs_rq->last != se)
4650 cfs_rq->last = NULL;
4654 static void __clear_buddies_next(struct sched_entity *se)
4656 for_each_sched_entity(se) {
4657 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4658 if (cfs_rq->next != se)
4661 cfs_rq->next = NULL;
4665 static void __clear_buddies_skip(struct sched_entity *se)
4667 for_each_sched_entity(se) {
4668 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4669 if (cfs_rq->skip != se)
4672 cfs_rq->skip = NULL;
4676 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4678 if (cfs_rq->last == se)
4679 __clear_buddies_last(se);
4681 if (cfs_rq->next == se)
4682 __clear_buddies_next(se);
4684 if (cfs_rq->skip == se)
4685 __clear_buddies_skip(se);
4688 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4691 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4694 * Update run-time statistics of the 'current'.
4696 update_curr(cfs_rq);
4699 * When dequeuing a sched_entity, we must:
4700 * - Update loads to have both entity and cfs_rq synced with now.
4701 * - Subtract its load from the cfs_rq->runnable_avg.
4702 * - Subtract its previous weight from cfs_rq->load.weight.
4703 * - For group entity, update its weight to reflect the new share
4704 * of its group cfs_rq.
4706 update_load_avg(se, UPDATE_TG);
4707 dequeue_entity_load_avg(cfs_rq, se);
4709 update_stats_dequeue(cfs_rq, se);
4710 if (flags & DEQUEUE_SLEEP) {
4711 #ifdef CONFIG_SCHEDSTATS
4712 if (entity_is_task(se)) {
4713 struct task_struct *tsk = task_of(se);
4715 if (tsk->state & TASK_INTERRUPTIBLE)
4716 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
4717 if (tsk->state & TASK_UNINTERRUPTIBLE)
4718 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
4723 clear_buddies(cfs_rq, se);
4725 if (se != cfs_rq->curr)
4726 __dequeue_entity(cfs_rq, se);
4728 account_entity_dequeue(cfs_rq, se);
4731 * Normalize the entity after updating the min_vruntime because the
4732 * update can refer to the ->curr item and we need to reflect this
4733 * movement in our normalized position.
4735 if (!(flags & DEQUEUE_SLEEP))
4736 se->vruntime -= cfs_rq->min_vruntime;
4738 /* return excess runtime on last dequeue */
4739 return_cfs_rq_runtime(cfs_rq);
4741 update_min_vruntime(cfs_rq);
4742 update_cfs_shares(se);
4746 * Preempt the current task with a newly woken task if needed:
4749 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4751 unsigned long ideal_runtime, delta_exec;
4752 struct sched_entity *se;
4755 ideal_runtime = sched_slice(cfs_rq, curr);
4756 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4757 if (delta_exec > ideal_runtime) {
4758 resched_curr(rq_of(cfs_rq));
4760 * The current task ran long enough, ensure it doesn't get
4761 * re-elected due to buddy favours.
4763 clear_buddies(cfs_rq, curr);
4768 * Ensure that a task that missed wakeup preemption by a
4769 * narrow margin doesn't have to wait for a full slice.
4770 * This also mitigates buddy induced latencies under load.
4772 if (delta_exec < sysctl_sched_min_granularity)
4775 se = __pick_first_entity(cfs_rq);
4776 delta = curr->vruntime - se->vruntime;
4781 if (delta > ideal_runtime)
4782 resched_curr(rq_of(cfs_rq));
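/*
 * Worked example (illustrative, assuming default tunables): with two
 * runnable tasks of equal weight and a 6ms sched_latency, sched_slice()
 * gives each an ideal_runtime of roughly 3ms. A current task that has run
 * 3.5ms since it was last picked is rescheduled by the first test above,
 * while one that has run only 0.5ms is left alone by the 0.75ms
 * min_granularity check, even if the leftmost waiter already has a
 * smaller vruntime.
 */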
4786 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4788 /* 'current' is not kept within the tree. */
4791 * Any task has to be enqueued before it gets to execute on
4792 * a CPU. So account for the time it spent waiting on the runqueue.
4795 update_stats_wait_end(cfs_rq, se);
4796 __dequeue_entity(cfs_rq, se);
4797 update_load_avg(se, UPDATE_TG);
4800 update_stats_curr_start(cfs_rq, se);
4802 #ifdef CONFIG_SCHEDSTATS
4804 * Track our maximum slice length, if the CPU's load is at
4805 * least twice that of our own weight (i.e. don't track it
4806 * when there are only lesser-weight tasks around):
4808 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4809 se->statistics.slice_max = max(se->statistics.slice_max,
4810 se->sum_exec_runtime - se->prev_sum_exec_runtime);
4813 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4817 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4820 * Pick the next process, keeping these things in mind, in this order:
4821 * 1) keep things fair between processes/task groups
4822 * 2) pick the "next" process, since someone really wants that to run
4823 * 3) pick the "last" process, for cache locality
4824 * 4) do not run the "skip" process, if something else is available
4826 static struct sched_entity *
4827 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4829 struct sched_entity *left = __pick_first_entity(cfs_rq);
4830 struct sched_entity *se;
4833 * If curr is set we have to see if it's left of the leftmost entity
4834 * still in the tree, provided there was anything in the tree at all.
4836 if (!left || (curr && entity_before(curr, left)))
4839 se = left; /* ideally we run the leftmost entity */
4842 * Avoid running the skip buddy, if running something else can
4843 * be done without getting too unfair.
4845 if (cfs_rq->skip == se) {
4846 struct sched_entity *second;
4849 second = __pick_first_entity(cfs_rq);
4851 second = __pick_next_entity(se);
4852 if (!second || (curr && entity_before(curr, second)))
4856 if (second && wakeup_preempt_entity(second, left) < 1)
4861 * Prefer last buddy, try to return the CPU to a preempted task.
4863 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4867 * Someone really wants this to run. If it's not unfair, run it.
4869 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4872 clear_buddies(cfs_rq, se);
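/*
 * Illustrative trace (not from the original source): suppose the leftmost
 * entity has vruntime 100ms and cfs_rq->next, marked by set_next_buddy(),
 * sits at 100.5ms. wakeup_preempt_entity(next, left) sees a vruntime gap
 * of 0.5ms, below the roughly 1ms default wakeup granularity for a nice-0
 * task, so it returns 0 and the next buddy runs even though it is not the
 * leftmost entity. Had the gap exceeded the granularity, fairness would
 * win and the leftmost entity would be picked instead.
 */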
4877 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4879 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4882 * If still on the runqueue then deactivate_task()
4883 * was not called and update_curr() has to be done:
4886 update_curr(cfs_rq);
4888 /* throttle cfs_rqs exceeding runtime */
4889 check_cfs_rq_runtime(cfs_rq);
4891 check_spread(cfs_rq, prev);
4893 update_stats_wait_start(cfs_rq, prev);
4894 /* Put 'current' back into the tree. */
4895 __enqueue_entity(cfs_rq, prev);
4896 /* in !on_rq case, update occurred at dequeue */
4897 update_load_avg(prev, 0);
4899 cfs_rq->curr = NULL;
4903 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4906 * Update run-time statistics of the 'current'.
4908 update_curr(cfs_rq);
4911 * Ensure that runnable average is periodically updated.
4913 update_load_avg(curr, UPDATE_TG);
4914 update_cfs_shares(curr);
4916 #ifdef CONFIG_SCHED_HRTICK
4918 * queued ticks are scheduled to match the slice, so don't bother
4919 * validating it and just reschedule.
4922 resched_curr(rq_of(cfs_rq));
4926 * don't let the period tick interfere with the hrtick preemption
4928 if (!sched_feat(DOUBLE_TICK) &&
4929 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4933 if (cfs_rq->nr_running > 1)
4934 check_preempt_tick(cfs_rq, curr);
4938 /**************************************************
4939 * CFS bandwidth control machinery
4942 #ifdef CONFIG_CFS_BANDWIDTH
4944 #ifdef HAVE_JUMP_LABEL
4945 static struct static_key __cfs_bandwidth_used;
4947 static inline bool cfs_bandwidth_used(void)
4949 return static_key_false(&__cfs_bandwidth_used);
4952 void cfs_bandwidth_usage_inc(void)
4954 static_key_slow_inc(&__cfs_bandwidth_used);
4957 void cfs_bandwidth_usage_dec(void)
4959 static_key_slow_dec(&__cfs_bandwidth_used);
4961 #else /* HAVE_JUMP_LABEL */
4962 static bool cfs_bandwidth_used(void)
4967 void cfs_bandwidth_usage_inc(void) {}
4968 void cfs_bandwidth_usage_dec(void) {}
4969 #endif /* HAVE_JUMP_LABEL */
4972 * default period for cfs group bandwidth.
4973 * default: 0.1s, units: nanoseconds
4975 static inline u64 default_cfs_period(void)
4977 return 100000000ULL;
4980 static inline u64 sched_cfs_bandwidth_slice(void)
4982 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
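/*
 * Worked example (illustrative, assuming the default 5ms bandwidth slice):
 * with quota = 20ms per 100ms period, each call to assign_cfs_rq_runtime()
 * pulls roughly one slice at a time from the global pool into the local
 * cfs_rq. After four full slices the pool is empty, further requests fail,
 * and the hierarchy ends up throttled until the period timer refills
 * cfs_b->runtime for the next period.
 */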
4986 * Replenish runtime according to assigned quota and update expiration time.
4987 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4988 * additional synchronization around rq->lock.
4990 * requires cfs_b->lock
4992 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4996 if (cfs_b->quota == RUNTIME_INF)
4999 now = sched_clock_cpu(smp_processor_id());
5000 cfs_b->runtime = cfs_b->quota;
5001 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
5004 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5006 return &tg->cfs_bandwidth;
5009 /* rq_clock_task() normalized against any time this cfs_rq has spent throttled */
5010 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5012 if (unlikely(cfs_rq->throttle_count))
5013 return cfs_rq->throttled_clock_task;
5015 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
5018 /* returns 0 on failure to allocate runtime */
5019 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5021 struct task_group *tg = cfs_rq->tg;
5022 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
5023 u64 amount = 0, min_amount, expires;
5025 /* note: this is a positive sum as runtime_remaining <= 0 */
5026 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
5028 raw_spin_lock(&cfs_b->lock);
5029 if (cfs_b->quota == RUNTIME_INF)
5030 amount = min_amount;
5032 start_cfs_bandwidth(cfs_b);
5034 if (cfs_b->runtime > 0) {
5035 amount = min(cfs_b->runtime, min_amount);
5036 cfs_b->runtime -= amount;
5040 expires = cfs_b->runtime_expires;
5041 raw_spin_unlock(&cfs_b->lock);
5043 cfs_rq->runtime_remaining += amount;
5045 * we may have advanced our local expiration to account for allowed
5046 * spread between our sched_clock and the one on which runtime was issued.
5049 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
5050 cfs_rq->runtime_expires = expires;
5052 return cfs_rq->runtime_remaining > 0;
5056 * Note: This depends on the synchronization provided by sched_clock and the
5057 * fact that rq->clock snapshots this value.
5059 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5061 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5063 /* if the deadline is ahead of our clock, nothing to do */
5064 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
5067 if (cfs_rq->runtime_remaining < 0)
5071 * If the local deadline has passed we have to consider the
5072 * possibility that our sched_clock is 'fast' and the global deadline
5073 * has not truly expired.
5075 * Fortunately we can determine whether this is the case by checking
5076 * whether the global deadline has advanced. It is valid to compare
5077 * cfs_b->runtime_expires without any locks since we only care about
5078 * exact equality, so a partial write will still work.
5081 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
5082 /* extend local deadline, drift is bounded above by 2 ticks */
5083 cfs_rq->runtime_expires += TICK_NSEC;
5085 /* global deadline is ahead, expiration has passed */
5086 cfs_rq->runtime_remaining = 0;
5090 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5092 /* dock delta_exec before expiring quota (as it could span periods) */
5093 cfs_rq->runtime_remaining -= delta_exec;
5094 expire_cfs_rq_runtime(cfs_rq);
5096 if (likely(cfs_rq->runtime_remaining > 0))
5100 * if we're unable to extend our runtime we resched so that the active
5101 * hierarchy can be throttled
5103 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5104 resched_curr(rq_of(cfs_rq));
5107 static __always_inline
5108 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5110 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5113 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5116 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5118 return cfs_bandwidth_used() && cfs_rq->throttled;
5121 #ifdef CONFIG_SCHED_HMP
5123 * Check if task is part of a hierarchy where some cfs_rq does not have any runtime left.
5126 * We can't rely on throttled_hierarchy() to do this test, as
5127 * cfs_rq->throttle_count will not be updated yet when this function is called
5128 * from scheduler_tick()
5130 static int task_will_be_throttled(struct task_struct *p)
5132 struct sched_entity *se = &p->se;
5133 struct cfs_rq *cfs_rq;
5135 if (!cfs_bandwidth_used())
5138 for_each_sched_entity(se) {
5139 cfs_rq = cfs_rq_of(se);
5140 if (!cfs_rq->runtime_enabled)
5142 if (cfs_rq->runtime_remaining <= 0)
5150 /* check whether cfs_rq, or any parent, is throttled */
5151 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5153 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5157 * Ensure that neither of the group entities corresponding to src_cpu or
5158 * dest_cpu are members of a throttled hierarchy when performing group
5159 * load-balance operations.
5161 static inline int throttled_lb_pair(struct task_group *tg,
5162 int src_cpu, int dest_cpu)
5164 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5166 src_cfs_rq = tg->cfs_rq[src_cpu];
5167 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5169 return throttled_hierarchy(src_cfs_rq) ||
5170 throttled_hierarchy(dest_cfs_rq);
5173 /* updated child weight may affect parent so we have to do this bottom up */
5174 static int tg_unthrottle_up(struct task_group *tg, void *data)
5176 struct rq *rq = data;
5177 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5179 cfs_rq->throttle_count--;
5181 if (!cfs_rq->throttle_count) {
5182 /* adjust cfs_rq_clock_task() */
5183 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
5184 cfs_rq->throttled_clock_task;
5191 static int tg_throttle_down(struct task_group *tg, void *data)
5193 struct rq *rq = data;
5194 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5196 /* group is entering throttled state, stop time */
5197 if (!cfs_rq->throttle_count)
5198 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5199 cfs_rq->throttle_count++;
5204 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
5206 struct rq *rq = rq_of(cfs_rq);
5207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5208 struct sched_entity *se;
5209 long task_delta, dequeue = 1;
5212 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5214 /* freeze hierarchy runnable averages while throttled */
5216 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5219 task_delta = cfs_rq->h_nr_running;
5220 for_each_sched_entity(se) {
5221 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5222 /* throttled entity or throttle-on-deactivate */
5227 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5228 qcfs_rq->h_nr_running -= task_delta;
5229 dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
5231 if (qcfs_rq->load.weight)
5236 sub_nr_running(rq, task_delta);
5237 dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
5240 cfs_rq->throttled = 1;
5241 cfs_rq->throttled_clock = rq_clock(rq);
5242 raw_spin_lock(&cfs_b->lock);
5243 empty = list_empty(&cfs_b->throttled_cfs_rq);
5246 * Add to the _head_ of the list, so that an already-started
5247 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
5248 * not running add to the tail so that later runqueues don't get starved.
5250 if (cfs_b->distribute_running)
5251 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5253 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5256 * If we're the first throttled task, make sure the bandwidth timer is running.
5260 start_cfs_bandwidth(cfs_b);
5262 raw_spin_unlock(&cfs_b->lock);
5264 /* Log effect on hmp stats after throttling */
5265 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5266 sched_irqload(cpu_of(rq)),
5267 power_cost(cpu_of(rq), 0),
5268 cpu_temp(cpu_of(rq)));
5271 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5273 struct rq *rq = rq_of(cfs_rq);
5274 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5275 struct sched_entity *se;
5278 struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
5280 se = cfs_rq->tg->se[cpu_of(rq)];
5282 cfs_rq->throttled = 0;
5284 update_rq_clock(rq);
5286 raw_spin_lock(&cfs_b->lock);
5287 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5288 list_del_rcu(&cfs_rq->throttled_list);
5289 raw_spin_unlock(&cfs_b->lock);
5291 /* update hierarchical throttle state */
5292 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5294 if (!cfs_rq->load.weight)
5297 task_delta = cfs_rq->h_nr_running;
5298 for_each_sched_entity(se) {
5302 cfs_rq = cfs_rq_of(se);
5304 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
5305 cfs_rq->h_nr_running += task_delta;
5306 inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
5308 if (cfs_rq_throttled(cfs_rq))
5313 add_nr_running(rq, task_delta);
5314 inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
5317 /* determine whether we need to wake up a potentially idle cpu */
5318 if (rq->curr == rq->idle && rq->cfs.nr_running)
5321 /* Log effect on hmp stats after un-throttling */
5322 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5323 sched_irqload(cpu_of(rq)),
5324 power_cost(cpu_of(rq), 0),
5325 cpu_temp(cpu_of(rq)));
5328 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
5329 u64 remaining, u64 expires)
5331 struct cfs_rq *cfs_rq;
5333 u64 starting_runtime = remaining;
5336 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5338 struct rq *rq = rq_of(cfs_rq);
5340 raw_spin_lock(&rq->lock);
5341 if (!cfs_rq_throttled(cfs_rq))
5344 runtime = -cfs_rq->runtime_remaining + 1;
5345 if (runtime > remaining)
5346 runtime = remaining;
5347 remaining -= runtime;
5349 cfs_rq->runtime_remaining += runtime;
5350 cfs_rq->runtime_expires = expires;
5352 /* we check whether we're throttled above */
5353 if (cfs_rq->runtime_remaining > 0)
5354 unthrottle_cfs_rq(cfs_rq);
5357 raw_spin_unlock(&rq->lock);
5364 return starting_runtime - remaining;
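/*
 * Worked example (illustrative numbers): with 5ms left in the pool and two
 * throttled cfs_rqs whose runtime_remaining is -3ms and -4ms, the first is
 * given 3ms + 1ns (just enough to go positive) and is unthrottled; the
 * second receives only the roughly 2ms that remain, stays non-positive and
 * therefore throttled, and the function reports about 5ms as the amount
 * handed out so the caller can subtract it from cfs_b->runtime.
 */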
5368 * Responsible for refilling a task_group's bandwidth and unthrottling its
5369 * cfs_rqs as appropriate. If there has been no activity within the last
5370 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5371 * used to track this state.
5373 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5375 u64 runtime, runtime_expires;
5378 /* no need to continue the timer with no bandwidth constraint */
5379 if (cfs_b->quota == RUNTIME_INF)
5380 goto out_deactivate;
5382 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5383 cfs_b->nr_periods += overrun;
5386 * idle depends on !throttled (for the case of a large deficit), and if
5387 * we're going inactive then everything else can be deferred
5389 if (cfs_b->idle && !throttled)
5390 goto out_deactivate;
5392 __refill_cfs_bandwidth_runtime(cfs_b);
5395 /* mark as potentially idle for the upcoming period */
5400 /* account preceding periods in which throttling occurred */
5401 cfs_b->nr_throttled += overrun;
5403 runtime_expires = cfs_b->runtime_expires;
5406 * This check is repeated as we are holding onto the new bandwidth while
5407 * we unthrottle. This can potentially race with an unthrottled group
5408 * trying to acquire new bandwidth from the global pool. This can result
5409 * in us over-using our runtime if it is all used during this loop, but
5410 * only by limited amounts in that extreme case.
5412 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
5413 runtime = cfs_b->runtime;
5414 cfs_b->distribute_running = 1;
5415 raw_spin_unlock(&cfs_b->lock);
5416 /* we can't nest cfs_b->lock while distributing bandwidth */
5417 runtime = distribute_cfs_runtime(cfs_b, runtime,
5419 raw_spin_lock(&cfs_b->lock);
5421 cfs_b->distribute_running = 0;
5422 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5424 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5428 * While we are ensured activity in the period following an
5429 * unthrottle, this also covers the case in which the new bandwidth is
5430 * insufficient to cover the existing bandwidth deficit. (Forcing the
5431 * timer to remain active while there are any throttled entities.)
5441 /* a cfs_rq won't donate quota below this amount */
5442 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5443 /* minimum remaining period time to redistribute slack quota */
5444 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5445 /* how long we wait to gather additional slack before distributing */
5446 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
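/*
 * Worked example (illustrative): a cfs_rq that dequeues its last task with
 * 3ms of local runtime left keeps min_cfs_rq_runtime (1ms) and returns 2ms
 * to the global pool in __return_cfs_rq_runtime(). If the pool then holds
 * more than one slice and other cfs_rqs are throttled, the slack timer is
 * armed for 5ms, unless a quota refresh is already due within the next 7ms
 * (slack period plus min_bandwidth_expiration).
 */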
5449 * Are we near the end of the current quota period?
5451 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
5452 * hrtimer base being cleared by hrtimer_start. In the case of
5453 * migrate_hrtimers, base is never cleared, so we are fine.
5455 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5457 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5460 /* if the call-back is running a quota refresh is already occurring */
5461 if (hrtimer_callback_running(refresh_timer))
5464 /* is a quota refresh about to occur? */
5465 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5466 if (remaining < min_expire)
5472 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5474 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5476 /* if there's a quota refresh soon don't bother with slack */
5477 if (runtime_refresh_within(cfs_b, min_left))
5480 hrtimer_start(&cfs_b->slack_timer,
5481 ns_to_ktime(cfs_bandwidth_slack_period),
5485 /* we know any runtime found here is valid as update_curr() precedes return */
5486 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5488 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5489 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5491 if (slack_runtime <= 0)
5494 raw_spin_lock(&cfs_b->lock);
5495 if (cfs_b->quota != RUNTIME_INF &&
5496 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
5497 cfs_b->runtime += slack_runtime;
5499 /* we are under rq->lock, defer unthrottling using a timer */
5500 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5501 !list_empty(&cfs_b->throttled_cfs_rq))
5502 start_cfs_slack_bandwidth(cfs_b);
5504 raw_spin_unlock(&cfs_b->lock);
5506 /* even if it's not valid for return we don't want to try again */
5507 cfs_rq->runtime_remaining -= slack_runtime;
5510 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5512 if (!cfs_bandwidth_used())
5515 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5518 __return_cfs_rq_runtime(cfs_rq);
5522 * This is done with a timer (instead of inline with bandwidth return) since
5523 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5525 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5527 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5530 /* confirm we're still not at a refresh boundary */
5531 raw_spin_lock(&cfs_b->lock);
5532 if (cfs_b->distribute_running) {
5533 raw_spin_unlock(&cfs_b->lock);
5537 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5538 raw_spin_unlock(&cfs_b->lock);
5542 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5543 runtime = cfs_b->runtime;
5545 expires = cfs_b->runtime_expires;
5547 cfs_b->distribute_running = 1;
5549 raw_spin_unlock(&cfs_b->lock);
5554 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
5556 raw_spin_lock(&cfs_b->lock);
5557 if (expires == cfs_b->runtime_expires)
5558 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5559 cfs_b->distribute_running = 0;
5560 raw_spin_unlock(&cfs_b->lock);
5564 * When a group wakes up we want to make sure that its quota is not already
5565 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
5566 * runtime as update_curr() throttling cannot trigger until it's on-rq.
5568 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5570 if (!cfs_bandwidth_used())
5573 /* Synchronize hierarchical throttle counter: */
5574 if (unlikely(!cfs_rq->throttle_uptodate)) {
5575 struct rq *rq = rq_of(cfs_rq);
5576 struct cfs_rq *pcfs_rq;
5577 struct task_group *tg;
5579 cfs_rq->throttle_uptodate = 1;
5581 /* Get closest up-to-date node, because leaves go first: */
5582 for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
5583 pcfs_rq = tg->cfs_rq[cpu_of(rq)];
5584 if (pcfs_rq->throttle_uptodate)
5588 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5589 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5593 /* an active group must be handled by the update_curr()->put() path */
5594 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5597 /* ensure the group is not already throttled */
5598 if (cfs_rq_throttled(cfs_rq))
5601 /* update runtime allocation */
5602 account_cfs_rq_runtime(cfs_rq, 0);
5603 if (cfs_rq->runtime_remaining <= 0)
5604 throttle_cfs_rq(cfs_rq);
5607 /* conditionally throttle active cfs_rq's from put_prev_entity() */
5608 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5610 if (!cfs_bandwidth_used())
5613 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5617 * it's possible for a throttled entity to be forced into a running
5618 * state (e.g. set_curr_task), in this case we're finished.
5620 if (cfs_rq_throttled(cfs_rq))
5623 throttle_cfs_rq(cfs_rq);
5627 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5629 struct cfs_bandwidth *cfs_b =
5630 container_of(timer, struct cfs_bandwidth, slack_timer);
5632 do_sched_cfs_slack_timer(cfs_b);
5634 return HRTIMER_NORESTART;
5637 extern const u64 max_cfs_quota_period;
5639 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5641 struct cfs_bandwidth *cfs_b =
5642 container_of(timer, struct cfs_bandwidth, period_timer);
5647 raw_spin_lock(&cfs_b->lock);
5649 overrun = hrtimer_forward_now(timer, cfs_b->period);
5654 u64 new, old = ktime_to_ns(cfs_b->period);
5656 new = (old * 147) / 128; /* ~115% */
5657 new = min(new, max_cfs_quota_period);
5659 cfs_b->period = ns_to_ktime(new);
5661 /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
5662 cfs_b->quota *= new;
5663 cfs_b->quota = div64_u64(cfs_b->quota, old);
5665 pr_warn_ratelimited(
5666 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
5668 div_u64(new, NSEC_PER_USEC),
5669 div_u64(cfs_b->quota, NSEC_PER_USEC));
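/*
 * Worked arithmetic (illustrative): 147/128 is roughly 1.148, so a 100us
 * period that keeps overrunning grows to about 114us, then about 131us,
 * and so on, capped at max_cfs_quota_period. The quota is multiplied by
 * the same new/old ratio just above, so the group's allowed bandwidth
 * fraction (quota/period) stays effectively unchanged.
 */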
5671 /* reset count so we don't come right back in here */
5675 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5678 cfs_b->period_active = 0;
5679 raw_spin_unlock(&cfs_b->lock);
5681 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5684 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5686 raw_spin_lock_init(&cfs_b->lock);
5688 cfs_b->quota = RUNTIME_INF;
5689 cfs_b->period = ns_to_ktime(default_cfs_period());
5691 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5692 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5693 cfs_b->period_timer.function = sched_cfs_period_timer;
5694 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5695 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5696 cfs_b->distribute_running = 0;
5699 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5701 cfs_rq->runtime_enabled = 0;
5702 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5703 init_cfs_rq_hmp_stats(cfs_rq);
5706 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5708 lockdep_assert_held(&cfs_b->lock);
5710 if (!cfs_b->period_active) {
5711 cfs_b->period_active = 1;
5712 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5713 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5717 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5719 /* init_cfs_bandwidth() was not called */
5720 if (!cfs_b->throttled_cfs_rq.next)
5723 hrtimer_cancel(&cfs_b->period_timer);
5724 hrtimer_cancel(&cfs_b->slack_timer);
5727 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5729 struct cfs_rq *cfs_rq;
5731 for_each_leaf_cfs_rq(rq, cfs_rq) {
5732 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
5734 raw_spin_lock(&cfs_b->lock);
5735 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5736 raw_spin_unlock(&cfs_b->lock);
5740 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5742 struct cfs_rq *cfs_rq;
5744 for_each_leaf_cfs_rq(rq, cfs_rq) {
5745 if (!cfs_rq->runtime_enabled)
5749 * clock_task is not advancing so we just need to make sure
5750 * there's some valid quota amount
5752 cfs_rq->runtime_remaining = 1;
5754 * Offline rq is schedulable till cpu is completely disabled
5755 * in take_cpu_down(), so we prevent new cfs throttling here.
5757 cfs_rq->runtime_enabled = 0;
5759 if (cfs_rq_throttled(cfs_rq))
5760 unthrottle_cfs_rq(cfs_rq);
5764 #else /* CONFIG_CFS_BANDWIDTH */
5765 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5767 return rq_clock_task(rq_of(cfs_rq));
5770 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5771 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5772 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5773 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5775 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5780 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5785 static inline int throttled_lb_pair(struct task_group *tg,
5786 int src_cpu, int dest_cpu)
5791 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5793 #ifdef CONFIG_FAIR_GROUP_SCHED
5794 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5797 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5801 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5802 static inline void update_runtime_enabled(struct rq *rq) {}
5803 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5805 #endif /* CONFIG_CFS_BANDWIDTH */
5807 /**************************************************
5808 * CFS operations on tasks:
5811 #ifdef CONFIG_SCHED_HRTICK
5812 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5814 struct sched_entity *se = &p->se;
5815 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5817 WARN_ON(task_rq(p) != rq);
5819 if (rq->cfs.h_nr_running > 1) {
5820 u64 slice = sched_slice(cfs_rq, se);
5821 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5822 s64 delta = slice - ran;
5829 hrtick_start(rq, delta);
5834 * called from enqueue/dequeue and updates the hrtick when the
5835 * current task is from our class.
5837 static void hrtick_update(struct rq *rq)
5839 struct task_struct *curr = rq->curr;
5841 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5844 hrtick_start_fair(rq, curr);
5846 #else /* !CONFIG_SCHED_HRTICK */
5848 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5852 static inline void hrtick_update(struct rq *rq)
5858 static bool __cpu_overutilized(int cpu, int delta);
5859 static bool cpu_overutilized(int cpu);
5860 unsigned long boosted_cpu_util(int cpu);
5862 #define boosted_cpu_util(cpu) cpu_util_freq(cpu)
5866 * The enqueue_task method is called before nr_running is
5867 * increased. Here we update the fair scheduling stats and
5868 * then put the task into the rbtree:
5871 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5873 struct cfs_rq *cfs_rq;
5874 struct sched_entity *se = &p->se;
5876 int task_new = flags & ENQUEUE_WAKEUP_NEW;
5880 * If in_iowait is set, the code below may not trigger any cpufreq
5881 * utilization updates, so do it here explicitly with the IOWAIT flag
5885 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
5887 for_each_sched_entity(se) {
5890 cfs_rq = cfs_rq_of(se);
5891 enqueue_entity(cfs_rq, se, flags);
5894 * end evaluation on encountering a throttled cfs_rq
5896 * note: in the case of encountering a throttled cfs_rq we will
5897 * post the final h_nr_running increment below.
5899 if (cfs_rq_throttled(cfs_rq))
5901 cfs_rq->h_nr_running++;
5902 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5904 flags = ENQUEUE_WAKEUP;
5907 for_each_sched_entity(se) {
5908 cfs_rq = cfs_rq_of(se);
5909 cfs_rq->h_nr_running++;
5910 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5912 if (cfs_rq_throttled(cfs_rq))
5915 update_load_avg(se, UPDATE_TG);
5916 update_cfs_shares(se);
5920 add_nr_running(rq, 1);
5921 inc_rq_hmp_stats(rq, p, 1);
5927 * Update SchedTune accounting.
5929 * We do it before updating the CPU capacity to ensure the
5930 * boost value of the current task is accounted for in the
5931 * selection of the OPP.
5933 * We do it also in the case where we enqueue a throttled task;
5934 * we could argue that a throttled task should not boost a CPU, however:
5936 * a) properly implementing CPU boosting considering throttled
5937 * tasks will increase a lot the complexity of the solution
5938 * b) it's not easy to quantify the benefits introduced by
5939 * such a more complex solution.
5940 * Thus, for the time being we go for the simple solution and boost
5941 * also for throttled RQs.
5943 schedtune_enqueue_task(p, cpu_of(rq));
5945 if (energy_aware() && !se) {
5946 if (!task_new && !rq->rd->overutilized &&
5947 cpu_overutilized(rq->cpu)) {
5948 rq->rd->overutilized = true;
5949 trace_sched_overutilized(true);
5953 #endif /* CONFIG_SMP */
5957 static void set_next_buddy(struct sched_entity *se);
5960 * The dequeue_task method is called before nr_running is
5961 * decreased. We remove the task from the rbtree and
5962 * update the fair scheduling stats:
5964 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5966 struct cfs_rq *cfs_rq;
5967 struct sched_entity *se = &p->se;
5968 int task_sleep = flags & DEQUEUE_SLEEP;
5970 for_each_sched_entity(se) {
5971 cfs_rq = cfs_rq_of(se);
5972 dequeue_entity(cfs_rq, se, flags);
5975 * end evaluation on encountering a throttled cfs_rq
5977 * note: in the case of encountering a throttled cfs_rq we will
5978 * post the final h_nr_running decrement below.
5980 if (cfs_rq_throttled(cfs_rq))
5982 cfs_rq->h_nr_running--;
5983 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
5985 /* Don't dequeue parent if it has other entities besides us */
5986 if (cfs_rq->load.weight) {
5987 /* Avoid re-evaluating load for this entity: */
5988 se = parent_entity(se);
5990 * Bias pick_next to pick a task from this cfs_rq, as
5991 * p is sleeping when it is within its sched_slice.
5993 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5997 flags |= DEQUEUE_SLEEP;
6000 for_each_sched_entity(se) {
6001 cfs_rq = cfs_rq_of(se);
6002 cfs_rq->h_nr_running--;
6003 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
6005 if (cfs_rq_throttled(cfs_rq))
6008 update_load_avg(se, UPDATE_TG);
6009 update_cfs_shares(se);
6013 sub_nr_running(rq, 1);
6014 dec_rq_hmp_stats(rq, p, 1);
6020 * Update SchedTune accounting
6022 * We do it before updating the CPU capacity to ensure the
6023 * boost value of the current task is accounted for in the
6024 * selection of the OPP.
6026 schedtune_dequeue_task(p, cpu_of(rq));
6028 #endif /* CONFIG_SMP */
6036 * per rq 'load' array crap; XXX kill this.
6040 * The exact cpuload at various idx values, calculated at every tick would be
6041 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
6043 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
6044 * on the nth tick when cpu may be busy, then we have:
6045 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6046 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
6048 * decay_load_missed() below does efficient calculation of
6049 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6050 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
6052 * The calculation is approximated on a 128 point scale.
6053 * degrade_zero_ticks is the number of ticks after which load at any
6054 * particular idx is approximated to be zero.
6055 * degrade_factor is a precomputed table, a row for each load idx.
6056 * Each column corresponds to degradation factor for a power of two ticks,
6057 * based on 128 point scale.
6059 * row 2, col 3 (=12) says that the degradation at load idx 2 after
6060 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
6062 * With this power of 2 load factors, we can degrade the load n times
6063 * by looking at 1 bits in n and doing as many mult/shift instead of
6064 * n mult/shifts needed by the exact degradation.
6066 #define DEGRADE_SHIFT 7
6067 static const unsigned char
6068 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
6069 static const unsigned char
6070 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
6071 {0, 0, 0, 0, 0, 0, 0, 0},
6072 {64, 32, 8, 0, 0, 0, 0, 0},
6073 {96, 72, 40, 12, 1, 0, 0, 0},
6074 {112, 98, 75, 43, 15, 1, 0, 0},
6075 {120, 112, 98, 76, 45, 16, 2, 0} };
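/*
 * Worked example (illustrative): decaying a load of 1024 at idx 2 across
 * 5 missed ticks. 5 is binary 101, so decay_load_missed() below applies
 * the 1-tick column and the 4-tick column:
 *
 *	1024 * 96/128 * 40/128 = 240
 *
 * which is close to the exact (3/4)^5 * 1024 = 243. Anything beyond
 * degrade_zero_ticks[2] = 32 missed ticks is simply treated as zero.
 */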
6078 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
6079 * would be when CPU is idle and so we just decay the old load without
6080 * adding any new load.
6082 static unsigned long
6083 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
6087 if (!missed_updates)
6090 if (missed_updates >= degrade_zero_ticks[idx])
6094 return load >> missed_updates;
6096 while (missed_updates) {
6097 if (missed_updates % 2)
6098 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
6100 missed_updates >>= 1;
6107 * Update rq->cpu_load[] statistics. This function is usually called every
6108 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
6109 * every tick. We fix it up based on jiffies.
6111 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
6112 unsigned long pending_updates)
6116 this_rq->nr_load_updates++;
6118 /* Update our load: */
6119 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
6120 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
6121 unsigned long old_load, new_load;
6123 /* scale is effectively 1 << i now, and >> i divides by scale */
6125 old_load = this_rq->cpu_load[i];
6126 old_load = decay_load_missed(old_load, pending_updates - 1, i);
6127 new_load = this_load;
6129 * Round up the averaging division if load is increasing. This
6130 * prevents us from getting stuck on 9 if the load is 10, for example.
6133 if (new_load > old_load)
6134 new_load += scale - 1;
6136 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
6139 sched_avg_update(this_rq);
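/*
 * Worked example (illustrative): for idx i = 2 (scale = 4), an old
 * cpu_load[2] of 400 and an instantaneous load of 800 give
 *
 *	new_load = 800 + (4 - 1) = 803		(round up, load is rising)
 *	cpu_load[2] = (400 * 3 + 803) >> 2 = 500
 *
 * so the higher-indexed entries drift toward the current load in smaller
 * steps each tick, giving progressively longer-term averages.
 */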
6142 /* Used instead of source_load when we know the type == 0 */
6143 static unsigned long weighted_cpuload(const int cpu)
6145 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
6148 #ifdef CONFIG_NO_HZ_COMMON
6150 * There is no sane way to deal with nohz on smp when using jiffies because the
6151 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
6152 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
6154 * Therefore we cannot use the delta approach from the regular tick since that
6155 * would seriously skew the load calculation. However we'll make do for those
6156 * updates happening while idle (nohz_idle_balance) or coming out of idle
6157 * (tick_nohz_idle_exit).
6159 * This means we might still be one tick off for nohz periods.
6163 * Called from nohz_idle_balance() to update the load ratings before doing the
6166 static void update_idle_cpu_load(struct rq *this_rq)
6168 unsigned long curr_jiffies = READ_ONCE(jiffies);
6169 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6170 unsigned long pending_updates;
6173 * bail if there's load or we're actually up-to-date.
6175 if (load || curr_jiffies == this_rq->last_load_update_tick)
6178 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6179 this_rq->last_load_update_tick = curr_jiffies;
6181 __update_cpu_load(this_rq, load, pending_updates);
6185 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
6187 void update_cpu_load_nohz(void)
6189 struct rq *this_rq = this_rq();
6190 unsigned long curr_jiffies = READ_ONCE(jiffies);
6191 unsigned long pending_updates;
6193 if (curr_jiffies == this_rq->last_load_update_tick)
6196 raw_spin_lock(&this_rq->lock);
6197 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6198 if (pending_updates) {
6199 this_rq->last_load_update_tick = curr_jiffies;
6201 * We were idle; this means load 0. The current load might be
6202 * !0 due to remote wakeups and the like.
6204 __update_cpu_load(this_rq, 0, pending_updates);
6206 raw_spin_unlock(&this_rq->lock);
6208 #endif /* CONFIG_NO_HZ */
6211 * Called from scheduler_tick()
6213 void update_cpu_load_active(struct rq *this_rq)
6215 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6217 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
6219 this_rq->last_load_update_tick = jiffies;
6220 __update_cpu_load(this_rq, load, 1);
6224 * Return a low guess at the load of a migration-source cpu weighted
6225 * according to the scheduling class and "nice" value.
6227 * We want to under-estimate the load of migration sources, to
6228 * balance conservatively.
6230 static unsigned long source_load(int cpu, int type)
6232 struct rq *rq = cpu_rq(cpu);
6233 unsigned long total = weighted_cpuload(cpu);
6235 if (type == 0 || !sched_feat(LB_BIAS))
6238 return min(rq->cpu_load[type-1], total);
6242 * Return a high guess at the load of a migration-target cpu weighted
6243 * according to the scheduling class and "nice" value.
6245 static unsigned long target_load(int cpu, int type)
6247 struct rq *rq = cpu_rq(cpu);
6248 unsigned long total = weighted_cpuload(cpu);
6250 if (type == 0 || !sched_feat(LB_BIAS))
6253 return max(rq->cpu_load[type-1], total);
6257 static unsigned long cpu_avg_load_per_task(int cpu)
6259 struct rq *rq = cpu_rq(cpu);
6260 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
6261 unsigned long load_avg = weighted_cpuload(cpu);
6264 return load_avg / nr_running;
6269 static void record_wakee(struct task_struct *p)
6272 * Rough decay (wiping) for cost saving; don't worry
6273 * about the boundary, a really active task won't care
6276 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6277 current->wakee_flips >>= 1;
6278 current->wakee_flip_decay_ts = jiffies;
6281 if (current->last_wakee != p) {
6282 current->last_wakee = p;
6283 current->wakee_flips++;
6287 static void task_waking_fair(struct task_struct *p)
6289 struct sched_entity *se = &p->se;
6290 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6293 #ifndef CONFIG_64BIT
6294 u64 min_vruntime_copy;
6297 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6299 min_vruntime = cfs_rq->min_vruntime;
6300 } while (min_vruntime != min_vruntime_copy);
6302 min_vruntime = cfs_rq->min_vruntime;
6305 se->vruntime -= min_vruntime;
6309 #ifdef CONFIG_FAIR_GROUP_SCHED
6311 * effective_load() calculates the load change as seen from the root_task_group
6313 * Adding load to a group doesn't make a group heavier, but can cause movement
6314 * of group shares between cpus. Assuming the shares were perfectly aligned one
6315 * can calculate the shift in shares.
6317 * Calculate the effective load difference if @wl is added (subtracted) to @tg
6318 * on this @cpu and results in a total addition (subtraction) of @wg to the
6319 * total group weight.
6321 * Given a runqueue weight distribution (rw_i) we can compute a shares
6322 * distribution (s_i) using:
6324 * s_i = rw_i / \Sum rw_j (1)
6326 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
6327 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
6328 * shares distribution (s_i):
6330 * rw_i = { 2, 4, 1, 0 }
6331 * s_i = { 2/7, 4/7, 1/7, 0 }
6333 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
6334 * task used to run on and the CPU the waker is running on), we need to
6335 * compute the effect of waking a task on either CPU and, in case of a sync
6336 * wakeup, compute the effect of the current task going to sleep.
6338 * So for a change of @wl to the local @cpu with an overall group weight change
6339 * of @wl we can compute the new shares distribution (s'_i) using:
6341 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
6343 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
6344 * differences in waking a task to CPU 0. The additional task changes the
6345 * weight and shares distributions like:
6347 * rw'_i = { 3, 4, 1, 0 }
6348 * s'_i = { 3/8, 4/8, 1/8, 0 }
6350 * We can then compute the difference in effective weight by using:
6352 * dw_i = S * (s'_i - s_i) (3)
6354 * Where 'S' is the group weight as seen by its parent.
6356 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
6357 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
6358 * 4/7) times the weight of the group.
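/*
 * Worked number (illustrative): taking the group weight S = 1024 in the
 * example above, waking the task on CPU 0 changes its effective load by
 * 1024 * 5/56, roughly +91, while CPU 1 sees 1024 * 4/56, roughly -73;
 * these are the kinds of shifts wake_affine() feeds into its load
 * comparison.
 */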
6360 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6362 struct sched_entity *se = tg->se[cpu];
6364 if (!tg->parent) /* the trivial, non-cgroup case */
6367 for_each_sched_entity(se) {
6368 struct cfs_rq *cfs_rq = se->my_q;
6369 long W, w = cfs_rq_load_avg(cfs_rq);
6374 * W = @wg + \Sum rw_j
6376 W = wg + atomic_long_read(&tg->load_avg);
6378 /* Ensure \Sum rw_j >= rw_i */
6379 W -= cfs_rq->tg_load_avg_contrib;
6388 * wl = S * s'_i; see (2)
6391 wl = (w * (long)tg->shares) / W;
6396 * Per the above, wl is the new se->load.weight value; since
6397 * those are clipped to [MIN_SHARES, ...) do so now. See
6398 * calc_cfs_shares().
6400 if (wl < MIN_SHARES)
6404 * wl = dw_i = S * (s'_i - s_i); see (3)
6406 wl -= se->avg.load_avg;
6409 * Recursively apply this logic to all parent groups to compute
6410 * the final effective load change on the root group. Since
6411 * only the @tg group gets extra weight, all parent groups can
6412 * only redistribute existing shares. @wl is the shift in shares
6413 * resulting from this level per the above.
6422 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6430 * Returns the current capacity of cpu after applying both
6431 * cpu and freq scaling.
6433 unsigned long capacity_curr_of(int cpu)
6435 return cpu_rq(cpu)->cpu_capacity_orig *
6436 arch_scale_freq_capacity(NULL, cpu)
6437 >> SCHED_CAPACITY_SHIFT;
6441 struct sched_group *sg_top;
6442 struct sched_group *sg_cap;
6450 struct task_struct *task;
6464 static int cpu_util_wake(int cpu, struct task_struct *p);
6467 * __cpu_norm_util() returns the cpu util relative to a specific capacity,
6468 * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
6469 * energy calculations.
6471 * Since util is a scale-invariant utilization defined as:
6473 * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
6475 * the normalized util can be found using the specific capacity.
6477 * capacity = capacity_orig * curr_freq/max_freq
6479 * norm_util = running_time/time ~ util/capacity
6481 static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
6483 if (util >= capacity)
6484 return SCHED_CAPACITY_SCALE;
6486 return (util << SCHED_CAPACITY_SHIFT)/capacity;
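/*
 * Worked example (illustrative): a cpu with util = 300 running at a
 * capacity of 600 yields (300 << 10) / 600 = 512, i.e. a 50% busy ratio on
 * the [0..SCHED_CAPACITY_SCALE] scale; anything at or above the capacity
 * saturates to SCHED_CAPACITY_SCALE (1024).
 */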
6489 static unsigned long group_max_util(struct energy_env *eenv)
6491 unsigned long max_util = 0;
6495 for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
6496 util = cpu_util_wake(cpu, eenv->task);
6499 * If we are looking at the target CPU specified by the eenv,
6500 * then we should add the (estimated) utilization of the task
6501 * assuming we will wake it up on that CPU.
6503 if (unlikely(cpu == eenv->trg_cpu))
6504 util += eenv->util_delta;
6506 max_util = max(max_util, util);
6513 * group_norm_util() returns the approximated group util relative to its
6514 * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
6515 * in energy calculations.
6517 * Since task executions may or may not overlap in time in the group the true
6518 * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
6519 * when iterating over all CPUs in the group.
6520 * The latter estimate is used as it leads to a more pessimistic energy
6521 * estimate (more busy).
6524 long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
6526 unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
6527 unsigned long util, util_sum = 0;
6530 for_each_cpu(cpu, sched_group_cpus(sg)) {
6531 util = cpu_util_wake(cpu, eenv->task);
6534 * If we are looking at the target CPU specified by the eenv,
6535 * then we should add the (estimated) utilization of the task
6536 * assuming we will wake it up on that CPU.
6538 if (unlikely(cpu == eenv->trg_cpu))
6539 util += eenv->util_delta;
6541 util_sum += __cpu_norm_util(util, capacity);
6544 return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
6547 static int find_new_capacity(struct energy_env *eenv,
6548 const struct sched_group_energy * const sge)
6550 int idx, max_idx = sge->nr_cap_states - 1;
6551 unsigned long util = group_max_util(eenv);
6553 /* default is max_cap if we don't find a match */
6554 eenv->cap_idx = max_idx;
6556 for (idx = 0; idx < sge->nr_cap_states; idx++) {
6557 if (sge->cap_states[idx].cap >= util) {
6558 eenv->cap_idx = idx;
6563 return eenv->cap_idx;
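/*
 * Illustrative example (hypothetical capacity table): with cap_states caps
 * of {150, 300, 600, 1024} and a group_max_util() of 350, the loop above
 * selects index 2 (600), the lowest capacity state that covers the
 * expected utilization; if the utilization exceeds even the largest state,
 * the default of max_idx is kept.
 */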
6566 static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
6568 int i, state = INT_MAX;
6569 int src_in_grp, dst_in_grp;
6572 /* Find the shallowest idle state in the sched group. */
6573 for_each_cpu(i, sched_group_cpus(sg))
6574 state = min(state, idle_get_state_idx(cpu_rq(i)));
6576 /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
6579 src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
6580 dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
6581 if (src_in_grp == dst_in_grp) {
6582 /* both CPUs under consideration are in the same group or in
6583 * neither group; migration should leave the idle state the same.
6589 * Try to estimate if a deeper idle state is
6590 * achievable when we move the task.
6592 for_each_cpu(i, sched_group_cpus(sg)) {
6593 grp_util += cpu_util_wake(i, eenv->task);
6594 if (unlikely(i == eenv->trg_cpu))
6595 grp_util += eenv->util_delta;
6599 ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
6600 /* after moving, this group is at most partly
6601 * occupied, so it should have some idle time.
6603 int max_idle_state_idx = sg->sge->nr_idle_states - 2;
6604 int new_state = grp_util * max_idle_state_idx;
6606 /* group will have no util, use lowest state */
6607 new_state = max_idle_state_idx + 1;
6609 /* for partially idle, linearly map util to idle
6610 * states, excluding the lowest one. This does not
6611 * correspond to the state we expect to enter in
6612 * reality, but it gives an indication of what might happen.
6614 new_state = min(max_idle_state_idx, (int)
6615 (new_state / sg->sgc->max_capacity));
6616 new_state = max_idle_state_idx - new_state;
6620 /* After moving, the group will be fully occupied
6621 * so assume it will not be idle at all.
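/*
 * Illustrative example of the idle-state estimate above (values invented):
 * with nr_idle_states = 4, max_idle_state_idx = 2, a single-CPU group with
 * max_capacity = 1024 and grp_util = 512 after the move:
 *   new_state = 512 * 2 = 1024;  1024 / 1024 = 1;  state = 2 - 1 = 1
 * i.e. a half-busy group maps to the middle idle state. grp_util = 0 maps
 * to the deepest state (max_idle_state_idx + 1 = 3), while a group busy
 * right up to its capacity maps to index 0, the shallowest state.
 */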
6630 * sched_group_energy(): Computes the absolute energy consumption of cpus
6631 * belonging to the sched_group including shared resources shared only by
6632 * members of the group. Iterates over all cpus in the hierarchy below the
6633 * sched_group starting from the bottom working its way up before going to
6634 * the next cpu until all cpus are covered at all levels. The current
6635 * implementation is likely to gather the same util statistics multiple times.
6636 * This can probably be done in a faster but more complex way.
6637 * Note: sched_group_energy() may fail when racing with sched_domain updates.
6639 static int sched_group_energy(struct energy_env *eenv)
6641 struct cpumask visit_cpus;
6642 u64 total_energy = 0;
6645 WARN_ON(!eenv->sg_top->sge);
6647 cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
6648 /* If a cpu is hotplugged in while we are in this function,
6649 * it does not appear in the existing visit_cpus mask
6650 * which came from the sched_group pointer of the
6651 * sched_domain pointed at by sd_ea for either the prev
6652 * or next cpu and was dereferenced in __energy_diff.
6653 * Since we will dereference sd_scs later as we iterate
6654 * through the CPUs we expect to visit, new CPUs can
6655 * be present which are not in the visit_cpus mask.
6656 * Guard this with cpu_count.
6658 cpu_count = cpumask_weight(&visit_cpus);
6660 while (!cpumask_empty(&visit_cpus)) {
6661 struct sched_group *sg_shared_cap = NULL;
6662 int cpu = cpumask_first(&visit_cpus);
6663 struct sched_domain *sd;
6666 * Is the group utilization affected by cpus outside this sched_group?
6668 * This sd may have groups with cpus which were not present
6669 * when we took visit_cpus.
6671 sd = rcu_dereference(per_cpu(sd_scs, cpu));
6673 if (sd && sd->parent)
6674 sg_shared_cap = sd->parent->groups;
6676 for_each_domain(cpu, sd) {
6677 struct sched_group *sg = sd->groups;
6679 /* Has this sched_domain already been visited? */
6680 if (sd->child && group_first_cpu(sg) != cpu)
6684 unsigned long group_util;
6685 int sg_busy_energy, sg_idle_energy;
6686 int cap_idx, idle_idx;
6688 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
6689 eenv->sg_cap = sg_shared_cap;
6693 cap_idx = find_new_capacity(eenv, sg->sge);
6695 if (sg->group_weight == 1) {
6696 /* Remove capacity of src CPU (before task move) */
6697 if (eenv->trg_cpu == eenv->src_cpu &&
6698 cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
6699 eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
6700 eenv->cap.delta -= eenv->cap.before;
6702 /* Add capacity of dst CPU (after task move) */
6703 if (eenv->trg_cpu == eenv->dst_cpu &&
6704 cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
6705 eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
6706 eenv->cap.delta += eenv->cap.after;
6710 idle_idx = group_idle_state(eenv, sg);
6711 group_util = group_norm_util(eenv, sg);
6713 sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
6714 sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
6715 * sg->sge->idle_states[idle_idx].power);
6717 total_energy += sg_busy_energy + sg_idle_energy;
6721 * cpu_count here is the number of
6722 * cpus we expect to visit in this
6723 * calculation. If we race against
6724 * hotplug, we can have extra cpus
6725 * added to the groups we are
6726 * iterating which do not appear in
6727 * the visit_cpus mask. In that case
6728 * we are not able to calculate energy
6729 * without restarting so we will bail
6730 * out and use prev_cpu this time.
6734 cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
6738 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
6741 } while (sg = sg->next, sg != sd->groups);
6745 * If we raced with hotplug and got an sd NULL-pointer;
6746 * returning a wrong energy estimation is better than
6747 * entering an infinite loop.
6748 * Specifically: If a cpu is unplugged after we took
6749 * the visit_cpus mask, it no longer has an sd_scs
6750 * pointer, so when we dereference it, we get NULL.
6752 if (cpumask_test_cpu(cpu, &visit_cpus))
6755 cpumask_clear_cpu(cpu, &visit_cpus);
6759 eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
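/*
 * Illustrative example of the accumulation above (power numbers invented):
 * for one group with group_norm_util() = 614, busy power = 400 and idle
 * power = 10:
 *   sg_busy_energy = 614 * 400          = 245600
 *   sg_idle_energy = (1024 - 614) * 10  =   4100
 * The >> SCHED_CAPACITY_SHIFT above scales the sum back down, so this group
 * contributes (245600 + 4100) / 1024 ~= 243 units to eenv->energy.
 */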
6763 static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
6765 return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
6768 static inline unsigned long task_util(struct task_struct *p);
6771 * energy_diff(): Estimate the energy impact of changing the utilization
6772 * distribution. eenv specifies the change: utilisation amount, source, and
6773 * destination cpu. Source or destination cpu may be -1 in which case the
6774 * utilization is removed from or added to the system (e.g. task wake-up). If
6775 * both are specified, the utilization is migrated.
6777 static inline int __energy_diff(struct energy_env *eenv)
6779 struct sched_domain *sd;
6780 struct sched_group *sg;
6781 int sd_cpu = -1, energy_before = 0, energy_after = 0;
6784 struct energy_env eenv_before = {
6785 .util_delta = task_util(eenv->task),
6786 .src_cpu = eenv->src_cpu,
6787 .dst_cpu = eenv->dst_cpu,
6788 .trg_cpu = eenv->src_cpu,
6789 .nrg = { 0, 0, 0, 0},
6794 if (eenv->src_cpu == eenv->dst_cpu)
6797 sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
6798 sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
6801 return 0; /* Error */
6806 if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
6807 eenv_before.sg_top = eenv->sg_top = sg;
6809 if (sched_group_energy(&eenv_before))
6810 return 0; /* Invalid result abort */
6811 energy_before += eenv_before.energy;
6813 /* Keep track of SRC cpu (before) capacity */
6814 eenv->cap.before = eenv_before.cap.before;
6815 eenv->cap.delta = eenv_before.cap.delta;
6817 if (sched_group_energy(eenv))
6818 return 0; /* Invalid result abort */
6819 energy_after += eenv->energy;
6821 } while (sg = sg->next, sg != sd->groups);
6823 eenv->nrg.before = energy_before;
6824 eenv->nrg.after = energy_after;
6825 eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
6827 #ifndef CONFIG_SCHED_TUNE
6828 trace_sched_energy_diff(eenv->task,
6829 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6830 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6831 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6832 eenv->nrg.delta, eenv->payoff);
6835 * Dead-zone margin preventing too many migrations.
6838 margin = eenv->nrg.before >> 6; /* ~1.56% */
6840 diff = eenv->nrg.after - eenv->nrg.before;
6842 eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
6844 return eenv->nrg.diff;
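/*
 * Illustrative example of the dead-zone above (numbers invented): with
 * nrg.before = 1000 the margin is 1000 >> 6 = 15. An estimated nrg.after
 * of 1008 gives |diff| = 8 < 15, so the difference is reported as 0 and
 * the ~0.8% saving is not considered worth a migration.
 */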
6847 #ifdef CONFIG_SCHED_TUNE
6849 struct target_nrg schedtune_target_nrg;
6851 #ifdef CONFIG_CGROUP_SCHEDTUNE
6852 extern bool schedtune_initialized;
6853 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6856 * System energy normalization
6857 * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
6858 * corresponding to the specified energy variation.
6861 normalize_energy(int energy_diff)
6865 #ifdef CONFIG_CGROUP_SCHEDTUNE
6866 /* during early setup, we don't know the extents */
6867 if (unlikely(!schedtune_initialized))
6868 return energy_diff < 0 ? -1 : 1 ;
6869 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6871 #ifdef CONFIG_SCHED_DEBUG
6875 /* Check for boundaries */
6876 max_delta = schedtune_target_nrg.max_power;
6877 max_delta -= schedtune_target_nrg.min_power;
6878 WARN_ON(abs(energy_diff) >= max_delta);
6882 /* Do scaling using positive numbers to increase the range */
6883 normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
6885 /* Scale by energy magnitude */
6886 normalized_nrg <<= SCHED_CAPACITY_SHIFT;
6888 /* Normalize on max energy for target platform */
6889 normalized_nrg = reciprocal_divide(
6890 normalized_nrg, schedtune_target_nrg.rdiv);
6892 return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
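/*
 * Illustrative example (power range invented): if the platform's
 * max_power - min_power span is 800, schedtune_target_nrg.rdiv encodes a
 * division by 800, so an energy_diff of 50 is normalized to
 * (50 << SCHED_CAPACITY_SHIFT) / 800 = 64, i.e. ~6% of the full
 * [0..SCHED_CAPACITY_SCALE] range; the sign is restored on return.
 */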
6896 energy_diff(struct energy_env *eenv)
6898 int boost = schedtune_task_boost(eenv->task);
6901 /* Compute "absolute" energy diff */
6902 __energy_diff(eenv);
6904 /* Return energy diff when boost margin is 0 */
6906 trace_sched_energy_diff(eenv->task,
6907 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6908 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6909 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6910 0, -eenv->nrg.diff);
6911 return eenv->nrg.diff;
6914 /* Compute normalized energy diff */
6915 nrg_delta = normalize_energy(eenv->nrg.diff);
6916 eenv->nrg.delta = nrg_delta;
6918 eenv->payoff = schedtune_accept_deltas(
6923 trace_sched_energy_diff(eenv->task,
6924 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6925 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6926 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6927 eenv->nrg.delta, eenv->payoff);
6930 * When SchedTune is enabled, the energy_diff() function will return
6931 * the computed energy payoff value. Since the energy_diff() return
6932 * value is expected to be negative by its callers, this evaluation
6933 * function returns a negative value each time the evaluation returns a
6934 * positive payoff, which is the condition for the acceptance of
6935 * a scheduling decision.
6937 return -eenv->payoff;
6939 #else /* CONFIG_SCHED_TUNE */
6940 #define energy_diff(eenv) __energy_diff(eenv)
6944 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
6945 * A waker of many should wake a different task than the one last awakened
6946 * at a frequency roughly N times higher than one of its wakees. In order
6947 * to determine whether we should let the load spread vs consolidating to
6948 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
6949 * partner, and a factor of llc_size higher frequency in the other. With
6950 * both conditions met, we can be relatively sure that the relationship is
6951 * non-monogamous, with partner count exceeding socket size. Waker/wakee
6952 * being client/server, worker/dispatcher, interrupt source or whatever is
6953 * irrelevant; the spread criterion is that apparent partner count exceeds socket size.
6955 static int wake_wide(struct task_struct *p, int sibling_count_hint)
6957 unsigned int master = current->wakee_flips;
6958 unsigned int slave = p->wakee_flips;
6959 int llc_size = this_cpu_read(sd_llc_size);
6961 if (sibling_count_hint >= llc_size)
6965 swap(master, slave);
6966 if (slave < llc_size || master < slave * llc_size)
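/*
 * Illustrative example (flip counts invented): with llc_size = 4, a waker
 * whose wakee_flips = 40 and a wakee whose wakee_flips = 6, neither
 * condition above holds (6 >= 4 and 40 >= 6 * 4), so wake_wide() reports
 * an M:N relationship and the wakeup is allowed to spread instead of being
 * pulled onto the waker's LLC.
 */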
6971 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6972 int prev_cpu, int sync)
6974 s64 this_load, load;
6975 s64 this_eff_load, prev_eff_load;
6977 struct task_group *tg;
6978 unsigned long weight;
6982 this_cpu = smp_processor_id();
6983 load = source_load(prev_cpu, idx);
6984 this_load = target_load(this_cpu, idx);
6987 * If sync wakeup then subtract the (maximum possible)
6988 * effect of the currently running task from the load
6989 * of the current CPU:
6992 tg = task_group(current);
6993 weight = current->se.avg.load_avg;
6995 this_load += effective_load(tg, this_cpu, -weight, -weight);
6996 load += effective_load(tg, prev_cpu, 0, -weight);
7000 weight = p->se.avg.load_avg;
7003 * In low-load situations, where prev_cpu is idle and this_cpu is idle
7004 * due to the sync cause above having dropped this_load to 0, we'll
7005 * always have an imbalance, but there's really nothing you can do
7006 * about that, so that's good too.
7008 * Otherwise check if either cpus are near enough in load to allow this
7009 * task to be woken on this_cpu.
7011 this_eff_load = 100;
7012 this_eff_load *= capacity_of(prev_cpu);
7014 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
7015 prev_eff_load *= capacity_of(this_cpu);
7017 if (this_load > 0) {
7018 this_eff_load *= this_load +
7019 effective_load(tg, this_cpu, weight, weight);
7021 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
7024 balanced = this_eff_load <= prev_eff_load;
7026 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
7031 schedstat_inc(sd, ttwu_move_affine);
7032 schedstat_inc(p, se.statistics.nr_wakeups_affine);
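/*
 * Illustrative example of the comparison above (numbers invented; assuming
 * the task runs in the root group, so effective_load() reduces to the raw
 * weight delta): with imbalance_pct = 125, equal CPU capacities (which
 * cancel out), this_load = 200 after the sync adjustment, prev_cpu load =
 * 300 and task weight = 100:
 *   this_eff_load = 100 * (200 + 100) = 30000
 *   prev_eff_load = 112 * (300 + 0)   = 33600
 * so the wakeup is considered balanced and the affine wakeup is allowed.
 */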
7037 static inline unsigned long task_util(struct task_struct *p)
7039 return p->se.avg.util_avg;
7042 static inline unsigned long boosted_task_util(struct task_struct *task);
7044 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
7046 unsigned long capacity = capacity_of(cpu);
7048 util += boosted_task_util(p);
7050 return (capacity * 1024) > (util * capacity_margin);
7053 static inline bool task_fits_max(struct task_struct *p, int cpu)
7055 unsigned long capacity = capacity_of(cpu);
7056 unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7058 if (capacity == max_capacity)
7061 if (capacity * capacity_margin > max_capacity * 1024)
7064 return __task_fits(p, cpu, 0);
7067 static bool __cpu_overutilized(int cpu, int delta)
7069 return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
7072 static bool cpu_overutilized(int cpu)
7074 return __cpu_overutilized(cpu, 0);
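/*
 * Illustrative example (assuming the default capacity_margin of 1280, i.e.
 * roughly 20% headroom is required): a CPU with capacity_of() = 430 and
 * cpu_util() + delta = 350 is overutilized since
 * 430 * 1024 = 440320 < 350 * 1280 = 448000, while a util of 340 would
 * still fit (340 * 1280 = 435200).
 */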
7077 #ifdef CONFIG_SCHED_TUNE
7079 struct reciprocal_value schedtune_spc_rdiv;
7082 schedtune_margin(unsigned long signal, long boost)
7084 long long margin = 0;
7087 * Signal proportional compensation (SPC)
7089 * The Boost (B) value is used to compute a Margin (M) which is
7090 * proportional to the complement of the original Signal (S):
7091 * M = B * (SCHED_CAPACITY_SCALE - S)
7092 * The obtained M could be used by the caller to "boost" S.
7095 margin = SCHED_CAPACITY_SCALE - signal;
7098 margin = -signal * boost;
7100 margin = reciprocal_divide(margin, schedtune_spc_rdiv);
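/*
 * Illustrative worked example (assuming schedtune_spc_rdiv encodes a
 * division by 100, i.e. boost is a percentage): for signal = 200 and
 * boost = 10, margin = (1024 - 200) * 10 / 100 = 82, so the boosted
 * signal becomes 200 + 82 = 282. For a negative boost the margin is
 * instead proportional to the signal itself and is applied with a
 * negative sign, e.g. boost = -10 on the same signal gives 200 - 20 = 180.
 */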
7108 schedtune_cpu_margin(unsigned long util, int cpu)
7110 int boost = schedtune_cpu_boost(cpu);
7115 return schedtune_margin(util, boost);
7119 schedtune_task_margin(struct task_struct *task)
7121 int boost = schedtune_task_boost(task);
7128 util = task_util(task);
7129 margin = schedtune_margin(util, boost);
7134 #else /* CONFIG_SCHED_TUNE */
7137 schedtune_cpu_margin(unsigned long util, int cpu)
7143 schedtune_task_margin(struct task_struct *task)
7148 #endif /* CONFIG_SCHED_TUNE */
7151 boosted_cpu_util(int cpu)
7153 unsigned long util = cpu_util_freq(cpu);
7154 long margin = schedtune_cpu_margin(util, cpu);
7156 trace_sched_boost_cpu(cpu, util, margin);
7158 return util + margin;
7161 static inline unsigned long
7162 boosted_task_util(struct task_struct *task)
7164 unsigned long util = task_util(task);
7165 long margin = schedtune_task_margin(task);
7167 trace_sched_boost_task(task, util, margin);
7169 return util + margin;
7172 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
7174 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
7178 * find_idlest_group finds and returns the least busy CPU group within the
7181 * Assumes p is allowed on at least one CPU in sd.
7183 static struct sched_group *
7184 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
7185 int this_cpu, int sd_flag)
7187 struct sched_group *idlest = NULL, *group = sd->groups;
7188 struct sched_group *most_spare_sg = NULL;
7189 unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
7190 unsigned long most_spare = 0, this_spare = 0;
7191 int load_idx = sd->forkexec_idx;
7192 int imbalance = 100 + (sd->imbalance_pct-100)/2;
7194 if (sd_flag & SD_BALANCE_WAKE)
7195 load_idx = sd->wake_idx;
7198 unsigned long load, avg_load, spare_cap, max_spare_cap;
7202 /* Skip over this group if it has no CPUs allowed */
7203 if (!cpumask_intersects(sched_group_cpus(group),
7204 tsk_cpus_allowed(p)))
7207 local_group = cpumask_test_cpu(this_cpu,
7208 sched_group_cpus(group));
7211 * Tally up the load of all CPUs in the group and find
7212 * the group containing the CPU with most spare capacity.
7217 for_each_cpu(i, sched_group_cpus(group)) {
7218 /* Bias balancing toward cpus of our domain */
7220 load = source_load(i, load_idx);
7222 load = target_load(i, load_idx);
7226 spare_cap = capacity_spare_wake(i, p);
7228 if (spare_cap > max_spare_cap)
7229 max_spare_cap = spare_cap;
7232 /* Adjust by relative CPU capacity of the group */
7233 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
7236 this_load = avg_load;
7237 this_spare = max_spare_cap;
7239 if (avg_load < min_load) {
7240 min_load = avg_load;
7244 if (most_spare < max_spare_cap) {
7245 most_spare = max_spare_cap;
7246 most_spare_sg = group;
7249 } while (group = group->next, group != sd->groups);
7252 * The cross-over point between using spare capacity or least load
7253 * is too conservative for high utilization tasks on partially
7254 * utilized systems if we require spare_capacity > task_util(p),
7255 * so we allow for some task stuffing by using
7256 * spare_capacity > task_util(p)/2.
7258 * Spare capacity can't be used for fork because the utilization has
7259 * not been set yet, we must first select a rq to compute the initial
7262 if (sd_flag & SD_BALANCE_FORK)
7265 if (this_spare > task_util(p) / 2 &&
7266 imbalance*this_spare > 100*most_spare)
7268 else if (most_spare > task_util(p) / 2)
7269 return most_spare_sg;
7272 if (!idlest || 100*this_load < imbalance*min_load)
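/*
 * Illustrative example of the spare-capacity cross-over above (numbers
 * invented): for task_util(p) = 200, imbalance = 112, this_spare = 150 and
 * most_spare = 120, the local group wins because 150 > 200/2 and
 * 112 * 150 = 16800 > 100 * 120 = 12000, so NULL is returned and the task
 * stays local. With this_spare = 80 instead, most_spare_sg would be
 * returned since most_spare = 120 > 200/2.
 */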
7278 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
7281 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7283 unsigned long load, min_load = ULONG_MAX;
7284 unsigned int min_exit_latency = UINT_MAX;
7285 u64 latest_idle_timestamp = 0;
7286 int least_loaded_cpu = this_cpu;
7287 int shallowest_idle_cpu = -1;
7290 /* Check if we have any choice: */
7291 if (group->group_weight == 1)
7292 return cpumask_first(sched_group_cpus(group));
7294 /* Traverse only the allowed CPUs */
7295 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
7297 struct rq *rq = cpu_rq(i);
7298 struct cpuidle_state *idle = idle_get_state(rq);
7299 if (idle && idle->exit_latency < min_exit_latency) {
7301 * We give priority to a CPU whose idle state
7302 * has the smallest exit latency irrespective
7303 * of any idle timestamp.
7305 min_exit_latency = idle->exit_latency;
7306 latest_idle_timestamp = rq->idle_stamp;
7307 shallowest_idle_cpu = i;
7308 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7309 rq->idle_stamp > latest_idle_timestamp) {
7311 * If equal or no active idle state, then
7312 * the most recently idled CPU might have
7315 latest_idle_timestamp = rq->idle_stamp;
7316 shallowest_idle_cpu = i;
7318 } else if (shallowest_idle_cpu == -1) {
7319 load = weighted_cpuload(i);
7320 if (load < min_load || (load == min_load && i == this_cpu)) {
7322 least_loaded_cpu = i;
7327 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7330 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
7331 int cpu, int prev_cpu, int sd_flag)
7334 int wu = sd_flag & SD_BALANCE_WAKE;
7338 schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
7339 schedstat_inc(this_rq(), eas_stats.cas_attempts);
7342 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
7346 struct sched_group *group;
7347 struct sched_domain *tmp;
7351 schedstat_inc(sd, eas_stats.cas_attempts);
7353 if (!(sd->flags & sd_flag)) {
7358 group = find_idlest_group(sd, p, cpu, sd_flag);
7364 new_cpu = find_idlest_group_cpu(group, p, cpu);
7365 if (new_cpu == cpu) {
7366 /* Now try balancing at a lower domain level of cpu */
7371 /* Now try balancing at a lower domain level of new_cpu */
7372 cpu = cas_cpu = new_cpu;
7373 weight = sd->span_weight;
7375 for_each_domain(cpu, tmp) {
7376 if (weight <= tmp->span_weight)
7378 if (tmp->flags & sd_flag)
7381 /* while loop will break here if sd == NULL */
7384 if (wu && (cas_cpu >= 0)) {
7385 schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
7386 schedstat_inc(this_rq(), eas_stats.cas_count);
7393 * Try and locate an idle CPU in the sched_domain.
7395 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7397 struct sched_domain *sd;
7398 struct sched_group *sg;
7399 int best_idle_cpu = -1;
7400 int best_idle_cstate = INT_MAX;
7401 unsigned long best_idle_capacity = ULONG_MAX;
7403 schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
7404 schedstat_inc(this_rq(), eas_stats.sis_attempts);
7406 if (!sysctl_sched_cstate_aware) {
7407 if (idle_cpu(target)) {
7408 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
7409 schedstat_inc(this_rq(), eas_stats.sis_idle);
7414 * If the previous cpu is cache affine and idle, don't be stupid.
7416 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
7417 schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
7418 schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
7423 if (!(current->flags & PF_WAKE_UP_IDLE) &&
7424 !(p->flags & PF_WAKE_UP_IDLE))
7428 * Otherwise, iterate the domains and find an eligible idle cpu.
7430 sd = rcu_dereference(per_cpu(sd_llc, target));
7431 for_each_lower_domain(sd) {
7435 if (!cpumask_intersects(sched_group_cpus(sg),
7436 tsk_cpus_allowed(p)))
7439 if (sysctl_sched_cstate_aware) {
7440 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7441 int idle_idx = idle_get_state_idx(cpu_rq(i));
7442 unsigned long new_usage = boosted_task_util(p);
7443 unsigned long capacity_orig = capacity_orig_of(i);
7445 if (new_usage > capacity_orig || !idle_cpu(i))
7448 if (i == target && new_usage <= capacity_curr_of(target)) {
7449 schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
7450 schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
7451 schedstat_inc(sd, eas_stats.sis_suff_cap);
7455 if (idle_idx < best_idle_cstate &&
7456 capacity_orig <= best_idle_capacity) {
7458 best_idle_cstate = idle_idx;
7459 best_idle_capacity = capacity_orig;
7463 for_each_cpu(i, sched_group_cpus(sg)) {
7464 if (i == target || !idle_cpu(i))
7468 target = cpumask_first_and(sched_group_cpus(sg),
7469 tsk_cpus_allowed(p));
7470 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
7471 schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
7472 schedstat_inc(sd, eas_stats.sis_idle_cpu);
7477 } while (sg != sd->groups);
7480 if (best_idle_cpu >= 0)
7481 target = best_idle_cpu;
7484 schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
7485 schedstat_inc(this_rq(), eas_stats.sis_count);
7491 * cpu_util_wake: Compute cpu utilization with any contributions from
7492 * the waking task p removed. check_for_migration() looks for a better CPU of
7493 * rq->curr. For that case we should return cpu util with contributions from
7494 * currently running task p removed.
7496 static int cpu_util_wake(int cpu, struct task_struct *p)
7498 unsigned long util, capacity;
7500 #ifdef CONFIG_SCHED_WALT
7502 * WALT does not decay idle tasks in the same manner
7503 * as PELT, so it makes little sense to subtract task
7504 * utilization from cpu utilization. Instead just use
7505 * cpu_util for this case.
7507 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7508 p->state == TASK_WAKING)
7509 return cpu_util(cpu);
7511 /* Task has no contribution or is new */
7512 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
7513 return cpu_util(cpu);
7515 capacity = capacity_orig_of(cpu);
7516 util = max_t(long, cpu_util(cpu) - task_util(p), 0);
7518 return (util >= capacity) ? capacity : util;
7521 static int start_cpu(bool boosted)
7523 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7525 return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
7528 static inline int find_best_target(struct task_struct *p, int *backup_cpu,
7529 bool boosted, bool prefer_idle)
7531 unsigned long best_idle_min_cap_orig = ULONG_MAX;
7532 unsigned long min_util = boosted_task_util(p);
7533 unsigned long target_capacity = ULONG_MAX;
7534 unsigned long min_wake_util = ULONG_MAX;
7535 unsigned long target_max_spare_cap = 0;
7536 unsigned long best_active_util = ULONG_MAX;
7537 int best_idle_cstate = INT_MAX;
7538 struct sched_domain *sd;
7539 struct sched_group *sg;
7540 int best_active_cpu = -1;
7541 int best_idle_cpu = -1;
7542 int target_cpu = -1;
7547 schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
7548 schedstat_inc(this_rq(), eas_stats.fbt_attempts);
7550 /* Find start CPU based on boost value */
7551 cpu = start_cpu(boosted);
7553 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
7554 schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
7558 /* Find SD for the start CPU */
7559 sd = rcu_dereference(per_cpu(sd_ea, cpu));
7561 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
7562 schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
7566 /* Scan CPUs in all SDs */
7569 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7570 unsigned long capacity_curr = capacity_curr_of(i);
7571 unsigned long capacity_orig = capacity_orig_of(i);
7572 unsigned long wake_util, new_util;
7577 if (walt_cpu_high_irqload(i))
7581 * p's blocked utilization is still accounted for on prev_cpu
7582 * so prev_cpu will receive a negative bias due to the double
7583 * accounting. However, the blocked utilization may be zero.
7585 wake_util = cpu_util_wake(i, p);
7586 new_util = wake_util + task_util(p);
7589 * Ensure minimum capacity to grant the required boost.
7590 * The target CPU can be already at a capacity level higher
7591 * than the one required to boost the task.
7593 new_util = max(min_util, new_util);
7594 if (new_util > capacity_orig)
7598 * Case A) Latency sensitive tasks
7600 * Unconditionally favoring tasks that prefer an idle CPU, to improve latency. Looking for:
7604 * - an idle CPU, whatever its idle_state is, since
7605 * the first CPUs we explore are more likely to be
7606 * reserved for latency sensitive tasks.
7607 * - a non idle CPU where the task fits in its current
7608 * capacity and has the maximum spare capacity.
7609 * - a non idle CPU with lower contention from other
7610 * tasks and running at the lowest possible OPP.
7612 * The last two goals try to favor a non idle CPU
7613 * where the task can run as if it is "almost alone".
7614 * A maximum spare capacity CPU is favoured since
7615 * the task already fits into that CPU's capacity
7616 * without waiting for an OPP chance.
7618 * The following code path is the only one in the CPUs
7619 * exploration loop which is always used by
7620 * prefer_idle tasks. It exits the loop with either a
7621 * best_active_cpu or a target_cpu which should
7622 * represent an optimal choice for latency sensitive
7628 * Case A.1: IDLE CPU
7629 * Return the first IDLE CPU we find.
7632 schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
7633 schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
7635 trace_sched_find_best_target(p,
7636 prefer_idle, min_util,
7638 best_active_cpu, i);
7644 * Case A.2: Target ACTIVE CPU
7645 * Favor CPUs with max spare capacity.
7647 if ((capacity_curr > new_util) &&
7648 (capacity_orig - new_util > target_max_spare_cap)) {
7649 target_max_spare_cap = capacity_orig - new_util;
7653 if (target_cpu != -1)
7658 * Case A.3: Backup ACTIVE CPU
7660 * - lower utilization due to other tasks
7661 * - lower utilization with the task in
7663 if (wake_util > min_wake_util)
7665 if (new_util > best_active_util)
7667 min_wake_util = wake_util;
7668 best_active_util = new_util;
7669 best_active_cpu = i;
7676 * For non latency sensitive tasks, skip CPUs that
7677 * will be overutilized by moving the task there.
7679 * The goal here is to remain in EAS mode as long as
7680 * possible at least for !prefer_idle tasks.
7682 if ((new_util * capacity_margin) >
7683 (capacity_orig * SCHED_CAPACITY_SCALE))
7687 * Case B) Non latency sensitive tasks on IDLE CPUs.
7689 * Find an optimal backup IDLE CPU for non latency sensitive tasks by:
7693 * - minimizing the capacity_orig,
7694 * i.e. preferring LITTLE CPUs
7695 * - favoring shallowest idle states
7696 * i.e. avoid to wakeup deep-idle CPUs
7698 * The following code path is used by non latency
7699 * sensitive tasks if IDLE CPUs are available. If at
7700 * least one such CPU is available, it sets best_idle_cpu to the most
7701 * suitable idle CPU to be used when no suitable ACTIVE CPU is found.
7704 * If idle CPUs are available, favour these CPUs to
7705 * improve performance by spreading tasks.
7706 * Indeed, the energy_diff() computed by the caller
7707 * will take care to ensure the minimization of energy
7708 * consumptions without affecting performance.
7711 int idle_idx = idle_get_state_idx(cpu_rq(i));
7713 /* Select idle CPU with lower cap_orig */
7714 if (capacity_orig > best_idle_min_cap_orig)
7718 * Skip CPUs in deeper idle state, but only
7719 * if they are also less energy efficient.
7720 * IOW, prefer a deep IDLE LITTLE CPU vs a
7721 * shallow idle big CPU.
7723 if (sysctl_sched_cstate_aware &&
7724 best_idle_cstate <= idle_idx)
7727 /* Keep track of best idle CPU */
7728 best_idle_min_cap_orig = capacity_orig;
7729 best_idle_cstate = idle_idx;
7735 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
7737 * Pack tasks in the most energy efficient capacities.
7739 * This task packing strategy prefers more energy
7740 * efficient CPUs (i.e. pack on smaller maximum
7741 * capacity CPUs) while also trying to spread tasks to
7742 * run them all at the lower OPP.
7744 * This assumes for example that it's more energy
7745 * efficient to run two tasks on two CPUs at a lower
7746 * OPP than packing both on a single CPU but running
7747 * that CPU at a higher OPP.
7749 * Thus, this case keeps track of the CPU with the
7750 * smallest maximum capacity and highest spare maximum capacity.
7754 /* Favor CPUs with smaller capacity */
7755 if (capacity_orig > target_capacity)
7758 /* Favor CPUs with maximum spare capacity */
7759 if ((capacity_orig - new_util) < target_max_spare_cap)
7762 target_max_spare_cap = capacity_orig - new_util;
7763 target_capacity = capacity_orig;
7767 } while (sg = sg->next, sg != sd->groups);
7770 * For non latency sensitive tasks, cases B and C in the previous loop,
7771 * we pick the best IDLE CPU only if we were not able to find a target ACTIVE CPU.
7774 * Policy priorities:
7776 * - prefer_idle tasks:
7778 * a) IDLE CPU available, we return immediately
7779 * b) ACTIVE CPU where task fits and has the bigger maximum spare
7780 * capacity (i.e. target_cpu)
7781 * c) ACTIVE CPU with less contention due to other tasks
7782 * (i.e. best_active_cpu)
7784 * - NON prefer_idle tasks:
7786 * a) ACTIVE CPU: target_cpu
7787 * b) IDLE CPU: best_idle_cpu
7789 if (target_cpu == -1)
7790 target_cpu = prefer_idle
7794 *backup_cpu = prefer_idle
7798 trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
7799 best_idle_cpu, best_active_cpu,
7802 schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
7803 schedstat_inc(this_rq(), eas_stats.fbt_count);
7809 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
7810 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
7812 * In that case WAKE_AFFINE doesn't make sense and we'll let
7813 * BALANCE_WAKE sort things out.
7815 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
7817 long min_cap, max_cap;
7819 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
7820 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7822 /* Minimum capacity is close to max, no need to abort wake_affine */
7823 if (max_cap - min_cap < max_cap >> 3)
7826 /* Bring task utilization in sync with prev_cpu */
7827 sync_entity_load_avg(&p->se);
7829 return min_cap * 1024 < task_util(p) * capacity_margin;
7832 static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
7834 struct sched_domain *sd;
7835 int target_cpu = prev_cpu, tmp_target, tmp_backup;
7836 bool boosted, prefer_idle;
7838 schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
7839 schedstat_inc(this_rq(), eas_stats.secb_attempts);
7841 if (sysctl_sched_sync_hint_enable && sync) {
7842 int cpu = smp_processor_id();
7844 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
7845 schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
7846 schedstat_inc(this_rq(), eas_stats.secb_sync);
7852 #ifdef CONFIG_CGROUP_SCHEDTUNE
7853 boosted = schedtune_task_boost(p) > 0;
7854 prefer_idle = schedtune_prefer_idle(p) > 0;
7856 boosted = get_sysctl_sched_cfs_boost() > 0;
7860 sync_entity_load_avg(&p->se);
7862 sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
7863 /* Find a cpu with sufficient capacity */
7864 tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
7868 if (tmp_target >= 0) {
7869 target_cpu = tmp_target;
7870 if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
7871 schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
7872 schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
7877 if (target_cpu != prev_cpu) {
7879 struct energy_env eenv = {
7880 .util_delta = task_util(p),
7881 .src_cpu = prev_cpu,
7882 .dst_cpu = target_cpu,
7884 .trg_cpu = target_cpu,
7888 #ifdef CONFIG_SCHED_WALT
7889 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7890 p->state == TASK_WAKING)
7891 delta = task_util(p);
7893 /* Not enough spare capacity on previous cpu */
7894 if (__cpu_overutilized(prev_cpu, delta)) {
7895 schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
7896 schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
7900 if (energy_diff(&eenv) >= 0) {
7901 /* No energy saving for target_cpu, try backup */
7902 target_cpu = tmp_backup;
7903 eenv.dst_cpu = target_cpu;
7904 eenv.trg_cpu = target_cpu;
7905 if (tmp_backup < 0 ||
7906 tmp_backup == prev_cpu ||
7907 energy_diff(&eenv) >= 0) {
7908 schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
7909 schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
7910 target_cpu = prev_cpu;
7915 schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
7916 schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
7920 schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
7921 schedstat_inc(this_rq(), eas_stats.secb_count);
7930 * select_task_rq_fair: Select target runqueue for the waking task in domains
7931 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
7932 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
7934 * Balances load by selecting the idlest cpu in the idlest group, or under
7935 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
7937 * Returns the target cpu number.
7939 * preempt must be disabled.
7942 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7943 int sibling_count_hint)
7945 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
7946 int cpu = smp_processor_id();
7947 int new_cpu = prev_cpu;
7948 int want_affine = 0;
7949 int sync = wake_flags & WF_SYNC;
7951 #ifdef CONFIG_SCHED_HMP
7952 return select_best_cpu(p, prev_cpu, 0, sync);
7955 if (sd_flag & SD_BALANCE_WAKE) {
7957 want_affine = !wake_wide(p, sibling_count_hint) &&
7958 !wake_cap(p, cpu, prev_cpu) &&
7959 cpumask_test_cpu(cpu, &p->cpus_allowed);
7962 if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
7963 return select_energy_cpu_brute(p, prev_cpu, sync);
7966 for_each_domain(cpu, tmp) {
7967 if (!(tmp->flags & SD_LOAD_BALANCE))
7971 * If both cpu and prev_cpu are part of this domain,
7972 * cpu is a valid SD_WAKE_AFFINE target.
7974 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
7975 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
7980 if (tmp->flags & sd_flag)
7982 else if (!want_affine)
7987 sd = NULL; /* Prefer wake_affine over balance flags */
7988 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
7992 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
7994 * We're going to need the task's util for capacity_spare_wake
7995 * in find_idlest_group. Sync it up to prev_cpu's last_update_time.
7998 sync_entity_load_avg(&p->se);
8002 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
8003 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8006 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
8014 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
8015 * cfs_rq_of(p) references at time of call are still valid and identify the
8016 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
8017 * other assumptions, including the state of rq->lock, should be made.
8019 static void migrate_task_rq_fair(struct task_struct *p)
8022 * We are supposed to update the task to "current" time, so that it's up to
8023 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
8024 * getting what the current time is, so simply throw away the out-of-date
8025 * time. This will result in the wakee task being less decayed, but giving
8026 * the wakee more load sounds not bad.
8028 remove_entity_load_avg(&p->se);
8030 /* Tell new CPU we are migrated */
8031 p->se.avg.last_update_time = 0;
8033 /* We have migrated, no longer consider this task hot */
8034 p->se.exec_start = 0;
8037 static void task_dead_fair(struct task_struct *p)
8039 remove_entity_load_avg(&p->se);
8042 #define task_fits_max(p, cpu) true
8043 #endif /* CONFIG_SMP */
8045 static unsigned long
8046 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
8048 unsigned long gran = sysctl_sched_wakeup_granularity;
8051 * Since it's curr that is running now, convert the gran from real-time
8052 * to virtual-time in its units.
8054 * By using 'se' instead of 'curr' we penalize light tasks, so
8055 * they get preempted easier. That is, if 'se' < 'curr' then
8056 * the resulting gran will be larger, therefore penalizing the
8057 * lighter, if otoh 'se' > 'curr' then the resulting gran will
8058 * be smaller, again penalizing the lighter task.
8060 * This is especially important for buddies when the leftmost
8061 * task is higher priority than the buddy.
8063 return calc_delta_fair(gran, se);
8067 * Should 'se' preempt 'curr'.
8081 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
8083 s64 gran, vdiff = curr->vruntime - se->vruntime;
8088 gran = wakeup_gran(curr, se);
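/*
 * Illustrative example (numbers invented): if the woken entity's vruntime
 * trails curr's by vdiff = 3,000,000 ns and wakeup_gran() converts the
 * 1 ms wakeup granularity into a virtual gran of 1,000,000 ns for this
 * entity, then vdiff > gran and wakeup_preempt_entity() returns 1, i.e.
 * the wakeup should preempt curr. A vdiff of 500,000 ns would return 0.
 */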
8095 static void set_last_buddy(struct sched_entity *se)
8097 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8100 for_each_sched_entity(se)
8101 cfs_rq_of(se)->last = se;
8104 static void set_next_buddy(struct sched_entity *se)
8106 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8109 for_each_sched_entity(se)
8110 cfs_rq_of(se)->next = se;
8113 static void set_skip_buddy(struct sched_entity *se)
8115 for_each_sched_entity(se)
8116 cfs_rq_of(se)->skip = se;
8120 * Preempt the current task with a newly woken task if needed:
8122 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
8124 struct task_struct *curr = rq->curr;
8125 struct sched_entity *se = &curr->se, *pse = &p->se;
8126 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8127 int scale = cfs_rq->nr_running >= sched_nr_latency;
8128 int next_buddy_marked = 0;
8130 if (unlikely(se == pse))
8134 * This is possible from callers such as attach_tasks(), in which we
8135 * unconditionally check_preempt_curr() after an enqueue (which may have
8136 * led to a throttle). This both saves work and prevents false
8137 * next-buddy nomination below.
8139 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8142 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
8143 set_next_buddy(pse);
8144 next_buddy_marked = 1;
8148 * We can come here with TIF_NEED_RESCHED already set from new task
8151 * Note: this also catches the edge-case of curr being in a throttled
8152 * group (e.g. via set_curr_task), since update_curr() (in the
8153 * enqueue of curr) will have resulted in resched being set. This
8154 * prevents us from potentially nominating it as a false LAST_BUDDY
8157 if (test_tsk_need_resched(curr))
8160 /* Idle tasks are by definition preempted by non-idle tasks. */
8161 if (unlikely(curr->policy == SCHED_IDLE) &&
8162 likely(p->policy != SCHED_IDLE))
8166 * Batch and idle tasks do not preempt non-idle tasks (their preemption
8167 * is driven by the tick):
8169 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8172 find_matching_se(&se, &pse);
8173 update_curr(cfs_rq_of(se));
8175 if (wakeup_preempt_entity(se, pse) == 1) {
8177 * Bias pick_next to pick the sched entity that is
8178 * triggering this preemption.
8180 if (!next_buddy_marked)
8181 set_next_buddy(pse);
8190 * Only set the backward buddy when the current task is still
8191 * on the rq. This can happen when a wakeup gets interleaved
8192 * with schedule on the ->pre_schedule() or idle_balance()
8193 * point, either of which can drop the rq lock.
8195 * Also, during early boot the idle thread is in the fair class,
8196 * for obvious reasons it's a bad idea to schedule back to it.
8198 if (unlikely(!se->on_rq || curr == rq->idle))
8201 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
8205 static struct task_struct *
8206 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
8208 struct cfs_rq *cfs_rq = &rq->cfs;
8209 struct sched_entity *se;
8210 struct task_struct *p;
8214 #ifdef CONFIG_FAIR_GROUP_SCHED
8215 if (!cfs_rq->nr_running)
8218 if (prev->sched_class != &fair_sched_class)
8222 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8223 * likely that a next task is from the same cgroup as the current.
8225 * Therefore attempt to avoid putting and setting the entire cgroup
8226 * hierarchy, only change the part that actually changes.
8230 struct sched_entity *curr = cfs_rq->curr;
8233 * Since we got here without doing put_prev_entity() we also
8234 * have to consider cfs_rq->curr. If it is still a runnable
8235 * entity, update_curr() will update its vruntime, otherwise
8236 * forget we've ever seen it.
8240 update_curr(cfs_rq);
8245 * This call to check_cfs_rq_runtime() will do the
8246 * throttle and dequeue its entity in the parent(s).
8247 * Therefore the 'simple' nr_running test will indeed
8250 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8254 se = pick_next_entity(cfs_rq, curr);
8255 cfs_rq = group_cfs_rq(se);
8261 * Since we haven't yet done put_prev_entity and if the selected task
8262 * is a different task than we started out with, try and touch the
8263 * least amount of cfs_rqs.
8266 struct sched_entity *pse = &prev->se;
8268 while (!(cfs_rq = is_same_group(se, pse))) {
8269 int se_depth = se->depth;
8270 int pse_depth = pse->depth;
8272 if (se_depth <= pse_depth) {
8273 put_prev_entity(cfs_rq_of(pse), pse);
8274 pse = parent_entity(pse);
8276 if (se_depth >= pse_depth) {
8277 set_next_entity(cfs_rq_of(se), se);
8278 se = parent_entity(se);
8282 put_prev_entity(cfs_rq, pse);
8283 set_next_entity(cfs_rq, se);
8286 if (hrtick_enabled(rq))
8287 hrtick_start_fair(rq, p);
8289 rq->misfit_task = !task_fits_max(p, rq->cpu);
8296 if (!cfs_rq->nr_running)
8299 put_prev_task(rq, prev);
8302 se = pick_next_entity(cfs_rq, NULL);
8303 set_next_entity(cfs_rq, se);
8304 cfs_rq = group_cfs_rq(se);
8309 if (hrtick_enabled(rq))
8310 hrtick_start_fair(rq, p);
8312 rq->misfit_task = !task_fits_max(p, rq->cpu);
8317 rq->misfit_task = 0;
8319 * This is OK, because current is on_cpu, which avoids it being picked
8320 * for load-balance and preemption/IRQs are still disabled avoiding
8321 * further scheduler activity on it and we're being very careful to
8322 * re-start the picking loop.
8324 lockdep_unpin_lock(&rq->lock);
8325 new_tasks = idle_balance(rq);
8326 lockdep_pin_lock(&rq->lock);
8328 * Because idle_balance() releases (and re-acquires) rq->lock, it is
8329 * possible for any higher priority task to appear. In that case we
8330 * must re-start the pick_next_entity() loop.
8342 * Account for a descheduled task:
8344 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8346 struct sched_entity *se = &prev->se;
8347 struct cfs_rq *cfs_rq;
8349 for_each_sched_entity(se) {
8350 cfs_rq = cfs_rq_of(se);
8351 put_prev_entity(cfs_rq, se);
8356 * sched_yield() is very simple
8358 * The magic of dealing with the ->skip buddy is in pick_next_entity.
8360 static void yield_task_fair(struct rq *rq)
8362 struct task_struct *curr = rq->curr;
8363 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8364 struct sched_entity *se = &curr->se;
8367 * Are we the only task in the tree?
8369 if (unlikely(rq->nr_running == 1))
8372 clear_buddies(cfs_rq, se);
8374 if (curr->policy != SCHED_BATCH) {
8375 update_rq_clock(rq);
8377 * Update run-time statistics of the 'current'.
8379 update_curr(cfs_rq);
8381 * Tell update_rq_clock() that we've just updated,
8382 * so we don't do microscopic update in schedule()
8383 * and double the fastpath cost.
8385 rq_clock_skip_update(rq, true);
8391 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
8393 struct sched_entity *se = &p->se;
8395 /* throttled hierarchies are not runnable */
8396 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8399 /* Tell the scheduler that we'd really like pse to run next. */
8402 yield_task_fair(rq);
8408 /**************************************************
8409 * Fair scheduling class load-balancing methods.
8413 * The purpose of load-balancing is to achieve the same basic fairness the
8414 * per-cpu scheduler provides, namely provide a proportional amount of compute
8415 * time to each task. This is expressed in the following equation:
8417 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8419 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
8420 * W_i,0 is defined as:
8422 * W_i,0 = \Sum_j w_i,j (2)
8424 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
8425 * is derived from the nice value as per prio_to_weight[].
8427 * The weight average is an exponential decay average of the instantaneous
8430 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8432 * C_i is the compute capacity of cpu i, typically it is the
8433 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8434 * can also include other factors [XXX].
8436 * To achieve this balance we define a measure of imbalance which follows
8437 * directly from (1):
8439 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
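 * For example (weights invented): with two CPUs of equal capacity C = 1024,
 * W_i = 2048 and W_j = 1024, avg(W/C) = 1.5 and
 *
 *   imb_i,j = max{ 1.5, 2 } - min{ 1.5, 1 } = 1
 *
 * Moving ~512 units of weight from cpu i to cpu j drives this imbalance
 * towards 0.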
8441 * We then move tasks around to minimize the imbalance. In the continuous
8442 * function space it is obvious this converges, in the discrete case we get
8443 * a few fun cases generally called infeasible weight scenarios.
8446 * - infeasible weights;
8447 * - local vs global optima in the discrete case. ]
8452 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8453 * for all i,j solution, we create a tree of cpus that follows the hardware
8454 * topology where each level pairs two lower groups (or better). This results
8455 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
8456 * tree to only the first of the previous level and we decrease the frequency
8457 * of load-balance at each level in inverse proportion to the number of cpus in the groups.
 *   log_2 n     1     n
 *  \Sum       { --- * --- * 2^i } = O(n)                            (5)
 *   i = 0      2^i   2^i
 *                               `- size of each group
 *                   |         |   `- number of cpus doing load-balance
 *                   |         `- frequency of load-balance at this level
 *                   `- sum over all levels
8470 * Coupled with a limit on how many tasks we can migrate every balance pass,
8471 * this makes (5) the runtime complexity of the balancer.
8473 * An important property here is that each CPU is still (indirectly) connected
8474 * to every other cpu in at most O(log n) steps:
8476 * The adjacency matrix of the resulting graph is given by:
 *           log_2 n
 * A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)     (6)
 *           k = 0
8482 * And you'll find that:
8484 * A^(log_2 n)_i,j != 0 for all i,j (7)
8486 * Showing there's indeed a path between every cpu in at most O(log n) steps.
8487 * The task movement gives a factor of O(m), giving a convergence complexity
8490 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
8495 * In order to avoid CPUs going idle while there's still work to do, new idle
8496 * balancing is more aggressive and has the newly idle cpu iterate up the domain
8497 * tree itself instead of relying on other CPUs to bring it work.
8499 * This adds some complexity to both (5) and (8) but it reduces the total idle time.
8507 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----                                (9)
 *                                 S_k
 *
 * Where
 *
 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                  (10)
8517 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
8519 * The big problem is S_k, it's a global sum needed to compute a local (W_i) property.
8522 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8523 * rewrite all of this once again.]
8526 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8528 enum fbq_type { regular, remote, all };
8537 #define LBF_ALL_PINNED 0x01
8538 #define LBF_NEED_BREAK 0x02
8539 #define LBF_DST_PINNED 0x04
8540 #define LBF_SOME_PINNED 0x08
8541 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
8542 #define LBF_IGNORE_BIG_TASKS 0x100
8543 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
8544 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
8547 struct sched_domain *sd;
8555 struct cpumask *dst_grpmask;
8557 enum cpu_idle_type idle;
8559 unsigned int src_grp_nr_running;
8560 /* The set of CPUs under consideration for load-balancing */
8561 struct cpumask *cpus;
8562 unsigned int busiest_grp_capacity;
8563 unsigned int busiest_nr_running;
8568 unsigned int loop_break;
8569 unsigned int loop_max;
8571 enum fbq_type fbq_type;
8572 enum group_type busiest_group_type;
8573 struct list_head tasks;
8574 enum sched_boost_policy boost_policy;
8578 * Is this task likely cache-hot:
8580 static int task_hot(struct task_struct *p, struct lb_env *env)
8584 lockdep_assert_held(&env->src_rq->lock);
8586 if (p->sched_class != &fair_sched_class)
8589 if (unlikely(p->policy == SCHED_IDLE))
8593 * Buddy candidates are cache hot:
8595 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8596 (&p->se == cfs_rq_of(&p->se)->next ||
8597 &p->se == cfs_rq_of(&p->se)->last))
8600 if (sysctl_sched_migration_cost == -1)
8602 if (sysctl_sched_migration_cost == 0)
8605 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
8607 return delta < (s64)sysctl_sched_migration_cost;
8610 #ifdef CONFIG_NUMA_BALANCING
8612 * Returns 1, if task migration degrades locality
8613 * Returns 0, if task migration improves locality i.e migration preferred.
8614 * Returns -1, if task migration is not affected by locality.
8616 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8618 struct numa_group *numa_group = rcu_dereference(p->numa_group);
8619 unsigned long src_faults, dst_faults;
8620 int src_nid, dst_nid;
8622 if (!static_branch_likely(&sched_numa_balancing))
8625 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8628 src_nid = cpu_to_node(env->src_cpu);
8629 dst_nid = cpu_to_node(env->dst_cpu);
8631 if (src_nid == dst_nid)
8634 /* Migrating away from the preferred node is always bad. */
8635 if (src_nid == p->numa_preferred_nid) {
8636 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8642 /* Encourage migration to the preferred node. */
8643 if (dst_nid == p->numa_preferred_nid)
8647 src_faults = group_faults(p, src_nid);
8648 dst_faults = group_faults(p, dst_nid);
8650 src_faults = task_faults(p, src_nid);
8651 dst_faults = task_faults(p, dst_nid);
8654 return dst_faults < src_faults;
8658 static inline int migrate_degrades_locality(struct task_struct *p,
8666 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8669 int can_migrate_task(struct task_struct *p, struct lb_env *env)
8672 int twf, group_cpus;
8674 lockdep_assert_held(&env->src_rq->lock);
8677 * We do not migrate tasks that are:
8678 * 1) throttled_lb_pair, or
8679 * 2) cannot be migrated to this CPU due to cpus_allowed, or
8680 * 3) running (obviously), or
8681 * 4) are cache-hot on their current CPU.
8683 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
8686 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
8689 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
8691 env->flags |= LBF_SOME_PINNED;
8694 * Remember if this task can be migrated to any other cpu in
8695 * our sched_group. We may want to revisit it if we couldn't
8696 * meet load balance goals by pulling other tasks on src_cpu.
8698 * Also avoid computing new_dst_cpu if we have already computed
8699 * one in current iteration.
8701 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
8704 /* Prevent re-selecting dst_cpu via env's cpus */
8705 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8706 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
8707 env->flags |= LBF_DST_PINNED;
8708 env->new_dst_cpu = cpu;
8716 /* Record that we found at least one task that could run on dst_cpu */
8717 env->flags &= ~LBF_ALL_PINNED;
8719 if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
8720 if (nr_big_tasks(env->src_rq) && !is_big_task(p))
8723 if (env->boost_policy == SCHED_BOOST_ON_BIG &&
8724 !task_sched_boost(p))
8728 twf = task_will_fit(p, env->dst_cpu);
8731 * Attempt to not pull tasks that don't fit. We may get lucky and find
8732 * one that actually fits.
8734 if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
8737 if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
8738 !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
8742 * Group imbalance can sometimes cause work to be pulled across groups
8743 * even though the group could have managed the imbalance on its own.
8744 * Prevent inter-cluster migrations for big tasks when the number of
8745 * tasks is lower than the capacity of the group.
8747 group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
8748 SCHED_CAPACITY_SCALE);
8749 if (!twf && env->busiest_nr_running <= group_cpus)
8752 if (task_running(env->src_rq, p)) {
8753 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
8758 * Aggressive migration if:
8759 * 1) IDLE or NEWLY_IDLE balance.
8760 * 2) destination numa is preferred
8761 * 3) task is cache cold, or
8762 * 4) too many balance attempts have failed.
8764 tsk_cache_hot = migrate_degrades_locality(p, env);
8765 if (tsk_cache_hot == -1)
8766 tsk_cache_hot = task_hot(p, env);
8768 if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
8769 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
8770 if (tsk_cache_hot == 1) {
8771 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
8772 schedstat_inc(p, se.statistics.nr_forced_migrations);
8777 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
8782 * detach_task() -- detach the task for the migration specified in env
8784 static void detach_task(struct task_struct *p, struct lb_env *env)
8786 lockdep_assert_held(&env->src_rq->lock);
8788 p->on_rq = TASK_ON_RQ_MIGRATING;
8789 deactivate_task(env->src_rq, p, 0);
8790 double_lock_balance(env->src_rq, env->dst_rq);
8791 set_task_cpu(p, env->dst_cpu);
8792 if (task_in_related_thread_group(p))
8793 env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
8794 double_unlock_balance(env->src_rq, env->dst_rq);
8798 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
8799 * part of active balancing operations within "domain".
8801 * Returns a task if successful and NULL otherwise.
8803 static struct task_struct *detach_one_task(struct lb_env *env)
8805 struct task_struct *p, *n;
8807 lockdep_assert_held(&env->src_rq->lock);
8809 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
8810 if (!can_migrate_task(p, env))
8813 detach_task(p, env);
8816 * Right now, this is only the second place where
8817 * lb_gained[env->idle] is updated (other is detach_tasks)
8818 * so we can safely collect stats here rather than
8819 * inside detach_tasks().
8821 schedstat_inc(env->sd, lb_gained[env->idle]);
8828 static const unsigned int sched_nr_migrate_break = 32;
8831 * detach_tasks() -- tries to detach up to imbalance weighted load from
8832 * busiest_rq, as part of a balancing operation within domain "sd".
8834 * Returns number of detached tasks if successful and 0 otherwise.
8836 static int detach_tasks(struct lb_env *env)
8838 struct list_head *tasks = &env->src_rq->cfs_tasks;
8839 struct task_struct *p;
8842 int orig_loop = env->loop;
8844 lockdep_assert_held(&env->src_rq->lock);
8846 if (env->imbalance <= 0)
8849 if (!same_cluster(env->dst_cpu, env->src_cpu))
8850 env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
8852 if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
8853 env->flags |= LBF_IGNORE_BIG_TASKS;
8856 while (!list_empty(tasks)) {
8858 * We don't want to steal all, otherwise we may be treated likewise,
8859 * which could at worst lead to a livelock crash.
8861 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
8864 p = list_first_entry(tasks, struct task_struct, se.group_node);
8867 /* We've more or less seen every task there is, call it quits */
8868 if (env->loop > env->loop_max)
8871 /* take a breather every nr_migrate tasks */
8872 if (env->loop > env->loop_break) {
8873 env->loop_break += sched_nr_migrate_break;
8874 env->flags |= LBF_NEED_BREAK;
8878 if (!can_migrate_task(p, env))
8881 load = task_h_load(p);
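/*
 * Two cheap filters before committing to the pull: with the LB_MIN
 * feature, very light tasks (h_load below 16) are not worth migrating
 * unless balancing has already failed, and a task whose weighted load
 * is more than twice the remaining imbalance is skipped so that a
 * single pull does not grossly overshoot the target.
 */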
8883 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8886 if ((load / 2) > env->imbalance)
8889 detach_task(p, env);
8890 list_add(&p->se.group_node, &env->tasks);
8893 env->imbalance -= load;
8895 #ifdef CONFIG_PREEMPT
8897 * NEWIDLE balancing is a source of latency, so preemptible
8898 * kernels will stop after the first task is detached to minimize
8899 * the critical section.
8901 if (env->idle == CPU_NEWLY_IDLE)
8906 * We only want to steal up to the prescribed amount of weighted load.
8909 if (env->imbalance <= 0)
8914 list_move_tail(&p->se.group_node, tasks);
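/*
 * If the restrictive first pass (skipping big tasks and tasks that
 * prefer another cluster) detached nothing, drop those filters and
 * walk the task list again from the beginning.
 */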
8917 if (env->flags & (LBF_IGNORE_BIG_TASKS |
8918 LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
8919 tasks = &env->src_rq->cfs_tasks;
8920 env->flags &= ~(LBF_IGNORE_BIG_TASKS |
8921 LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
8922 env->loop = orig_loop;
8927 * Right now, this is one of only two places we collect this stat
8928 * so we can safely collect detach_one_task() stats here rather
8929 * than inside detach_one_task().
8931 schedstat_add(env->sd, lb_gained[env->idle], detached);
8937 * attach_task() -- attach the task detached by detach_task() to its new rq.
8939 static void attach_task(struct rq *rq, struct task_struct *p)
8941 lockdep_assert_held(&rq->lock);
8943 BUG_ON(task_rq(p) != rq);
8944 activate_task(rq, p, 0);
8945 p->on_rq = TASK_ON_RQ_QUEUED;
8946 check_preempt_curr(rq, p, 0);
8950 * attach_one_task() -- attaches the task returned from detach_one_task() to
8953 static void attach_one_task(struct rq *rq, struct task_struct *p)
8955 raw_spin_lock(&rq->lock);
8957 raw_spin_unlock(&rq->lock);
8961 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
8964 static void attach_tasks(struct lb_env *env)
8966 struct list_head *tasks = &env->tasks;
8967 struct task_struct *p;
8969 raw_spin_lock(&env->dst_rq->lock);
8971 while (!list_empty(tasks)) {
8972 p = list_first_entry(tasks, struct task_struct, se.group_node);
8973 list_del_init(&p->se.group_node);
8975 attach_task(env->dst_rq, p);
8978 raw_spin_unlock(&env->dst_rq->lock);
8981 #ifdef CONFIG_FAIR_GROUP_SCHED
8982 static void update_blocked_averages(int cpu)
8984 struct rq *rq = cpu_rq(cpu);
8985 struct cfs_rq *cfs_rq;
8986 unsigned long flags;
8988 raw_spin_lock_irqsave(&rq->lock, flags);
8989 update_rq_clock(rq);
8992 * Iterates the task_group tree in a bottom up fashion, see
8993 * list_add_leaf_cfs_rq() for details.
8995 for_each_leaf_cfs_rq(rq, cfs_rq) {
8996 /* throttled entities do not contribute to load */
8997 if (throttled_hierarchy(cfs_rq))
9000 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
9002 update_tg_load_avg(cfs_rq, 0);
9004 /* Propagate pending load changes to the parent */
9005 if (cfs_rq->tg->se[cpu])
9006 update_load_avg(cfs_rq->tg->se[cpu], 0);
9008 raw_spin_unlock_irqrestore(&rq->lock, flags);
9012 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9013 * This needs to be done in a top-down fashion because the load of a child
9014 * group is a fraction of its parent's load.
9016 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9018 struct rq *rq = rq_of(cfs_rq);
9019 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9020 unsigned long now = jiffies;
9023 if (cfs_rq->last_h_load_update == now)
9026 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9027 for_each_sched_entity(se) {
9028 cfs_rq = cfs_rq_of(se);
9029 WRITE_ONCE(cfs_rq->h_load_next, se);
9030 if (cfs_rq->last_h_load_update == now)
9035 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9036 cfs_rq->last_h_load_update = now;
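/*
 * Walk back down the recorded path: each level's h_load is the
 * parent's h_load scaled by this entity's share of the parent's
 * load, i.e. h_load * se->avg.load_avg / (cfs_rq_load_avg() + 1).
 */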
9039 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9040 load = cfs_rq->h_load;
9041 load = div64_ul(load * se->avg.load_avg,
9042 cfs_rq_load_avg(cfs_rq) + 1);
9043 cfs_rq = group_cfs_rq(se);
9044 cfs_rq->h_load = load;
9045 cfs_rq->last_h_load_update = now;
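/*
 * A task's hierarchical load is then its own load_avg scaled by its
 * cfs_rq's h_load relative to that cfs_rq's total load average.
 */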
9049 static unsigned long task_h_load(struct task_struct *p)
9051 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9053 update_cfs_rq_h_load(cfs_rq);
9054 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9055 cfs_rq_load_avg(cfs_rq) + 1);
9058 static inline void update_blocked_averages(int cpu)
9060 struct rq *rq = cpu_rq(cpu);
9061 struct cfs_rq *cfs_rq = &rq->cfs;
9062 unsigned long flags;
9064 raw_spin_lock_irqsave(&rq->lock, flags);
9065 update_rq_clock(rq);
9066 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
9067 raw_spin_unlock_irqrestore(&rq->lock, flags);
9070 static unsigned long task_h_load(struct task_struct *p)
9072 return p->se.avg.load_avg;
9076 /********** Helpers for find_busiest_group ************************/
9079 * sg_lb_stats - stats of a sched_group required for load_balancing
9081 struct sg_lb_stats {
9082 unsigned long avg_load; /* Avg load across the CPUs of the group */
9083 unsigned long group_load; /* Total load over the CPUs of the group */
9084 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
9085 unsigned long load_per_task;
9086 unsigned long group_capacity;
9087 unsigned long group_util; /* Total utilization of the group */
9088 unsigned int sum_nr_running; /* Nr tasks running in the group */
9089 #ifdef CONFIG_SCHED_HMP
9090 unsigned long sum_nr_big_tasks;
9091 u64 group_cpu_load; /* Scaled load of all CPUs of the group */
9093 unsigned int idle_cpus;
9094 unsigned int group_weight;
9095 enum group_type group_type;
9096 int group_no_capacity;
9097 int group_misfit_task; /* A cpu has a task too big for its capacity */
9098 #ifdef CONFIG_NUMA_BALANCING
9099 unsigned int nr_numa_running;
9100 unsigned int nr_preferred_running;
9105 * sd_lb_stats - Structure to store the statistics of a sched_domain
9106 * during load balancing.
9108 struct sd_lb_stats {
9109 struct sched_group *busiest; /* Busiest group in this sd */
9110 struct sched_group *local; /* Local group in this sd */
9111 unsigned long total_load; /* Total load of all groups in sd */
9112 unsigned long total_capacity; /* Total capacity of all groups in sd */
9113 unsigned long avg_load; /* Average load across all groups in sd */
9115 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
9116 struct sg_lb_stats local_stat; /* Statistics of the local group */
9119 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9122 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9123 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9124 * We must however clear busiest_stat::avg_load because
9125 * update_sd_pick_busiest() reads this before assignment.
9127 *sds = (struct sd_lb_stats){
9131 .total_capacity = 0UL,
9134 .sum_nr_running = 0,
9135 .group_type = group_other,
9136 #ifdef CONFIG_SCHED_HMP
9137 .sum_nr_big_tasks = 0UL,
9138 .group_cpu_load = 0ULL,
9144 #ifdef CONFIG_SCHED_HMP
9147 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9149 int local_cpu, busiest_cpu;
9150 int local_capacity, busiest_capacity;
9151 int local_pwr_cost, busiest_pwr_cost;
9153 int boost = sched_boost();
9155 if (!sysctl_sched_restrict_cluster_spill ||
9156 boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
9159 local_cpu = group_first_cpu(sds->local);
9160 busiest_cpu = group_first_cpu(sds->busiest);
9162 local_capacity = cpu_max_possible_capacity(local_cpu);
9163 busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
9165 local_pwr_cost = cpu_max_power_cost(local_cpu);
9166 busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
9168 if (local_pwr_cost <= busiest_pwr_cost)
9171 if (local_capacity > busiest_capacity &&
9172 sds->busiest_stat.sum_nr_big_tasks)
9175 nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
9176 if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
9177 (sds->busiest_stat.sum_nr_running <
9178 nr_cpus * sysctl_sched_spill_nr_run))
9184 #else /* CONFIG_SCHED_HMP */
9187 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9192 #endif /* CONFIG_SCHED_HMP */
9195 * get_sd_load_idx - Obtain the load index for a given sched domain.
9196 * @sd: The sched_domain whose load_idx is to be obtained.
9197 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
9199 * Return: The load index.
9201 static inline int get_sd_load_idx(struct sched_domain *sd,
9202 enum cpu_idle_type idle)
9208 load_idx = sd->busy_idx;
9211 case CPU_NEWLY_IDLE:
9212 load_idx = sd->newidle_idx;
9215 load_idx = sd->idle_idx;
9222 static unsigned long scale_rt_capacity(int cpu)
9224 struct rq *rq = cpu_rq(cpu);
9225 u64 total, used, age_stamp, avg;
9229 * Since we're reading these variables without serialization make sure
9230 * we read them once before doing sanity checks on them.
9232 age_stamp = READ_ONCE(rq->age_stamp);
9233 avg = READ_ONCE(rq->rt_avg);
9234 delta = __rq_clock_broken(rq) - age_stamp;
9236 if (unlikely(delta < 0))
9239 total = sched_avg_period() + delta;
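/*
 * 'used' is the average capacity consumed by RT (and, below, deadline)
 * activity over the rt_avg window; what remains of SCHED_CAPACITY_SCALE
 * is what this CPU can offer to CFS.
 */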
9241 used = div_u64(avg, total);
9244 * deadline bandwidth is defined at system level so we must
9245 * weight this bandwidth with the max capacity of the system.
9246 * As a reminder, avg_bw is 20 bits wide and
9247 * scale_cpu_capacity is 10 bits wide.
9249 used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
9251 if (likely(used < SCHED_CAPACITY_SCALE))
9252 return SCHED_CAPACITY_SCALE - used;
9257 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
9259 raw_spin_lock_init(&mcc->lock);
9264 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9266 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
9267 struct sched_group *sdg = sd->groups;
9268 struct max_cpu_capacity *mcc;
9269 unsigned long max_capacity;
9271 unsigned long flags;
9273 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9275 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
9277 raw_spin_lock_irqsave(&mcc->lock, flags);
9278 max_capacity = mcc->val;
9279 max_cap_cpu = mcc->cpu;
9281 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
9282 (max_capacity < capacity)) {
9283 mcc->val = capacity;
9285 #ifdef CONFIG_SCHED_DEBUG
9286 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9287 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
9292 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9294 skip_unlock: __attribute__ ((unused));
9295 capacity *= scale_rt_capacity(cpu);
9296 capacity >>= SCHED_CAPACITY_SHIFT;
9301 cpu_rq(cpu)->cpu_capacity = capacity;
9302 sdg->sgc->capacity = capacity;
9303 sdg->sgc->max_capacity = capacity;
9304 sdg->sgc->min_capacity = capacity;
9307 void update_group_capacity(struct sched_domain *sd, int cpu)
9309 struct sched_domain *child = sd->child;
9310 struct sched_group *group, *sdg = sd->groups;
9311 unsigned long capacity, max_capacity, min_capacity;
9312 unsigned long interval;
9314 interval = msecs_to_jiffies(sd->balance_interval);
9315 interval = clamp(interval, 1UL, max_load_balance_interval);
9316 sdg->sgc->next_update = jiffies + interval;
9319 update_cpu_capacity(sd, cpu);
9325 min_capacity = ULONG_MAX;
9327 if (child->flags & SD_OVERLAP) {
9329 * SD_OVERLAP domains cannot assume that child groups
9330 * span the current group.
9333 for_each_cpu(cpu, sched_group_cpus(sdg)) {
9334 struct sched_group_capacity *sgc;
9335 struct rq *rq = cpu_rq(cpu);
9337 if (cpumask_test_cpu(cpu, cpu_isolated_mask))
9340 * build_sched_domains() -> init_sched_groups_capacity()
9341 * gets here before we've attached the domains to the runqueues.
9344 * Use capacity_of(), which is set irrespective of domains
9345 * in update_cpu_capacity().
9347 * This prevents capacity from being 0 and
9348 * causing divide-by-zero issues on boot.
9350 if (unlikely(!rq->sd)) {
9351 capacity += capacity_of(cpu);
9353 sgc = rq->sd->groups->sgc;
9354 capacity += sgc->capacity;
9357 max_capacity = max(capacity, max_capacity);
9358 min_capacity = min(capacity, min_capacity);
9362 * !SD_OVERLAP domains can assume that child groups
9363 * span the current group.
9366 group = child->groups;
9368 struct sched_group_capacity *sgc = group->sgc;
9370 cpumask_t *cpus = sched_group_cpus(group);
9372 /* Revisit this later. This won't work for MT domain */
9373 if (!cpu_isolated(cpumask_first(cpus))) {
9374 capacity += sgc->capacity;
9375 max_capacity = max(sgc->max_capacity, max_capacity);
9376 min_capacity = min(sgc->min_capacity, min_capacity);
9378 group = group->next;
9379 } while (group != child->groups);
9382 sdg->sgc->capacity = capacity;
9383 sdg->sgc->max_capacity = max_capacity;
9384 sdg->sgc->min_capacity = min_capacity;
9388 * Check whether the capacity of the rq has been noticeably reduced by side
9389 * activity. The imbalance_pct is used for the threshold.
9390 * Return true if the capacity is reduced.
9393 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9395 return ((rq->cpu_capacity * sd->imbalance_pct) <
9396 (rq->cpu_capacity_orig * 100));
9400 * Group imbalance indicates (and tries to solve) the problem where balancing
9401 * groups is inadequate due to tsk_cpus_allowed() constraints.
9403 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
9404 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
9407 * { 0 1 2 3 } { 4 5 6 7 }
9410 * If we were to balance group-wise we'd place two tasks in the first group and
9411 * two tasks in the second group. Clearly this is undesired as it will overload
9412 * cpu 3 and leave one of the cpus in the second group unused.
9414 * The current solution to this issue is detecting the skew in the first group
9415 * by noticing the lower domain failed to reach balance and had difficulty
9416 * moving tasks due to affinity constraints.
9418 * When this is so detected, this group becomes a candidate for busiest; see
9419 * update_sd_pick_busiest(). And calculate_imbalance() and
9420 * find_busiest_group() avoid some of the usual balance conditions to allow it
9421 * to create an effective group imbalance.
9423 * This is a somewhat tricky proposition since the next run might not find the
9424 * group imbalance and decide the groups need to be balanced again. A most
9425 * subtle and fragile situation.
9428 static inline int sg_imbalanced(struct sched_group *group)
9430 return group->sgc->imbalance;
9434 * group_has_capacity returns true if the group has spare capacity that could
9435 * be used by some tasks.
9436 * We consider that a group has spare capacity if the number of tasks is
9437 * smaller than the number of CPUs or if the utilization is lower than the
9438 * available capacity for CFS tasks.
9439 * For the latter, we use a threshold to stabilize the state, to take into
9440 * account the variance of the tasks' load and to return true if the available
9441 * capacity is meaningful for the load balancer.
9442 * As an example, an available capacity of 1% can appear but it doesn't bring
9443 * any benefit to the load balancer.
9446 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
9448 if (sgs->sum_nr_running < sgs->group_weight)
9451 if ((sgs->group_capacity * 100) >
9452 (sgs->group_util * env->sd->imbalance_pct))
9459 * group_is_overloaded returns true if the group has more tasks than it can handle.
9461 * group_is_overloaded is not equal to !group_has_capacity because a group
9462 * with the exact right number of tasks has no more spare capacity but is not
9463 * overloaded, so both group_has_capacity and group_is_overloaded return false.
9467 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
9469 if (sgs->sum_nr_running <= sgs->group_weight)
9472 if ((sgs->group_capacity * 100) <
9473 (sgs->group_util * env->sd->imbalance_pct))
9481 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
9482 * per-cpu capacity than sched_group ref.
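 * The comparison is biased by (capacity_margin - SCHED_LOAD_SCALE), so sg
 * only qualifies when its max per-cpu capacity falls short of ref's by more
 * than that margin, rather than by rounding noise.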
9485 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
9487 return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
9488 ref->sgc->max_capacity;
9492 group_type group_classify(struct sched_group *group,
9493 struct sg_lb_stats *sgs, struct lb_env *env)
9495 if (sgs->group_no_capacity)
9496 return group_overloaded;
9498 if (sg_imbalanced(group))
9499 return group_imbalanced;
9501 if (sgs->group_misfit_task)
9502 return group_misfit_task;
9507 #ifdef CONFIG_NO_HZ_COMMON
9509 * idle load balancing data
9510 * - used by the nohz balance, but we want it available here
9511 * so that we can see which CPUs have no tick.
9514 cpumask_var_t idle_cpus_mask;
9516 unsigned long next_balance; /* in jiffy units */
9517 } nohz ____cacheline_aligned;
9519 static inline void update_cpu_stats_if_tickless(struct rq *rq)
9521 /* only called from update_sg_lb_stats when irqs are disabled */
9522 if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
9523 /* rate limit updates to at most once per jiffy */
9524 if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
9527 raw_spin_lock(&rq->lock);
9528 update_rq_clock(rq);
9529 update_idle_cpu_load(rq);
9530 update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
9531 raw_spin_unlock(&rq->lock);
9536 static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
9540 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9541 * @env: The load balancing environment.
9542 * @group: sched_group whose statistics are to be updated.
9543 * @load_idx: Load index of sched_domain of this_cpu for load calc.
9544 * @local_group: Does group contain this_cpu.
9545 * @sgs: variable to hold the statistics for this group.
9546 * @overload: Indicate more than one runnable task for any CPU.
9547 * @overutilized: Indicate overutilization for any CPU.
9549 static inline void update_sg_lb_stats(struct lb_env *env,
9550 struct sched_group *group, int load_idx,
9551 int local_group, struct sg_lb_stats *sgs,
9552 bool *overload, bool *overutilized)
9557 memset(sgs, 0, sizeof(*sgs));
9559 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
9560 struct rq *rq = cpu_rq(i);
9562 trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
9567 if (cpu_isolated(i))
9570 /* if we are entering idle and there are CPUs with
9571 * their tick stopped, do an update for them
9573 if (env->idle == CPU_NEWLY_IDLE)
9574 update_cpu_stats_if_tickless(rq);
9576 /* Bias balancing toward cpus of our domain */
9578 load = target_load(i, load_idx);
9580 load = source_load(i, load_idx);
9582 sgs->group_load += load;
9583 sgs->group_util += cpu_util(i);
9584 sgs->sum_nr_running += rq->cfs.h_nr_running;
9586 nr_running = rq->nr_running;
9590 #ifdef CONFIG_SCHED_HMP
9591 sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
9592 sgs->group_cpu_load += cpu_load(i);
9595 #ifdef CONFIG_NUMA_BALANCING
9596 sgs->nr_numa_running += rq->nr_numa_running;
9597 sgs->nr_preferred_running += rq->nr_preferred_running;
9599 sgs->sum_weighted_load += weighted_cpuload(i);
9601 * No need to call idle_cpu() if nr_running is not 0
9603 if (!nr_running && idle_cpu(i))
9606 if (energy_aware() && cpu_overutilized(i)) {
9607 *overutilized = true;
9608 if (!sgs->group_misfit_task && rq->misfit_task)
9609 sgs->group_misfit_task = capacity_of(i);
9613 /* Isolated CPU has no weight */
9614 if (!group->group_weight) {
9615 sgs->group_capacity = 0;
9617 sgs->group_no_capacity = 1;
9618 sgs->group_type = group_other;
9619 sgs->group_weight = group->group_weight;
9621 /* Adjust by relative CPU capacity of the group */
9622 sgs->group_capacity = group->sgc->capacity;
9623 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
9624 sgs->group_capacity;
9626 sgs->group_weight = group->group_weight;
9628 sgs->group_no_capacity = group_is_overloaded(env, sgs);
9629 sgs->group_type = group_classify(group, sgs, env);
9632 if (sgs->sum_nr_running)
9633 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
9636 #ifdef CONFIG_SCHED_HMP
9637 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9638 struct sd_lb_stats *sds,
9639 struct sched_group *sg,
9640 struct sg_lb_stats *sgs)
9642 if (env->idle != CPU_NOT_IDLE &&
9643 cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
9644 if (sgs->sum_nr_big_tasks >
9645 sds->busiest_stat.sum_nr_big_tasks) {
9646 env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
9654 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9655 struct sd_lb_stats *sds,
9656 struct sched_group *sg,
9657 struct sg_lb_stats *sgs)
9664 * update_sd_pick_busiest - return 1 on busiest group
9665 * @env: The load balancing environment.
9666 * @sds: sched_domain statistics
9667 * @sg: sched_group candidate to be checked for being the busiest
9668 * @sgs: sched_group statistics
9670 * Determine if @sg is a busier group than the previously selected
9673 * Return: %true if @sg is a busier group than the previously selected
9674 * busiest group. %false otherwise.
9676 static bool update_sd_pick_busiest(struct lb_env *env,
9677 struct sd_lb_stats *sds,
9678 struct sched_group *sg,
9679 struct sg_lb_stats *sgs)
9681 struct sg_lb_stats *busiest = &sds->busiest_stat;
9683 if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
9686 if (sgs->group_type > busiest->group_type)
9689 if (sgs->group_type < busiest->group_type)
9692 if (energy_aware()) {
9694 * Candidate sg doesn't face any serious load-balance problems
9695 * so don't pick it if the local sg is already filled up.
9697 if (sgs->group_type == group_other &&
9698 !group_has_capacity(env, &sds->local_stat))
9701 if (sgs->avg_load <= busiest->avg_load)
9704 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9708 * Candidate sg has no more than one task per CPU and
9709 * has higher per-CPU capacity. Migrating tasks to less
9710 * capable CPUs may harm throughput. Maximize throughput,
9711 * power/energy consequences are not considered.
9713 if (sgs->sum_nr_running <= sgs->group_weight &&
9714 group_smaller_cpu_capacity(sds->local, sg))
9719 /* This is the busiest node in its class. */
9720 if (!(env->sd->flags & SD_ASYM_PACKING))
9724 * ASYM_PACKING needs to move all the work to the lowest
9725 * numbered CPUs in the group, therefore mark all groups
9726 * higher than ourself as busy.
9728 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
9732 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
9739 #ifdef CONFIG_NUMA_BALANCING
9740 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9742 if (sgs->sum_nr_running > sgs->nr_numa_running)
9744 if (sgs->sum_nr_running > sgs->nr_preferred_running)
9749 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9751 if (rq->nr_running > rq->nr_numa_running)
9753 if (rq->nr_running > rq->nr_preferred_running)
9758 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9763 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9767 #endif /* CONFIG_NUMA_BALANCING */
9769 #define lb_sd_parent(sd) \
9770 (sd->parent && sd->parent->groups != sd->parent->groups->next)
9773 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
9774 * @env: The load balancing environment.
9775 * @sds: variable to hold the statistics for this sched_domain.
9777 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9779 struct sched_domain *child = env->sd->child;
9780 struct sched_group *sg = env->sd->groups;
9781 struct sg_lb_stats tmp_sgs;
9782 int load_idx, prefer_sibling = 0;
9783 bool overload = false, overutilized = false;
9785 if (child && child->flags & SD_PREFER_SIBLING)
9788 load_idx = get_sd_load_idx(env->sd, env->idle);
9791 struct sg_lb_stats *sgs = &tmp_sgs;
9794 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
9797 sgs = &sds->local_stat;
9799 if (env->idle != CPU_NEWLY_IDLE ||
9800 time_after_eq(jiffies, sg->sgc->next_update))
9801 update_group_capacity(env->sd, env->dst_cpu);
9804 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
9805 &overload, &overutilized);
9811 * In case the child domain prefers tasks go to siblings
9812 * first, lower the sg capacity so that we'll try
9813 * and move all the excess tasks away. We lower the capacity
9814 * of a group only if the local group has the capacity to fit
9815 * these excess tasks. The extra check prevents the case where
9816 * you always pull from the heaviest group when it is already
9817 * under-utilized (possible when a large-weight task outweighs
9818 * the rest of the tasks on the system).
9820 if (prefer_sibling && sds->local &&
9821 group_has_capacity(env, &sds->local_stat) &&
9822 (sgs->sum_nr_running > 1)) {
9823 sgs->group_no_capacity = 1;
9824 sgs->group_type = group_classify(sg, sgs, env);
9828 * Ignore task groups with misfit tasks if local group has no
9829 * capacity or if per-cpu capacity isn't higher.
9831 if (energy_aware() &&
9832 sgs->group_type == group_misfit_task &&
9833 (!group_has_capacity(env, &sds->local_stat) ||
9834 !group_smaller_cpu_capacity(sg, sds->local)))
9835 sgs->group_type = group_other;
9837 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9839 sds->busiest_stat = *sgs;
9840 env->busiest_nr_running = sgs->sum_nr_running;
9841 env->busiest_grp_capacity = sgs->group_capacity;
9845 /* Now, start updating sd_lb_stats */
9846 sds->total_load += sgs->group_load;
9847 sds->total_capacity += sgs->group_capacity;
9850 } while (sg != env->sd->groups);
9852 if (env->sd->flags & SD_NUMA)
9853 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9855 env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9857 if (!lb_sd_parent(env->sd)) {
9858 /* update overload indicator if we are at root domain */
9859 if (env->dst_rq->rd->overload != overload)
9860 env->dst_rq->rd->overload = overload;
9862 /* Update over-utilization (tipping point, U >= 0) indicator */
9863 if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
9864 env->dst_rq->rd->overutilized = overutilized;
9865 trace_sched_overutilized(overutilized);
9868 if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
9869 env->dst_rq->rd->overutilized = true;
9870 trace_sched_overutilized(true);
9877 * check_asym_packing - Check to see if the group is packed into the
9880 * This is primarily intended to be used at the sibling level. Some
9881 * cores like POWER7 prefer to use lower numbered SMT threads. In the
9882 * case of POWER7, it can move to lower SMT modes only when higher
9883 * threads are idle. When in lower SMT modes, the threads will
9884 * perform better since they share less core resources. Hence when we
9885 * have idle threads, we want them to be the higher ones.
9887 * This packing function is run on idle threads. It checks to see if
9888 * the busiest CPU in this domain (core in the P7 case) has a higher
9889 * CPU number than the packing function is being run on. Here we are
9890 * assuming that a lower CPU number corresponds to a lower SMT thread number.
9893 * Return: 1 when packing is required and a task should be moved to
9894 * this CPU. The amount of the imbalance is returned in *imbalance.
9896 * @env: The load balancing environment.
9897 * @sds: Statistics of the sched_domain which is to be packed
9899 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9903 if (!(env->sd->flags & SD_ASYM_PACKING))
9909 busiest_cpu = group_first_cpu(sds->busiest);
9910 if (env->dst_cpu > busiest_cpu)
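/*
 * Express the imbalance as the busiest group's entire load: avg_load
 * is per unit of capacity, so scale it back up by the group's capacity.
 */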
9913 env->imbalance = DIV_ROUND_CLOSEST(
9914 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9915 SCHED_CAPACITY_SCALE);
9921 * fix_small_imbalance - Calculate the minor imbalance that exists
9922 * amongst the groups of a sched_domain, during load balancing.
9924 * @env: The load balancing environment.
9925 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9928 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9930 unsigned long tmp, capa_now = 0, capa_move = 0;
9931 unsigned int imbn = 2;
9932 unsigned long scaled_busy_load_per_task;
9933 struct sg_lb_stats *local, *busiest;
9935 local = &sds->local_stat;
9936 busiest = &sds->busiest_stat;
9938 if (!local->sum_nr_running)
9939 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9940 else if (busiest->load_per_task > local->load_per_task)
9943 scaled_busy_load_per_task =
9944 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9945 busiest->group_capacity;
9947 if (busiest->avg_load + scaled_busy_load_per_task >=
9948 local->avg_load + (scaled_busy_load_per_task * imbn)) {
9949 env->imbalance = busiest->load_per_task;
9954 * OK, we don't have enough imbalance to justify moving tasks,
9955 * however we may be able to increase total CPU capacity used by moving them.
9959 capa_now += busiest->group_capacity *
9960 min(busiest->load_per_task, busiest->avg_load);
9961 capa_now += local->group_capacity *
9962 min(local->load_per_task, local->avg_load);
9963 capa_now /= SCHED_CAPACITY_SCALE;
9965 /* Amount of load we'd subtract */
9966 if (busiest->avg_load > scaled_busy_load_per_task) {
9967 capa_move += busiest->group_capacity *
9968 min(busiest->load_per_task,
9969 busiest->avg_load - scaled_busy_load_per_task);
9972 /* Amount of load we'd add */
9973 if (busiest->avg_load * busiest->group_capacity <
9974 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9975 tmp = (busiest->avg_load * busiest->group_capacity) /
9976 local->group_capacity;
9978 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9979 local->group_capacity;
9981 capa_move += local->group_capacity *
9982 min(local->load_per_task, local->avg_load + tmp);
9983 capa_move /= SCHED_CAPACITY_SCALE;
9985 /* Move if we gain throughput */
9986 if (capa_move > capa_now)
9987 env->imbalance = busiest->load_per_task;
9991 * calculate_imbalance - Calculate the amount of imbalance present within the
9992 * groups of a given sched_domain during load balance.
9993 * @env: load balance environment
9994 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
9996 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9998 unsigned long max_pull, load_above_capacity = ~0UL;
9999 struct sg_lb_stats *local, *busiest;
10001 local = &sds->local_stat;
10002 busiest = &sds->busiest_stat;
10004 if (busiest->group_type == group_imbalanced) {
10006 * In the group_imb case we cannot rely on group-wide averages
10007 * to ensure cpu-load equilibrium, look at wider averages. XXX
10009 busiest->load_per_task =
10010 min(busiest->load_per_task, sds->avg_load);
10014 * In the presence of smp nice balancing, certain scenarios can have
10015 * max load less than avg load (as we skip the groups at or below
10016 * its cpu_capacity, while calculating max_load).
10018 if (busiest->avg_load <= sds->avg_load ||
10019 local->avg_load >= sds->avg_load) {
10020 if (energy_aware()) {
10021 /* Misfitting tasks should be migrated in any case */
10022 if (busiest->group_type == group_misfit_task) {
10023 env->imbalance = busiest->group_misfit_task;
10028 * Busiest group is overloaded, local is not, use the spare
10029 * cycles to maximize throughput
10031 if (busiest->group_type == group_overloaded &&
10032 local->group_type <= group_misfit_task) {
10033 env->imbalance = busiest->load_per_task;
10038 env->imbalance = 0;
10039 return fix_small_imbalance(env, sds);
10043 * If there aren't any idle cpus, avoid creating some.
10045 if (busiest->group_type == group_overloaded &&
10046 local->group_type == group_overloaded) {
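/*
 * Estimate the surplus: treat each runnable task as wanting a full
 * unit of capacity and consider whatever exceeds the group's total
 * capacity as load we are willing to pull away; if there is no such
 * surplus this term imposes no limit (~0UL).
 */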
10047 load_above_capacity = busiest->sum_nr_running *
10049 if (load_above_capacity > busiest->group_capacity)
10050 load_above_capacity -= busiest->group_capacity;
10052 load_above_capacity = ~0UL;
10056 * We're trying to get all the cpus to the average_load, so we don't
10057 * want to push ourselves above the average load, nor do we wish to
10058 * reduce the max loaded cpu below the average load. At the same time,
10059 * we also don't want to reduce the group load below the group capacity
10060 * (so that we can implement power-savings policies etc). Thus we look
10061 * for the minimum possible imbalance.
10063 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
10065 /* How much load to actually move to equalise the imbalance */
10066 env->imbalance = min(
10067 max_pull * busiest->group_capacity,
10068 (sds->avg_load - local->avg_load) * local->group_capacity
10069 ) / SCHED_CAPACITY_SCALE;
10071 /* Boost imbalance to allow misfit task to be balanced. */
10072 if (energy_aware() && busiest->group_type == group_misfit_task)
10073 env->imbalance = max_t(long, env->imbalance,
10074 busiest->group_misfit_task);
10077 * If *imbalance is less than the average load per runnable task
10078 * there is no guarantee that any tasks will be moved, so we'll have
10079 * a think about bumping its value to force at least one task to be moved.
10082 if (env->imbalance < busiest->load_per_task)
10083 return fix_small_imbalance(env, sds);
10086 /******* find_busiest_group() helpers end here *********************/
10089 * find_busiest_group - Returns the busiest group within the sched_domain
10090 * if there is an imbalance. If there isn't an imbalance, and
10091 * the user has opted for power-savings, it returns a group whose
10092 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
10093 * such a group exists.
10095 * Also calculates the amount of weighted load which should be moved
10096 * to restore balance.
10098 * @env: The load balancing environment.
10100 * Return: - The busiest group if imbalance exists.
10101 * - If no imbalance and user has opted for power-savings balance,
10102 * return the least loaded group whose CPUs can be
10103 * put to idle by rebalancing its tasks onto our group.
10105 static struct sched_group *find_busiest_group(struct lb_env *env)
10107 struct sg_lb_stats *local, *busiest;
10108 struct sd_lb_stats sds;
10110 init_sd_lb_stats(&sds);
10113 * Compute the various statistics relevant to load balancing at
10116 update_sd_lb_stats(env, &sds);
10118 if (energy_aware() && !env->dst_rq->rd->overutilized)
10121 local = &sds.local_stat;
10122 busiest = &sds.busiest_stat;
10124 /* ASYM feature bypasses nice load balance check */
10125 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
10126 check_asym_packing(env, &sds))
10127 return sds.busiest;
10129 /* There is no busy sibling group to pull tasks from */
10130 if (!sds.busiest || busiest->sum_nr_running == 0)
10133 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10134 goto force_balance;
10136 if (bail_inter_cluster_balance(env, &sds))
10139 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
10140 / sds.total_capacity;
10143 * If the busiest group is imbalanced the below checks don't
10144 * work because they assume all things are equal, which typically
10145 * isn't true due to cpus_allowed constraints and the like.
10147 if (busiest->group_type == group_imbalanced)
10148 goto force_balance;
10151 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
10152 * capacities from resulting in underutilization due to avg_load.
10154 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
10155 busiest->group_no_capacity)
10156 goto force_balance;
10158 /* Misfitting tasks should be dealt with regardless of the avg load */
10159 if (energy_aware() && busiest->group_type == group_misfit_task) {
10160 goto force_balance;
10164 * If the local group is busier than the selected busiest group
10165 * don't try and pull any tasks.
10167 if (local->avg_load >= busiest->avg_load)
10171 * Don't pull any tasks if this group is already above the domain average load.
10174 if (local->avg_load >= sds.avg_load)
10177 if (env->idle == CPU_IDLE) {
10179 * This cpu is idle. If the busiest group is not overloaded
10180 * and there is no imbalance between this and busiest group
10181 * wrt idle cpus, it is balanced. The imbalance becomes
10182 * significant if the diff is greater than 1, otherwise we
10183 * might end up just moving the imbalance to another group
10185 if ((busiest->group_type != group_overloaded) &&
10186 (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
10187 !group_smaller_cpu_capacity(sds.busiest, sds.local))
10191 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
10192 * imbalance_pct to be conservative.
10194 if (100 * busiest->avg_load <=
10195 env->sd->imbalance_pct * local->avg_load)
10200 env->busiest_group_type = busiest->group_type;
10201 /* Looks like there is an imbalance. Compute it */
10202 calculate_imbalance(env, &sds);
10203 return sds.busiest;
10206 env->imbalance = 0;
10210 #ifdef CONFIG_SCHED_HMP
10211 static struct rq *find_busiest_queue_hmp(struct lb_env *env,
10212 struct sched_group *group)
10214 struct rq *busiest = NULL, *busiest_big = NULL;
10215 u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
10216 int max_nr_big = 0, nr_big;
10217 bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
10221 cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
10223 for_each_cpu(i, &cpus) {
10224 struct rq *rq = cpu_rq(i);
10225 u64 cumulative_runnable_avg =
10226 rq->hmp_stats.cumulative_runnable_avg;
10228 if (!cpumask_test_cpu(i, env->cpus))
10233 nr_big = nr_big_tasks(rq);
10234 if (nr_big > max_nr_big ||
10235 (nr_big > 0 && nr_big == max_nr_big &&
10236 cumulative_runnable_avg > max_runnable_avg_big)) {
10237 max_runnable_avg_big = cumulative_runnable_avg;
10239 max_nr_big = nr_big;
10244 if (cumulative_runnable_avg > max_runnable_avg) {
10245 max_runnable_avg = cumulative_runnable_avg;
10251 return busiest_big;
10253 env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
10257 static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
10258 struct sched_group *group)
10265 * find_busiest_queue - find the busiest runqueue among the cpus in group.
10267 static struct rq *find_busiest_queue(struct lb_env *env,
10268 struct sched_group *group)
10270 struct rq *busiest = NULL, *rq;
10271 unsigned long busiest_load = 0, busiest_capacity = 1;
10274 #ifdef CONFIG_SCHED_HMP
10275 return find_busiest_queue_hmp(env, group);
10278 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
10279 unsigned long capacity, wl;
10283 rt = fbq_classify_rq(rq);
10286 * We classify groups/runqueues into three groups:
10287 * - regular: there are !numa tasks
10288 * - remote: there are numa tasks that run on the 'wrong' node
10289 * - all: there is no distinction
10291 * In order to avoid migrating ideally placed numa tasks,
10292 * ignore those when there are better options.
10294 * If we ignore the actual busiest queue to migrate another
10295 * task, the next balance pass can still reduce the busiest
10296 * queue by moving tasks around inside the node.
10298 * If we cannot move enough load due to this classification
10299 * the next pass will adjust the group classification and
10300 * allow migration of more tasks.
10302 * Both cases only affect the total convergence complexity.
10304 if (rt > env->fbq_type)
10307 capacity = capacity_of(i);
10309 wl = weighted_cpuload(i);
10312 * When comparing with imbalance, use weighted_cpuload()
10313 * which is not scaled with the cpu capacity.
10316 if (rq->nr_running == 1 && wl > env->imbalance &&
10317 !check_cpu_capacity(rq, env->sd) &&
10318 env->busiest_group_type != group_misfit_task)
10322 * For the load comparisons with the other cpus, consider
10323 * the weighted_cpuload() scaled with the cpu capacity, so
10324 * that the load can be moved away from the cpu that is
10325 * potentially running at a lower capacity.
10327 * Thus we're looking for max(wl_i / capacity_i), crosswise
10328 * multiplication to rid ourselves of the division works out
10329 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
10330 * our previous maximum.
10332 if (wl * busiest_capacity > busiest_load * capacity) {
10334 busiest_capacity = capacity;
10343 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
10344 * the exact figure doesn't matter so long as it is large enough.
10346 #define MAX_PINNED_INTERVAL 16
10348 /* Working cpumask for load_balance and load_balance_newidle. */
10349 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
10351 #define NEED_ACTIVE_BALANCE_THRESHOLD 10
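/*
 * Active balancing (migrating the currently running task) is a last
 * resort: on top of sd->cache_nice_tries, this many further failed
 * balance attempts are required before need_active_balance() gives in.
 */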
10353 static int need_active_balance(struct lb_env *env)
10355 struct sched_domain *sd = env->sd;
10357 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10360 if (env->idle == CPU_NEWLY_IDLE) {
10363 * ASYM_PACKING needs to force migrate tasks from busy but
10364 * higher numbered CPUs in order to pack all tasks in the
10365 * lowest numbered CPUs.
10367 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
10372 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
10373 * It's worth migrating the task if the src_cpu's capacity is reduced
10374 * because of other sched_class activity or IRQs, provided more capacity stays
10375 * available on dst_cpu.
10377 if ((env->idle != CPU_NOT_IDLE) &&
10378 (env->src_rq->cfs.h_nr_running == 1)) {
10379 if ((check_cpu_capacity(env->src_rq, sd)) &&
10380 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
10384 if (energy_aware() &&
10385 (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
10386 ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
10387 env->src_rq->cfs.h_nr_running == 1 &&
10388 cpu_overutilized(env->src_cpu) &&
10389 !cpu_overutilized(env->dst_cpu)) {
10393 return unlikely(sd->nr_balance_failed >
10394 sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
10397 static int group_balance_cpu_not_isolated(struct sched_group *sg)
10401 cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
10402 cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
10403 return cpumask_first(&cpus);
10406 static int should_we_balance(struct lb_env *env)
10408 struct sched_group *sg = env->sd->groups;
10409 struct cpumask *sg_cpus, *sg_mask;
10410 int cpu, balance_cpu = -1;
10413 * In the newly idle case, we will allow all the cpus
10414 * to do the newly idle load balance.
10416 if (env->idle == CPU_NEWLY_IDLE)
10419 sg_cpus = sched_group_cpus(sg);
10420 sg_mask = sched_group_mask(sg);
10421 /* Try to find first idle cpu */
10422 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
10423 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
10431 if (balance_cpu == -1)
10432 balance_cpu = group_balance_cpu_not_isolated(sg);
10435 * First idle cpu or the first cpu (busiest) in this sched group
10436 * is eligible for doing load balancing at this and higher domains.
10438 return balance_cpu == env->dst_cpu;
10442 * Check this_cpu to ensure it is balanced within domain. Attempt to move
10443 * tasks if there is an imbalance.
10445 static int load_balance(int this_cpu, struct rq *this_rq,
10446 struct sched_domain *sd, enum cpu_idle_type idle,
10447 int *continue_balancing)
10449 int ld_moved = 0, cur_ld_moved, active_balance = 0;
10450 struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
10451 struct sched_group *group = NULL;
10452 struct rq *busiest = NULL;
10453 unsigned long flags;
10454 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
10456 struct lb_env env = {
10458 .dst_cpu = this_cpu,
10460 .dst_grpmask = sched_group_cpus(sd->groups),
10462 .loop_break = sched_nr_migrate_break,
10465 .tasks = LIST_HEAD_INIT(env.tasks),
10469 .busiest_nr_running = 0,
10470 .busiest_grp_capacity = 0,
10471 .boost_policy = sched_boost_policy(),
10475 * For NEWLY_IDLE load_balancing, we don't need to consider
10476 * other cpus in our group
10478 if (idle == CPU_NEWLY_IDLE)
10479 env.dst_grpmask = NULL;
10481 cpumask_copy(cpus, cpu_active_mask);
10483 schedstat_inc(sd, lb_count[idle]);
10486 if (!should_we_balance(&env)) {
10487 *continue_balancing = 0;
10491 group = find_busiest_group(&env);
10493 schedstat_inc(sd, lb_nobusyg[idle]);
10497 busiest = find_busiest_queue(&env, group);
10499 schedstat_inc(sd, lb_nobusyq[idle]);
10503 BUG_ON(busiest == env.dst_rq);
10505 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
10507 env.src_cpu = busiest->cpu;
10508 env.src_rq = busiest;
10511 if (busiest->nr_running > 1) {
10513 * Attempt to move tasks. If find_busiest_group has found
10514 * an imbalance but busiest->nr_running <= 1, the group is
10515 * still unbalanced. ld_moved simply stays zero, so it is
10516 * correctly treated as an imbalance.
10518 env.flags |= LBF_ALL_PINNED;
10519 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
10522 raw_spin_lock_irqsave(&busiest->lock, flags);
10523 update_rq_clock(busiest);
10525 /* The world might have changed. Validate assumptions */
10526 if (busiest->nr_running <= 1) {
10527 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10528 env.flags &= ~LBF_ALL_PINNED;
10533 * cur_ld_moved - load moved in current iteration
10534 * ld_moved - cumulative load moved across iterations
10536 cur_ld_moved = detach_tasks(&env);
10539 * We've detached some tasks from busiest_rq. Every
10540 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
10541 * unlock busiest->lock, and we are able to be sure
10542 * that nobody can manipulate the tasks in parallel.
10543 * See task_rq_lock() family for the details.
10546 raw_spin_unlock(&busiest->lock);
10548 if (cur_ld_moved) {
10549 attach_tasks(&env);
10550 ld_moved += cur_ld_moved;
10553 local_irq_restore(flags);
10555 if (env.flags & LBF_NEED_BREAK) {
10556 env.flags &= ~LBF_NEED_BREAK;
10561 * Revisit (affine) tasks on src_cpu that couldn't be moved to
10562 * us and move them to an alternate dst_cpu in our sched_group
10563 * where they can run. The upper limit on how many times we
10564 * iterate on the same src_cpu depends on the number of cpus in our sched_group.
10567 * This changes load balance semantics a bit on who can move
10568 * load to a given_cpu. In addition to the given_cpu itself
10569 * (or an ilb_cpu acting on its behalf where given_cpu is
10570 * nohz-idle), we now have balance_cpu in a position to move
10571 * load to given_cpu. In rare situations, this may cause
10572 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
10573 * _independently_ and at the _same_ time to move some load to
10574 * given_cpu) causing excess load to be moved to given_cpu.
10575 * This, however, should not happen often in practice and
10576 * moreover subsequent load balance cycles should correct the
10577 * excess load moved.
10579 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
10581 /* Prevent dst_cpu from being re-selected via env's cpus */
10582 cpumask_clear_cpu(env.dst_cpu, env.cpus);
10584 env.dst_rq = cpu_rq(env.new_dst_cpu);
10585 env.dst_cpu = env.new_dst_cpu;
10586 env.flags &= ~LBF_DST_PINNED;
10588 env.loop_break = sched_nr_migrate_break;
10591 * Go back to "more_balance" rather than "redo" since we
10592 * need to continue with the same src_cpu.
10598 * We failed to reach balance because of affinity.
10601 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10603 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
10604 *group_imbalance = 1;
10607 /* All tasks on this runqueue were pinned by CPU affinity */
10608 if (unlikely(env.flags & LBF_ALL_PINNED)) {
10609 cpumask_clear_cpu(cpu_of(busiest), cpus);
10610 if (!cpumask_empty(cpus)) {
10612 env.loop_break = sched_nr_migrate_break;
10615 goto out_all_pinned;
10621 if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
10622 schedstat_inc(sd, lb_failed[idle]);
10625 * Increment the failure counter only on periodic balance.
10626 * We do not want newidle balance, which can be very
10627 * frequent, to pollute the failure counter, causing
10628 * excessive cache_hot migrations and active balances.
10630 if (idle != CPU_NEWLY_IDLE &&
10631 !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
10632 if (env.src_grp_nr_running > 1)
10633 sd->nr_balance_failed++;
10636 if (need_active_balance(&env)) {
10637 raw_spin_lock_irqsave(&busiest->lock, flags);
10639 /* don't kick the active_load_balance_cpu_stop,
10640 * if the curr task on busiest cpu can't be
10641 * moved to this_cpu
10643 if (!cpumask_test_cpu(this_cpu,
10644 tsk_cpus_allowed(busiest->curr))) {
10645 raw_spin_unlock_irqrestore(&busiest->lock,
10647 env.flags |= LBF_ALL_PINNED;
10648 goto out_one_pinned;
10652 * ->active_balance synchronizes accesses to
10653 * ->active_balance_work. Once set, it's cleared
10654 * only after active load balance is finished.
10656 if (!busiest->active_balance &&
10657 !cpu_isolated(cpu_of(busiest))) {
10658 busiest->active_balance = 1;
10659 busiest->push_cpu = this_cpu;
10660 active_balance = 1;
10662 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10664 if (active_balance) {
10665 stop_one_cpu_nowait(cpu_of(busiest),
10666 active_load_balance_cpu_stop, busiest,
10667 &busiest->active_balance_work);
10668 *continue_balancing = 0;
10672 * We've kicked active balancing, reset the failure
10675 sd->nr_balance_failed =
10676 sd->cache_nice_tries +
10677 NEED_ACTIVE_BALANCE_THRESHOLD - 1;
10680 sd->nr_balance_failed = 0;
10682 /* Assumes one 'busiest' cpu that we pulled tasks from */
10683 if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
10684 int check_groups = !!(env.flags &
10685 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
10687 check_for_freq_change(this_rq, false, check_groups);
10688 check_for_freq_change(busiest, false, check_groups);
10690 check_for_freq_change(this_rq, true, false);
10693 if (likely(!active_balance)) {
10694 /* We were unbalanced, so reset the balancing interval */
10695 sd->balance_interval = sd->min_interval;
10698 * If we've begun active balancing, start to back off. This
10699 * case may not be covered by the all_pinned logic if there
10700 * is only 1 task on the busy runqueue (because we don't call detach_tasks).
10703 if (sd->balance_interval < sd->max_interval)
10704 sd->balance_interval *= 2;
10711 * We reach balance although we may have faced some affinity
10712 * constraints. Clear the imbalance flag if it was set.
10715 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10717 if (*group_imbalance)
10718 *group_imbalance = 0;
10723 * We reach balance because all tasks are pinned at this level so
10724 * we can't migrate them. Leave the imbalance flag set so the parent level
10725 * can try to migrate them.
10727 schedstat_inc(sd, lb_balanced[idle]);
10729 sd->nr_balance_failed = 0;
10732 /* tune up the balancing interval */
10733 if (((env.flags & LBF_ALL_PINNED) &&
10734 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10735 (sd->balance_interval < sd->max_interval))
10736 sd->balance_interval *= 2;
10740 trace_sched_load_balance(this_cpu, idle, *continue_balancing,
10741 group ? group->cpumask[0] : 0,
10742 busiest ? busiest->nr_running : 0,
10743 env.imbalance, env.flags, ld_moved,
10744 sd->balance_interval);
10748 static inline unsigned long
10749 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10751 unsigned long interval = sd->balance_interval;
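/* A busy CPU balances less often: stretch the interval by busy_factor. */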
10754 interval *= sd->busy_factor;
10756 /* scale ms to jiffies */
10757 interval = msecs_to_jiffies(interval);
10758 interval = clamp(interval, 1UL, max_load_balance_interval);
10764 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
10766 unsigned long interval, next;
10768 interval = get_sd_balance_interval(sd, cpu_busy);
10769 next = sd->last_balance + interval;
10771 if (time_after(*next_balance, next))
10772 *next_balance = next;
10776 * idle_balance is called by schedule() if this_cpu is about to become
10777 * idle. Attempts to pull tasks from other CPUs.
10779 static int idle_balance(struct rq *this_rq)
10781 unsigned long next_balance = jiffies + HZ;
10782 int this_cpu = this_rq->cpu;
10783 struct sched_domain *sd;
10784 int pulled_task = 0;
10787 if (cpu_isolated(this_cpu))
10790 idle_enter_fair(this_rq);
10793 * We must set idle_stamp _before_ calling idle_balance(), such that we
10794 * measure the duration of idle_balance() as idle time.
10796 this_rq->idle_stamp = rq_clock(this_rq);
10798 if (!energy_aware() &&
10799 (this_rq->avg_idle < sysctl_sched_migration_cost ||
10800 !this_rq->rd->overload)) {
10802 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10804 update_next_balance(sd, 0, &next_balance);
10810 raw_spin_unlock(&this_rq->lock);
10812 update_blocked_averages(this_cpu);
10814 for_each_domain(this_cpu, sd) {
10815 int continue_balancing = 1;
10816 u64 t0, domain_cost;
10818 if (!(sd->flags & SD_LOAD_BALANCE))
10821 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10822 update_next_balance(sd, 0, &next_balance);
10826 if (sd->flags & SD_BALANCE_NEWIDLE) {
10827 t0 = sched_clock_cpu(this_cpu);
10829 pulled_task = load_balance(this_cpu, this_rq,
10830 sd, CPU_NEWLY_IDLE,
10831 &continue_balancing);
10833 domain_cost = sched_clock_cpu(this_cpu) - t0;
10834 if (domain_cost > sd->max_newidle_lb_cost)
10835 sd->max_newidle_lb_cost = domain_cost;
10837 curr_cost += domain_cost;
10840 update_next_balance(sd, 0, &next_balance);
10843 * Stop searching for tasks to pull if there are
10844 * now runnable tasks on the balance rq or if
10845 * continue_balancing has been unset (only possible
10846 * due to active migration).
10848 if (pulled_task || this_rq->nr_running > 0 ||
10849 !continue_balancing)
10854 raw_spin_lock(&this_rq->lock);
10856 if (curr_cost > this_rq->max_idle_balance_cost)
10857 this_rq->max_idle_balance_cost = curr_cost;
10860 * While browsing the domains, we released the rq lock, a task could
10861 * have been enqueued in the meantime. Since we're not going idle,
10862 * pretend we pulled a task.
10864 if (this_rq->cfs.h_nr_running && !pulled_task)
10868 /* Move the next balance forward */
10869 if (time_after(this_rq->next_balance, next_balance))
10870 this_rq->next_balance = next_balance;
10872 /* Is there a task of a high priority class? */
10873 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10877 idle_exit_fair(this_rq);
10878 this_rq->idle_stamp = 0;
10881 return pulled_task;
10885 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
10886 * running tasks off the busiest CPU onto idle CPUs. It requires at
10887 * least 1 task to be running on each physical CPU where possible, and
10888 * avoids physical / logical imbalances.
10890 static int active_load_balance_cpu_stop(void *data)
10892 struct rq *busiest_rq = data;
10893 int busiest_cpu = cpu_of(busiest_rq);
10894 int target_cpu = busiest_rq->push_cpu;
10895 struct rq *target_rq = cpu_rq(target_cpu);
10896 struct sched_domain *sd = NULL;
10897 struct task_struct *p = NULL;
10898 struct task_struct *push_task = NULL;
10899 int push_task_detached = 0;
10900 struct lb_env env = {
10902 .dst_cpu = target_cpu,
10903 .dst_rq = target_rq,
10904 .src_cpu = busiest_rq->cpu,
10905 .src_rq = busiest_rq,
10907 .busiest_nr_running = 0,
10908 .busiest_grp_capacity = 0,
10911 .boost_policy = sched_boost_policy(),
10913 bool moved = false;
10915 raw_spin_lock_irq(&busiest_rq->lock);
10917 /* make sure the requested cpu hasn't gone down in the meantime */
10918 if (unlikely(busiest_cpu != smp_processor_id() ||
10919 !busiest_rq->active_balance))
10922 /* Is there any task to move? */
10923 if (busiest_rq->nr_running <= 1)
10927 * This condition is "impossible", if it occurs
10928 * we need to fix it. Originally reported by
10929 * Bjorn Helgaas on a 128-cpu setup.
10931 BUG_ON(busiest_rq == target_rq);
10933 push_task = busiest_rq->push_task;
10934 target_cpu = busiest_rq->push_cpu;
10936 if (task_on_rq_queued(push_task) &&
10937 push_task->state == TASK_RUNNING &&
10938 task_cpu(push_task) == busiest_cpu &&
10939 cpu_online(target_cpu)) {
10940 detach_task(push_task, &env);
10941 push_task_detached = 1;
10947 /* Search for an sd spanning us and the target CPU. */
10949 for_each_domain(target_cpu, sd) {
10950 if ((sd->flags & SD_LOAD_BALANCE) &&
10951 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10957 schedstat_inc(sd, alb_count);
10958 update_rq_clock(busiest_rq);
10960 p = detach_one_task(&env);
10962 schedstat_inc(sd, alb_pushed);
10965 schedstat_inc(sd, alb_failed);
10970 busiest_rq->active_balance = 0;
10971 push_task = busiest_rq->push_task;
10972 target_cpu = busiest_rq->push_cpu;
10975 busiest_rq->push_task = NULL;
10977 raw_spin_unlock(&busiest_rq->lock);
10980 if (push_task_detached)
10981 attach_one_task(target_rq, push_task);
10982 put_task_struct(push_task);
10983 clear_reserved(target_cpu);
10987 attach_one_task(target_rq, p);
10989 local_irq_enable();
10991 if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
10992 int check_groups = !!(env.flags &
10993 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
10994 check_for_freq_change(busiest_rq, false, check_groups);
10995 check_for_freq_change(target_rq, false, check_groups);
10996 } else if (moved) {
10997 check_for_freq_change(target_rq, true, false);
11003 static inline int on_null_domain(struct rq *rq)
11005 return unlikely(!rcu_dereference_sched(rq->sd));
11008 #ifdef CONFIG_NO_HZ_COMMON
11010 * idle load balancing details
11011 * - When one of the busy CPUs notices that idle rebalancing may be
11012 * needed, it will kick the idle load balancer, which then does idle
11013 * load balancing for all the idle CPUs.
11016 #ifdef CONFIG_SCHED_HMP
11017 static inline int find_new_hmp_ilb(int type)
11019 int call_cpu = raw_smp_processor_id();
11020 struct sched_domain *sd;
11025 /* Pick an idle cpu "closest" to call_cpu */
11026 for_each_domain(call_cpu, sd) {
11027 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
11028 sched_domain_span(sd)) {
11029 if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
11030 cpu_max_power_cost(ilb) <=
11031 cpu_max_power_cost(call_cpu))) {
11033 reset_balance_interval(ilb);
11042 #else /* CONFIG_SCHED_HMP */
11043 static inline int find_new_hmp_ilb(int type)
11047 #endif /* CONFIG_SCHED_HMP */
11049 static inline int find_new_ilb(int type)
11053 #ifdef CONFIG_SCHED_HMP
11054 return find_new_hmp_ilb(type);
11057 ilb = cpumask_first(nohz.idle_cpus_mask);
11059 if (ilb < nr_cpu_ids && idle_cpu(ilb))
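/*
 * Illustrative sketch only (hypothetical userspace analogue, not kernel
 * code): ignoring the HMP special case, picking the ILB target above
 * reduces to "first CPU in nohz.idle_cpus_mask that is still idle". A
 * simplified version over a plain bitmap, where a return value >= nr_cpus
 * means "nobody to kick":
 */
#if 0
static int example_pick_ilb(unsigned long idle_mask, int nr_cpus)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus && cpu < (int)(8 * sizeof(idle_mask)); cpu++) {
		if (idle_mask & (1UL << cpu))
			return cpu;	/* first idle CPU is the kick target */
	}

	return nr_cpus;			/* no idle CPU available */
}
#endif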
11066 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
11067 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
11068 * CPU (if there is one).
11070 static void nohz_balancer_kick(int type)
11074 nohz.next_balance++;
11076 ilb_cpu = find_new_ilb(type);
11078 if (ilb_cpu >= nr_cpu_ids)
11081 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
11084 * Use smp_send_reschedule() instead of resched_cpu().
11085 * This way we generate a sched IPI on the target cpu which
11086 * is idle. And the softirq performing nohz idle load balance
11087 * will be run before returning from the IPI.
11089 smp_send_reschedule(ilb_cpu);
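/*
 * Flow summary (descriptive only): the reschedule IPI brings the chosen
 * idle CPU out of its tickless state; on return from the IPI it raises
 * SCHED_SOFTIRQ, run_rebalance_domains() runs there, sees the
 * NOHZ_BALANCE_KICK bit set, and performs nohz_idle_balance() on behalf
 * of the remaining tick-stopped idle CPUs.
 */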
11093 void nohz_balance_clear_nohz_mask(int cpu)
11095 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
11096 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
11097 atomic_dec(&nohz.nr_cpus);
11101 static inline void nohz_balance_exit_idle(int cpu)
11103 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
11105 * Completely isolated CPUs never set NOHZ_TICK_STOPPED, so we must test it here.
11107 nohz_balance_clear_nohz_mask(cpu);
11108 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11112 static inline void set_cpu_sd_state_busy(void)
11114 struct sched_domain *sd;
11115 int cpu = smp_processor_id();
11118 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11120 if (!sd || !sd->nohz_idle)
11124 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
11129 void set_cpu_sd_state_idle(void)
11131 struct sched_domain *sd;
11132 int cpu = smp_processor_id();
11135 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11137 if (!sd || sd->nohz_idle)
11141 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
11147 * This routine will record that the cpu is going idle with tick stopped.
11148 * This info will be used in performing idle load balancing in the future.
11150 void nohz_balance_enter_idle(int cpu)
11153 * If this cpu is going down, then nothing needs to be done.
11155 if (!cpu_active(cpu))
11158 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
11162 * If we're a completely isolated CPU, we don't play.
11164 if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
11167 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
11168 atomic_inc(&nohz.nr_cpus);
11169 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11172 static int sched_ilb_notifier(struct notifier_block *nfb,
11173 unsigned long action, void *hcpu)
11175 switch (action & ~CPU_TASKS_FROZEN) {
11177 nohz_balance_exit_idle(smp_processor_id());
11180 return NOTIFY_DONE;
11185 static DEFINE_SPINLOCK(balancing);
11188 * Scale the max load_balance interval with the number of CPUs in the system.
11189 * This trades load-balance latency on larger machines for less cross talk.
11191 void update_max_interval(void)
11193 cpumask_t avail_mask;
11194 unsigned int available_cpus;
11196 cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
11197 available_cpus = cpumask_weight(&avail_mask);
11199 max_load_balance_interval = HZ*available_cpus/10;
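/*
 * Worked example (illustrative): with HZ == 250 and 8 online,
 * non-isolated CPUs, the cap is 250 * 8 / 10 = 200 jiffies (about
 * 800ms); with 64 such CPUs it grows to 1600 jiffies (about 6.4s).
 */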
11203 * It checks each scheduling domain to see if it is due to be balanced,
11204 * and initiates a balancing operation if so.
11206 * Balancing parameters are set up in init_sched_domains.
11208 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
11210 int continue_balancing = 1;
11212 unsigned long interval;
11213 struct sched_domain *sd;
11214 /* Earliest time when we have to do rebalance again */
11215 unsigned long next_balance = jiffies + 60*HZ;
11216 int update_next_balance = 0;
11217 int need_serialize, need_decay = 0;
11220 update_blocked_averages(cpu);
11223 for_each_domain(cpu, sd) {
11225 * Decay the newidle max times here because this is a regular
11226 * visit to all the domains. Decay ~1% per second.
11228 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
11229 sd->max_newidle_lb_cost =
11230 (sd->max_newidle_lb_cost * 253) / 256;
11231 sd->next_decay_max_lb_cost = jiffies + HZ;
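/*
 * Illustrative arithmetic: 253/256 is roughly 0.988, so each once-per-
 * second visit shaves about 1.2% off max_newidle_lb_cost; for example a
 * recorded cost of 500000ns decays to about 494140ns after one second
 * and to roughly half its value after about a minute.
 */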
11234 max_cost += sd->max_newidle_lb_cost;
11236 if (!(sd->flags & SD_LOAD_BALANCE))
11240 * Stop the load balance at this level. There is another
11241 * CPU in our sched group which is doing load balancing more actively.
11244 if (!continue_balancing) {
11250 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11252 need_serialize = sd->flags & SD_SERIALIZE;
11253 if (need_serialize) {
11254 if (!spin_trylock(&balancing))
11258 if (time_after_eq(jiffies, sd->last_balance + interval)) {
11259 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
11261 * The LBF_DST_PINNED logic could have changed
11262 * env->dst_cpu, so we can't know our idle
11263 * state even if we migrated tasks. Update it.
11265 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
11267 sd->last_balance = jiffies;
11268 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11270 if (need_serialize)
11271 spin_unlock(&balancing);
11273 if (time_after(next_balance, sd->last_balance + interval)) {
11274 next_balance = sd->last_balance + interval;
11275 update_next_balance = 1;
11280 * Ensure the rq-wide value also decays but keep it at a
11281 * reasonable floor to avoid funnies with rq->avg_idle.
11283 rq->max_idle_balance_cost =
11284 max((u64)sysctl_sched_migration_cost, max_cost);
11289 * next_balance will be updated only when there is a need.
11290 * When the cpu is attached to a null domain, for example, it will not be updated.
11293 if (likely(update_next_balance)) {
11294 rq->next_balance = next_balance;
11296 #ifdef CONFIG_NO_HZ_COMMON
11298 * If this CPU has been elected to perform the nohz idle
11299 * balance, the other idle CPUs have already rebalanced with
11300 * nohz_idle_balance() and nohz.next_balance has been
11301 * updated accordingly. This CPU is now running the idle load
11302 * balance for itself and we need to update the
11303 * nohz.next_balance accordingly.
11305 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
11306 nohz.next_balance = rq->next_balance;
11311 #ifdef CONFIG_NO_HZ_COMMON
11313 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
11314 * rebalancing for all the idle CPUs whose scheduler ticks are stopped.
11316 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
11318 int this_cpu = this_rq->cpu;
11321 /* Earliest time when we have to do rebalance again */
11322 unsigned long next_balance = jiffies + 60*HZ;
11323 int update_next_balance = 0;
11326 if (idle != CPU_IDLE ||
11327 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
11330 cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
11332 for_each_cpu(balance_cpu, &cpus) {
11333 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
11337 * If this cpu gets work to do, stop the load balancing
11338 * work being done for other cpus. Next load
11339 * balancing owner will pick it up.
11341 if (need_resched())
11344 rq = cpu_rq(balance_cpu);
11347 * If the time for the next balance is due, do the balance.
11350 if (time_after_eq(jiffies, rq->next_balance)) {
11351 raw_spin_lock_irq(&rq->lock);
11352 update_rq_clock(rq);
11353 update_idle_cpu_load(rq);
11354 raw_spin_unlock_irq(&rq->lock);
11355 rebalance_domains(rq, CPU_IDLE);
11358 if (time_after(next_balance, rq->next_balance)) {
11359 next_balance = rq->next_balance;
11360 update_next_balance = 1;
11365 * next_balance will be updated only when there is a need.
11366 * When the CPU is attached to a null domain, for example, it will not be updated.
11369 if (likely(update_next_balance))
11370 nohz.next_balance = next_balance;
11372 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
11375 #ifdef CONFIG_SCHED_HMP
11376 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11378 struct sched_domain *sd;
11381 if (rq->nr_running < 2)
11384 if (!sysctl_sched_restrict_cluster_spill ||
11385 sched_boost_policy() == SCHED_BOOST_ON_ALL)
11388 if (cpu_max_power_cost(cpu) == max_power_cost)
11392 sd = rcu_dereference_check_sched_domain(rq->sd);
11398 for_each_cpu(i, sched_domain_span(sd)) {
11399 if (cpu_load(i) < sched_spill_load &&
11400 cpu_rq(i)->nr_running <
11401 sysctl_sched_spill_nr_run) {
11402 /* Change the kick type to limit the balance to CPUs
11403 * that are of equal or lower capacity.
11405 *type = NOHZ_KICK_RESTRICT;
11413 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11419 static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
11421 unsigned long now = jiffies;
11424 * None are in tickless mode and hence there is no need for NOHZ idle load balancing.
11427 if (likely(!atomic_read(&nohz.nr_cpus)))
11430 #ifdef CONFIG_SCHED_HMP
11431 return _nohz_kick_needed_hmp(rq, cpu, type);
11434 if (time_before(now, nohz.next_balance))
11437 if (rq->nr_running >= 2 &&
11438 (!energy_aware() || cpu_overutilized(cpu)))
11441 /* Do idle load balance if there is a misfit task */
11442 if (energy_aware())
11443 return rq->misfit_task;
11445 return (rq->nr_running >= 2);
11449 * Current heuristic for kicking the idle load balancer in the presence
11450 * of an idle cpu in the system.
11451 * - This rq has more than one task.
11452 * - This rq has at least one CFS task and the capacity of the CPU is
11453 * significantly reduced because of RT tasks or IRQs.
11454 * - At the parent of the LLC scheduler domain level, this CPU's scheduler group
11455 * has multiple busy CPUs.
11456 * - For SD_ASYM_PACKING, if the lower-numbered CPUs in the scheduler
11457 * domain span are idle.
11459 static inline bool nohz_kick_needed(struct rq *rq, int *type)
11461 #ifndef CONFIG_SCHED_HMP
11462 struct sched_domain *sd;
11463 struct sched_group_capacity *sgc;
11469 if (unlikely(rq->idle_balance))
11473 * We may be recently in ticked or tickless idle mode. At the first
11474 * busy tick after returning from idle, we will update the busy stats.
11476 set_cpu_sd_state_busy();
11477 nohz_balance_exit_idle(cpu);
11479 if (_nohz_kick_needed(rq, cpu, type))
11482 #ifndef CONFIG_SCHED_HMP
11484 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11486 sgc = sd->groups->sgc;
11487 nr_busy = atomic_read(&sgc->nr_busy_cpus);
11496 sd = rcu_dereference(rq->sd);
11498 if ((rq->cfs.h_nr_running >= 1) &&
11499 check_cpu_capacity(rq, sd)) {
11505 sd = rcu_dereference(per_cpu(sd_asym, cpu));
11506 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
11507 sched_domain_span(sd)) < cpu)) {
11518 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
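/*
 * Simplified sketch of the non-HMP kick heuristic above (illustrative
 * only; the helper name and its parameters are hypothetical stand-ins,
 * and the SD_ASYM_PACKING case is omitted):
 */
#if 0
static bool example_should_kick_ilb(unsigned int nr_running,
				    unsigned int cfs_nr_running,
				    bool capacity_reduced_by_rt_irq,
				    unsigned int nr_busy_in_llc_parent_group)
{
	if (nr_running >= 2)
		return true;	/* more than one runnable task here */
	if (cfs_nr_running >= 1 && capacity_reduced_by_rt_irq)
		return true;	/* a CFS task is being squeezed by RT/IRQ time */
	return nr_busy_in_llc_parent_group > 1;
}
#endif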
11522 * run_rebalance_domains is triggered when needed from the scheduler tick.
11523 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
11525 static void run_rebalance_domains(struct softirq_action *h)
11527 struct rq *this_rq = this_rq();
11528 enum cpu_idle_type idle = this_rq->idle_balance ?
11529 CPU_IDLE : CPU_NOT_IDLE;
11532 * If this cpu has a pending nohz_balance_kick, then do the
11533 * balancing on behalf of the other idle cpus whose ticks are
11534 * stopped. Do nohz_idle_balance *before* rebalance_domains to
11535 * give the idle cpus a chance to load balance. Else we may
11536 * load balance only within the local sched_domain hierarchy
11537 * and abort nohz_idle_balance altogether if we pull some load.
11539 nohz_idle_balance(this_rq, idle);
11540 rebalance_domains(this_rq, idle);
11544 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
11546 void trigger_load_balance(struct rq *rq)
11548 int type = NOHZ_KICK_ANY;
11550 /* Don't need to rebalance while attached to NULL domain or the CPU is isolated.
11553 if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
11556 if (time_after_eq(jiffies, rq->next_balance))
11557 raise_softirq(SCHED_SOFTIRQ);
11558 #ifdef CONFIG_NO_HZ_COMMON
11559 if (nohz_kick_needed(rq, &type))
11560 nohz_balancer_kick(type);
11564 static void rq_online_fair(struct rq *rq)
11568 update_runtime_enabled(rq);
11571 static void rq_offline_fair(struct rq *rq)
11575 /* Ensure any throttled groups are reachable by pick_next_task */
11576 unthrottle_offline_cfs_rqs(rq);
11579 #endif /* CONFIG_SMP */
11582 * scheduler tick hitting a task of our scheduling class:
11584 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
11586 struct cfs_rq *cfs_rq;
11587 struct sched_entity *se = &curr->se;
11589 for_each_sched_entity(se) {
11590 cfs_rq = cfs_rq_of(se);
11591 entity_tick(cfs_rq, se, queued);
11594 if (static_branch_unlikely(&sched_numa_balancing))
11595 task_tick_numa(rq, curr);
11598 if (energy_aware() &&
11599 !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
11600 rq->rd->overutilized = true;
11601 trace_sched_overutilized(true);
11604 rq->misfit_task = !task_fits_max(curr, rq->cpu);
11610 * called on fork with the child task as argument from the parent's context
11611 * - child not yet on the tasklist
11612 * - preemption disabled
11614 static void task_fork_fair(struct task_struct *p)
11616 struct cfs_rq *cfs_rq;
11617 struct sched_entity *se = &p->se, *curr;
11618 struct rq *rq = this_rq();
11620 raw_spin_lock(&rq->lock);
11621 update_rq_clock(rq);
11623 cfs_rq = task_cfs_rq(current);
11624 curr = cfs_rq->curr;
11626 update_curr(cfs_rq);
11627 se->vruntime = curr->vruntime;
11629 place_entity(cfs_rq, se, 1);
11631 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11633 * Upon rescheduling, sched_class::put_prev_task() will place
11634 * 'current' within the tree based on its new key value.
11636 swap(curr->vruntime, se->vruntime);
11640 se->vruntime -= cfs_rq->min_vruntime;
11641 raw_spin_unlock(&rq->lock);
11645 * Priority of the task has changed. Check to see if we preempt
11646 * the current task.
11649 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11651 if (!task_on_rq_queued(p))
11655 * Reschedule if we are currently running on this runqueue and
11656 * our priority decreased, or if we are not currently running on
11657 * this runqueue and our priority is higher than the current's
11659 if (rq->curr == p) {
11660 if (p->prio > oldprio)
11663 check_preempt_curr(rq, p, 0);
11666 static inline bool vruntime_normalized(struct task_struct *p)
11668 struct sched_entity *se = &p->se;
11671 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11672 * the dequeue_entity(.flags=0) will already have normalized the vruntime.
11679 * When !on_rq, vruntime of the task has usually NOT been normalized.
11680 * But there are some cases where it has already been normalized:
11682 * - A forked child that is waiting to be woken up by
11683 * wake_up_new_task().
11684 * - A task that has been woken up by try_to_wake_up() and is
11685 * waiting to actually be woken up by sched_ttwu_pending().
11687 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
11693 #ifdef CONFIG_FAIR_GROUP_SCHED
11695 * Propagate the changes of the sched_entity across the tg tree to make it
11696 * visible to the root
11698 static void propagate_entity_cfs_rq(struct sched_entity *se)
11700 struct cfs_rq *cfs_rq;
11702 /* Start to propagate at parent */
11705 for_each_sched_entity(se) {
11706 cfs_rq = cfs_rq_of(se);
11708 if (cfs_rq_throttled(cfs_rq))
11711 update_load_avg(se, UPDATE_TG);
11715 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11718 static void detach_entity_cfs_rq(struct sched_entity *se)
11720 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11722 /* Catch up with the cfs_rq and remove our load when we leave */
11723 update_load_avg(se, 0);
11724 detach_entity_load_avg(cfs_rq, se);
11725 update_tg_load_avg(cfs_rq, false);
11726 propagate_entity_cfs_rq(se);
11729 static void attach_entity_cfs_rq(struct sched_entity *se)
11731 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11733 #ifdef CONFIG_FAIR_GROUP_SCHED
11735 * Since the real depth could have changed (only the FAIR
11736 * class maintains a depth value), reset the depth properly.
11738 se->depth = se->parent ? se->parent->depth + 1 : 0;
11741 /* Synchronize entity with its cfs_rq */
11742 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11743 attach_entity_load_avg(cfs_rq, se);
11744 update_tg_load_avg(cfs_rq, false);
11745 propagate_entity_cfs_rq(se);
11748 static void detach_task_cfs_rq(struct task_struct *p)
11750 struct sched_entity *se = &p->se;
11751 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11753 if (!vruntime_normalized(p)) {
11755 * Fix up our vruntime so that the current sleep doesn't
11756 * cause 'unlimited' sleep bonus.
11758 place_entity(cfs_rq, se, 0);
11759 se->vruntime -= cfs_rq->min_vruntime;
11762 detach_entity_cfs_rq(se);
11765 static void attach_task_cfs_rq(struct task_struct *p)
11767 struct sched_entity *se = &p->se;
11768 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11770 attach_entity_cfs_rq(se);
11772 if (!vruntime_normalized(p))
11773 se->vruntime += cfs_rq->min_vruntime;
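/*
 * Illustrative arithmetic for the -=/+= min_vruntime pair in
 * detach_task_cfs_rq() and attach_task_cfs_rq(): while a task is off a
 * runqueue its vruntime is kept relative, so it can be re-based on
 * whatever cfs_rq it lands on. Detaching with min_vruntime == 1000us and
 * vruntime == 1010us stores 10us; attaching to a cfs_rq whose
 * min_vruntime is 400us yields 410us, preserving the task's small lead
 * rather than its absolute position.
 */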
11776 static void switched_from_fair(struct rq *rq, struct task_struct *p)
11778 detach_task_cfs_rq(p);
11781 static void switched_to_fair(struct rq *rq, struct task_struct *p)
11783 attach_task_cfs_rq(p);
11785 if (task_on_rq_queued(p)) {
11787 * We were most likely switched from sched_rt, so
11788 * kick off the schedule if running, otherwise just see
11789 * if we can still preempt the current task.
11794 check_preempt_curr(rq, p, 0);
11798 /* Account for a task changing its policy or group.
11800 * This routine is mostly called to set cfs_rq->curr field when a task
11801 * migrates between groups/classes.
11803 static void set_curr_task_fair(struct rq *rq)
11805 struct sched_entity *se = &rq->curr->se;
11807 for_each_sched_entity(se) {
11808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11810 set_next_entity(cfs_rq, se);
11811 /* ensure bandwidth has been allocated on our new cfs_rq */
11812 account_cfs_rq_runtime(cfs_rq, 0);
11816 void init_cfs_rq(struct cfs_rq *cfs_rq)
11818 cfs_rq->tasks_timeline = RB_ROOT;
11819 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
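/*
 * Background note (commonly given rationale, stated here as an
 * assumption): (u64)(-(1LL << 20)) starts min_vruntime about 1ms below
 * the 64-bit wrap point, so vruntime overflows early in a runqueue's
 * life and comparisons that mishandle wrap-around show up quickly.
 */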
11820 #ifndef CONFIG_64BIT
11821 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11824 #ifdef CONFIG_FAIR_GROUP_SCHED
11825 cfs_rq->propagate_avg = 0;
11827 atomic_long_set(&cfs_rq->removed_load_avg, 0);
11828 atomic_long_set(&cfs_rq->removed_util_avg, 0);
11832 #ifdef CONFIG_FAIR_GROUP_SCHED
11833 static void task_set_group_fair(struct task_struct *p)
11835 struct sched_entity *se = &p->se;
11837 set_task_rq(p, task_cpu(p));
11838 se->depth = se->parent ? se->parent->depth + 1 : 0;
11841 static void task_move_group_fair(struct task_struct *p)
11843 detach_task_cfs_rq(p);
11844 set_task_rq(p, task_cpu(p));
11847 /* Tell se's cfs_rq has been changed -- migrated */
11848 p->se.avg.last_update_time = 0;
11850 attach_task_cfs_rq(p);
11853 static void task_change_group_fair(struct task_struct *p, int type)
11856 case TASK_SET_GROUP:
11857 task_set_group_fair(p);
11860 case TASK_MOVE_GROUP:
11861 task_move_group_fair(p);
11866 void free_fair_sched_group(struct task_group *tg)
11870 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11872 for_each_possible_cpu(i) {
11874 kfree(tg->cfs_rq[i]);
11883 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11885 struct sched_entity *se;
11886 struct cfs_rq *cfs_rq;
11890 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
11893 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
11897 tg->shares = NICE_0_LOAD;
11899 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11901 for_each_possible_cpu(i) {
11904 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11905 GFP_KERNEL, cpu_to_node(i));
11909 se = kzalloc_node(sizeof(struct sched_entity),
11910 GFP_KERNEL, cpu_to_node(i));
11914 init_cfs_rq(cfs_rq);
11915 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11916 init_entity_runnable_average(se);
11918 raw_spin_lock_irq(&rq->lock);
11919 post_init_entity_util_avg(se);
11920 raw_spin_unlock_irq(&rq->lock);
11931 void unregister_fair_sched_group(struct task_group *tg)
11933 unsigned long flags;
11937 for_each_possible_cpu(cpu) {
11939 remove_entity_load_avg(tg->se[cpu]);
11942 * Only empty task groups can be destroyed; so we can speculatively
11943 * check on_list without danger of it being re-added.
11945 if (!tg->cfs_rq[cpu]->on_list)
11950 raw_spin_lock_irqsave(&rq->lock, flags);
11951 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11952 raw_spin_unlock_irqrestore(&rq->lock, flags);
11956 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11957 struct sched_entity *se, int cpu,
11958 struct sched_entity *parent)
11960 struct rq *rq = cpu_rq(cpu);
11964 init_cfs_rq_runtime(cfs_rq);
11966 tg->cfs_rq[cpu] = cfs_rq;
11969 /* se could be NULL for root_task_group */
11974 se->cfs_rq = &rq->cfs;
11977 se->cfs_rq = parent->my_q;
11978 se->depth = parent->depth + 1;
11982 /* guarantee group entities always have weight */
11983 update_load_set(&se->load, NICE_0_LOAD);
11984 se->parent = parent;
11987 static DEFINE_MUTEX(shares_mutex);
11989 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11992 unsigned long flags;
11995 * We can't change the weight of the root cgroup.
12000 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
12002 mutex_lock(&shares_mutex);
12003 if (tg->shares == shares)
12006 tg->shares = shares;
12007 for_each_possible_cpu(i) {
12008 struct rq *rq = cpu_rq(i);
12009 struct sched_entity *se;
12012 /* Propagate contribution to hierarchy */
12013 raw_spin_lock_irqsave(&rq->lock, flags);
12015 /* Possible calls to update_curr() need rq clock */
12016 update_rq_clock(rq);
12017 for_each_sched_entity(se) {
12018 update_load_avg(se, UPDATE_TG);
12019 update_cfs_shares(se);
12021 raw_spin_unlock_irqrestore(&rq->lock, flags);
12025 mutex_unlock(&shares_mutex);
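/*
 * Usage note (illustrative, assuming the cgroup-v1 cpu controller
 * interface): this is the backend for writes to a group's cpu.shares
 * file, e.g.
 *
 *	echo 2048 > /sys/fs/cgroup/cpu/mygroup/cpu.shares
 *
 * which roughly doubles that group's weight relative to the 1024
 * default; the loop above then re-propagates the new weight through each
 * per-CPU group entity.
 */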
12028 #else /* CONFIG_FAIR_GROUP_SCHED */
12030 void free_fair_sched_group(struct task_group *tg) { }
12032 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12037 void unregister_fair_sched_group(struct task_group *tg) { }
12039 #endif /* CONFIG_FAIR_GROUP_SCHED */
12042 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
12044 struct sched_entity *se = &task->se;
12045 unsigned int rr_interval = 0;
12048 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise idle runqueue.
12051 if (rq->cfs.load.weight)
12052 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
12054 return rr_interval;
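/*
 * Illustrative userspace usage (not kernel code): for a SCHED_OTHER task
 * the sched_rr_get_interval() syscall ends up reporting the slice
 * computed above rather than a fixed round-robin quantum. Minimal sketch:
 */
#if 0
#include <stdio.h>
#include <sched.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 means "the calling task" */
	if (sched_rr_get_interval(0, &ts) == 0)
		printf("slice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);

	return 0;
}
#endif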
12058 * All the scheduling class methods:
12060 const struct sched_class fair_sched_class = {
12061 .next = &idle_sched_class,
12062 .enqueue_task = enqueue_task_fair,
12063 .dequeue_task = dequeue_task_fair,
12064 .yield_task = yield_task_fair,
12065 .yield_to_task = yield_to_task_fair,
12067 .check_preempt_curr = check_preempt_wakeup,
12069 .pick_next_task = pick_next_task_fair,
12070 .put_prev_task = put_prev_task_fair,
12073 .select_task_rq = select_task_rq_fair,
12074 .migrate_task_rq = migrate_task_rq_fair,
12076 .rq_online = rq_online_fair,
12077 .rq_offline = rq_offline_fair,
12079 .task_waking = task_waking_fair,
12080 .task_dead = task_dead_fair,
12081 .set_cpus_allowed = set_cpus_allowed_common,
12084 .set_curr_task = set_curr_task_fair,
12085 .task_tick = task_tick_fair,
12086 .task_fork = task_fork_fair,
12088 .prio_changed = prio_changed_fair,
12089 .switched_from = switched_from_fair,
12090 .switched_to = switched_to_fair,
12092 .get_rr_interval = get_rr_interval_fair,
12094 .update_curr = update_curr_fair,
12096 #ifdef CONFIG_FAIR_GROUP_SCHED
12097 .task_change_group = task_change_group_fair,
12099 #ifdef CONFIG_SCHED_HMP
12100 .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
12101 .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
12102 .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
12106 #ifdef CONFIG_SCHED_DEBUG
12107 void print_cfs_stats(struct seq_file *m, int cpu)
12109 struct cfs_rq *cfs_rq;
12112 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
12113 print_cfs_rq(m, cpu, cfs_rq);
12117 #ifdef CONFIG_NUMA_BALANCING
12118 void show_numa_stats(struct task_struct *p, struct seq_file *m)
12121 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
12123 for_each_online_node(node) {
12124 if (p->numa_faults) {
12125 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
12126 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
12128 if (p->numa_group) {
12129 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
12130 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
12132 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
12135 #endif /* CONFIG_NUMA_BALANCING */
12136 #endif /* CONFIG_SCHED_DEBUG */
12138 __init void init_sched_fair_class(void)
12141 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
12143 #ifdef CONFIG_NO_HZ_COMMON
12144 nohz.next_balance = jiffies;
12145 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
12146 cpu_notifier(sched_ilb_notifier, 0);