2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 #include <linux/module.h>
36 #include <trace/events/sched.h>
41 * Targeted preemption latency for CPU-bound tasks:
42 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
44 * NOTE: this latency value is not the same as the concept of
45 * 'timeslice length' - timeslices in CFS are of variable length
46 * and have no persistent notion like in traditional, time-slice
47 * based scheduling concepts.
49 * (to see the precise effective timeslice length of your workload,
50 * run vmstat and monitor the context-switches (cs) field)
52 unsigned int sysctl_sched_latency = 6000000ULL;
53 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
55 unsigned int sysctl_sched_sync_hint_enable = 1;
56 unsigned int sysctl_sched_cstate_aware = 1;
59 * The initial- and re-scaling of tunables is configurable
60 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
64 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
65 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
67 enum sched_tunable_scaling sysctl_sched_tunable_scaling
68 = SCHED_TUNABLESCALING_LOG;
71 * Minimal preemption granularity for CPU-bound tasks:
72 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
74 unsigned int sysctl_sched_min_granularity = 750000ULL;
75 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
78 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
80 static unsigned int sched_nr_latency = 8;
83 * After fork, child runs first. If set to 0 (default) then
84 * parent will (try to) run first.
86 unsigned int sysctl_sched_child_runs_first __read_mostly;
89 * SCHED_OTHER wake-up granularity.
90 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
92 * This option delays the preemption effects of decoupled workloads
93 * and reduces their over-scheduling. Synchronous workloads will still
94 * have immediate wakeup/sleep latencies.
96 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
97 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
99 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
102 * The exponential sliding window over which load is averaged for shares distribution.
106 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
108 #ifdef CONFIG_CFS_BANDWIDTH
110 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
111 * each time a cfs_rq requests quota.
113 * Note: in the case that the slice exceeds the runtime remaining (either due
114 * to consumption or the quota being specified to be smaller than the slice)
115 * we will always only issue the remaining available time.
117 * default: 5 msec, units: microseconds
119 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
123 * The margin used when comparing utilization with CPU capacity:
124 * util * margin < capacity * 1024
126 unsigned int capacity_margin = 1280; /* ~20% */
128 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
134 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
140 static inline void update_load_set(struct load_weight *lw, unsigned long w)
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the number of CPUs.
153 * This idea comes from the SD scheduler of Con Kolivas:
155 static unsigned int get_update_sysctl_factor(void)
157 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
160 switch (sysctl_sched_tunable_scaling) {
161 case SCHED_TUNABLESCALING_NONE:
164 case SCHED_TUNABLESCALING_LINEAR:
167 case SCHED_TUNABLESCALING_LOG:
169 factor = 1 + ilog2(cpus);
176 static void update_sysctl(void)
178 unsigned int factor = get_update_sysctl_factor();
180 #define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity);
183 SET_SYSCTL(sched_latency);
184 SET_SYSCTL(sched_wakeup_granularity);
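/*
 * Worked example (a sketch, assuming the default SCHED_TUNABLESCALING_LOG
 * policy and 8 online CPUs): get_update_sysctl_factor() clamps cpus to 8
 * and returns factor = 1 + ilog2(8) = 4, so the effective values become
 * sched_latency = 4 * 6ms = 24ms, sched_min_granularity = 4 * 0.75ms = 3ms
 * and sched_wakeup_granularity = 4 * 1ms = 4ms.
 */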
188 void sched_init_granularity(void)
193 #define WMULT_CONST (~0U)
194 #define WMULT_SHIFT 32
196 static void __update_inv_weight(struct load_weight *lw)
200 if (likely(lw->inv_weight))
203 w = scale_load_down(lw->weight);
205 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
207 else if (unlikely(!w))
208 lw->inv_weight = WMULT_CONST;
210 lw->inv_weight = WMULT_CONST / w;
214 * delta_exec * weight / lw.weight
216 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
218 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
219 * we're guaranteed shift stays positive because inv_weight is guaranteed to
220 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
222 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
223 * weight/lw.weight <= 1, and therefore our shift will also be positive.
225 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
227 u64 fact = scale_load_down(weight);
228 int shift = WMULT_SHIFT;
230 __update_inv_weight(lw);
232 if (unlikely(fact >> 32)) {
239 /* hint to use a 32x32->64 mul */
240 fact = (u64)(u32)fact * lw->inv_weight;
247 return mul_u64_u32_shr(delta_exec, fact, shift);
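/*
 * Worked example of the fixed-point math above (a sketch, not from the
 * original source): say weight = NICE_0_LOAD (1024 after scale_load_down())
 * and lw->weight = 2048 (two nice-0 entities). Then
 * lw->inv_weight = WMULT_CONST / 2048 = 2097151 and
 * fact = 1024 * 2097151, which still fits in 32 bits, so shift stays at
 * WMULT_SHIFT = 32 and delta_exec * fact >> 32 ~= delta_exec / 2, i.e.
 * the entity is credited half of the wall-clock delta, exactly as
 * weight/lw.weight = 1/2 demands.
 */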
251 static int active_load_balance_cpu_stop(void *data);
254 const struct sched_class fair_sched_class;
256 /**************************************************************
257 * CFS operations on generic schedulable entities:
260 #ifdef CONFIG_FAIR_GROUP_SCHED
262 /* cpu runqueue to which this cfs_rq is attached */
263 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
268 /* An entity is a task if it doesn't "own" a runqueue */
269 #define entity_is_task(se) (!se->my_q)
271 static inline struct task_struct *task_of(struct sched_entity *se)
273 #ifdef CONFIG_SCHED_DEBUG
274 WARN_ON_ONCE(!entity_is_task(se));
276 return container_of(se, struct task_struct, se);
279 /* Walk up scheduling entities hierarchy */
280 #define for_each_sched_entity(se) \
281 for (; se; se = se->parent)
283 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
288 /* runqueue on which this entity is (to be) queued */
289 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
294 /* runqueue "owned" by this group */
295 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
300 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
302 if (!cfs_rq->on_list) {
303 struct rq *rq = rq_of(cfs_rq);
304 int cpu = cpu_of(rq);
306 * Ensure we either appear before our parent (if already
307 * enqueued) or force our parent to appear after us when it is
308 * enqueued. The fact that we always enqueue bottom-up
309 * reduces this to two cases and a special case for the root
310 * cfs_rq. Furthermore, it also means that we will always reset
311 * tmp_alone_branch either when the branch is connected
312 * to a tree or when we reach the top of the tree.
314 if (cfs_rq->tg->parent &&
315 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
317 * If parent is already on the list, we add the child
318 * just before. Thanks to circular linked property of
319 * the list, this means to put the child at the tail
320 * of the list that starts by parent.
322 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
323 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
325 * The branch is now connected to its tree so we can
326 * reset tmp_alone_branch to the beginning of the list.
329 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
330 } else if (!cfs_rq->tg->parent) {
332 * cfs rq without parent should be put
333 * at the tail of the list.
335 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
336 &rq->leaf_cfs_rq_list);
338 * We have reached the top of the tree so we can reset
339 * tmp_alone_branch to the beginning of the list.
341 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
344 * The parent has not already been added so we want to
345 * make sure that it will be put after us.
346 * tmp_alone_branch points to the beginning of the branch
347 * where we will add parent.
349 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
350 rq->tmp_alone_branch);
352 * update tmp_alone_branch to point to the new beginning
355 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
362 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
364 if (cfs_rq->on_list) {
365 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
370 /* Iterate through all leaf cfs_rq's on a runqueue */
371 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
372 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
374 /* Do the two (enqueued) entities belong to the same group ? */
375 static inline struct cfs_rq *
376 is_same_group(struct sched_entity *se, struct sched_entity *pse)
378 if (se->cfs_rq == pse->cfs_rq)
384 static inline struct sched_entity *parent_entity(struct sched_entity *se)
390 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
392 int se_depth, pse_depth;
395 * preemption test can be made between sibling entities who are in the
396 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
397 * both tasks until we find their ancestors who are siblings of a common parent.
401 /* First walk up until both entities are at same depth */
402 se_depth = (*se)->depth;
403 pse_depth = (*pse)->depth;
405 while (se_depth > pse_depth) {
407 *se = parent_entity(*se);
410 while (pse_depth > se_depth) {
412 *pse = parent_entity(*pse);
415 while (!is_same_group(*se, *pse)) {
416 *se = parent_entity(*se);
417 *pse = parent_entity(*pse);
421 #else /* !CONFIG_FAIR_GROUP_SCHED */
423 static inline struct task_struct *task_of(struct sched_entity *se)
425 return container_of(se, struct task_struct, se);
428 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
430 return container_of(cfs_rq, struct rq, cfs);
433 #define entity_is_task(se) 1
435 #define for_each_sched_entity(se) \
436 for (; se; se = NULL)
438 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
440 return &task_rq(p)->cfs;
443 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
445 struct task_struct *p = task_of(se);
446 struct rq *rq = task_rq(p);
451 /* runqueue "owned" by this group */
452 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
461 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
465 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
466 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
468 static inline struct sched_entity *parent_entity(struct sched_entity *se)
474 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
478 #endif /* CONFIG_FAIR_GROUP_SCHED */
480 static __always_inline
481 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
483 /**************************************************************
484 * Scheduling class tree data structure manipulation methods:
487 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
489 s64 delta = (s64)(vruntime - max_vruntime);
491 max_vruntime = vruntime;
496 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
498 s64 delta = (s64)(vruntime - min_vruntime);
500 min_vruntime = vruntime;
505 static inline int entity_before(struct sched_entity *a,
506 struct sched_entity *b)
508 return (s64)(a->vruntime - b->vruntime) < 0;
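/*
 * Example of why the signed cast works (illustrative, not from the
 * original source): vruntime is an unsigned 64-bit count that may wrap.
 * If b->vruntime == U64_MAX - 100 and a->vruntime == 50 (i.e. a advanced
 * past the wrap point), then a->vruntime - b->vruntime == 151 in unsigned
 * arithmetic and (s64)151 > 0, so a is correctly ordered after b even
 * though a < b when compared as plain unsigned values.
 */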
511 static void update_min_vruntime(struct cfs_rq *cfs_rq)
513 u64 vruntime = cfs_rq->min_vruntime;
516 vruntime = cfs_rq->curr->vruntime;
518 if (cfs_rq->rb_leftmost) {
519 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
524 vruntime = se->vruntime;
526 vruntime = min_vruntime(vruntime, se->vruntime);
529 /* ensure we never gain time by being placed backwards. */
530 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
533 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
538 * Enqueue an entity into the rb-tree:
540 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
543 struct rb_node *parent = NULL;
544 struct sched_entity *entry;
548 * Find the right place in the rbtree:
552 entry = rb_entry(parent, struct sched_entity, run_node);
554 * We don't care about collisions. Nodes with
555 * the same key stay together.
557 if (entity_before(se, entry)) {
558 link = &parent->rb_left;
560 link = &parent->rb_right;
566 * Maintain a cache of leftmost tree entries (it is frequently used):
570 cfs_rq->rb_leftmost = &se->run_node;
572 rb_link_node(&se->run_node, parent, link);
573 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
576 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
578 if (cfs_rq->rb_leftmost == &se->run_node) {
579 struct rb_node *next_node;
581 next_node = rb_next(&se->run_node);
582 cfs_rq->rb_leftmost = next_node;
585 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
588 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
590 struct rb_node *left = cfs_rq->rb_leftmost;
595 return rb_entry(left, struct sched_entity, run_node);
598 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
600 struct rb_node *next = rb_next(&se->run_node);
605 return rb_entry(next, struct sched_entity, run_node);
608 #ifdef CONFIG_SCHED_DEBUG
609 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
611 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
616 return rb_entry(last, struct sched_entity, run_node);
619 /**************************************************************
620 * Scheduling class statistics methods:
623 int sched_proc_update_handler(struct ctl_table *table, int write,
624 void __user *buffer, size_t *lenp,
627 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
628 unsigned int factor = get_update_sysctl_factor();
633 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
634 sysctl_sched_min_granularity);
636 #define WRT_SYSCTL(name) \
637 (normalized_sysctl_##name = sysctl_##name / (factor))
638 WRT_SYSCTL(sched_min_granularity);
639 WRT_SYSCTL(sched_latency);
640 WRT_SYSCTL(sched_wakeup_granularity);
650 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
652 if (unlikely(se->load.weight != NICE_0_LOAD))
653 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
659 * The idea is to set a period in which each task runs once.
661 * When there are too many tasks (sched_nr_latency) we have to stretch
662 * this period because otherwise the slices get too small.
664 * p = (nr <= nl) ? l : l*nr/nl
666 static u64 __sched_period(unsigned long nr_running)
668 if (unlikely(nr_running > sched_nr_latency))
669 return nr_running * sysctl_sched_min_granularity;
671 return sysctl_sched_latency;
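/*
 * Worked example (a sketch, using the unscaled defaults above): with
 * sched_latency = 6ms, sched_min_granularity = 0.75ms and therefore
 * sched_nr_latency = 8, a runqueue with 5 tasks keeps the 6ms period
 * (each task runs once per 6ms), while 12 tasks stretch the period to
 * 12 * 0.75ms = 9ms so no slice drops below the minimum granularity.
 */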
675 * We calculate the wall-time slice from the period by taking a part
676 * proportional to the weight.
680 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
682 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
684 for_each_sched_entity(se) {
685 struct load_weight *load;
686 struct load_weight lw;
688 cfs_rq = cfs_rq_of(se);
689 load = &cfs_rq->load;
691 if (unlikely(!se->on_rq)) {
694 update_load_add(&lw, se->load.weight);
697 slice = __calc_delta(slice, se->load.weight, load);
703 * We calculate the vruntime slice of a to-be-inserted task.
707 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
709 return calc_delta_fair(sched_slice(cfs_rq, se), se);
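/*
 * Worked example (illustrative, assuming a flat hierarchy, unscaled
 * defaults and a nice-0 weight of 1024): two runnable nice-0 tasks give
 * cfs_rq->load.weight = 2048 and a 6ms period, so each task's slice is
 * 6ms * 1024/2048 = 3ms. For a nice-0 task calc_delta_fair() is the
 * identity, so its vruntime slice (sched_vslice) is also 3ms; a heavier
 * task would get a larger wall-clock slice but the same vruntime progress.
 */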
713 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
714 static unsigned long task_h_load(struct task_struct *p);
717 * We choose a half-life close to 1 scheduling period.
718 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
719 * dependent on this value.
721 #define LOAD_AVG_PERIOD 32
722 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
723 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
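/*
 * Background (hedged, not from the original source): the decay factor y
 * used by the load tracking code is defined by y^LOAD_AVG_PERIOD = 0.5,
 * i.e. y = 2^(-1/32) ~= 0.97857. The running sum 1024 * (1 + y + y^2 + ...)
 * converges towards roughly 1024 / (1 - y) ~= 47.8k; LOAD_AVG_MAX (47742)
 * is the value the integer decay tables actually settle at, and
 * LOAD_AVG_MAX_N (345) is the number of full 1024us periods needed to
 * reach it.
 */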
725 /* Give a new sched_entity initial runnable values so that it appears heavily loaded until its averages stabilize */
726 void init_entity_runnable_average(struct sched_entity *se)
728 struct sched_avg *sa = &se->avg;
730 sa->last_update_time = 0;
732 * sched_avg's period_contrib should be strictly less than 1024, so
733 * we give it 1023 to make sure it is almost a period (1024us), and
734 * will definitely be updated (after enqueue).
736 sa->period_contrib = 1023;
738 * Tasks are initialized with full load to be seen as heavy tasks until
739 * they get a chance to stabilize to their real load level.
740 * Group entities are initialized with zero load to reflect the fact that
741 * nothing has been attached to the task group yet.
743 if (entity_is_task(se))
744 sa->load_avg = scale_load_down(se->load.weight);
745 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
747 * In previous Android versions, we used to have:
748 * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
749 * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
750 * However, that functionality has been moved to enqueue.
751 * It is unclear if we should restore this in enqueue.
754 * At this point, util_avg won't be used in select_task_rq_fair anyway
758 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
761 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
762 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
763 static void attach_entity_cfs_rq(struct sched_entity *se);
764 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
767 * With new tasks being created, their initial util_avgs are extrapolated
768 * based on the cfs_rq's current util_avg:
770 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
772 * However, in many cases, the above util_avg does not give a desired
773 * value. Moreover, the sum of the util_avgs may be divergent, such
774 * as when the series is a harmonic series.
776 * To solve this problem, we also cap the util_avg of successive tasks to
777 * only 1/2 of the left utilization budget:
779 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
781 * where n denotes the nth task.
783 * For example, a simplest series from the beginning would be like:
785 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
786 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
788 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
789 * if util_avg > util_avg_cap.
791 void post_init_entity_util_avg(struct sched_entity *se)
793 struct cfs_rq *cfs_rq = cfs_rq_of(se);
794 struct sched_avg *sa = &se->avg;
795 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
798 if (cfs_rq->avg.util_avg != 0) {
799 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
800 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
802 if (sa->util_avg > cap)
808 * If we wish to restore tuning via setting initial util,
809 * this is where we should do it.
811 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
814 if (entity_is_task(se)) {
815 struct task_struct *p = task_of(se);
816 if (p->sched_class != &fair_sched_class) {
818 * For !fair tasks do:
820 update_cfs_rq_load_avg(now, cfs_rq, false);
821 attach_entity_load_avg(cfs_rq, se);
822 switched_from_fair(rq, p);
824 * such that the next switched_to_fair() has the
827 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
832 attach_entity_cfs_rq(se);
835 #else /* !CONFIG_SMP */
836 void init_entity_runnable_average(struct sched_entity *se)
839 void post_init_entity_util_avg(struct sched_entity *se)
842 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
845 #endif /* CONFIG_SMP */
848 * Update the current task's runtime statistics.
850 static void update_curr(struct cfs_rq *cfs_rq)
852 struct sched_entity *curr = cfs_rq->curr;
853 u64 now = rq_clock_task(rq_of(cfs_rq));
859 delta_exec = now - curr->exec_start;
860 if (unlikely((s64)delta_exec <= 0))
863 curr->exec_start = now;
865 schedstat_set(curr->statistics.exec_max,
866 max(delta_exec, curr->statistics.exec_max));
868 curr->sum_exec_runtime += delta_exec;
869 schedstat_add(cfs_rq, exec_clock, delta_exec);
871 curr->vruntime += calc_delta_fair(delta_exec, curr);
872 update_min_vruntime(cfs_rq);
874 if (entity_is_task(curr)) {
875 struct task_struct *curtask = task_of(curr);
877 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
878 cpuacct_charge(curtask, delta_exec);
879 account_group_exec_runtime(curtask, delta_exec);
882 account_cfs_rq_runtime(cfs_rq, delta_exec);
885 static void update_curr_fair(struct rq *rq)
887 update_curr(cfs_rq_of(&rq->curr->se));
890 #ifdef CONFIG_SCHEDSTATS
892 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
894 u64 wait_start = rq_clock(rq_of(cfs_rq));
896 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
897 likely(wait_start > se->statistics.wait_start))
898 wait_start -= se->statistics.wait_start;
900 se->statistics.wait_start = wait_start;
904 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
906 struct task_struct *p;
907 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
909 if (entity_is_task(se)) {
911 if (task_on_rq_migrating(p)) {
913 * Preserve migrating task's wait time so wait_start
914 * time stamp can be adjusted to accumulate wait time
915 * prior to migration.
917 se->statistics.wait_start = delta;
920 trace_sched_stat_wait(p, delta);
923 se->statistics.wait_max = max(se->statistics.wait_max, delta);
924 se->statistics.wait_count++;
925 se->statistics.wait_sum += delta;
926 se->statistics.wait_start = 0;
930 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
935 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
941 * Task is being enqueued - update stats:
943 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
946 * Are we enqueueing a waiting task? (for current tasks
947 * a dequeue/enqueue event is a NOP)
949 if (se != cfs_rq->curr)
950 update_stats_wait_start(cfs_rq, se);
954 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
957 * Mark the end of the wait period if dequeueing a waiting task:
960 if (se != cfs_rq->curr)
961 update_stats_wait_end(cfs_rq, se);
965 * We are picking a new current task - update its stats:
968 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
971 * We are starting a new run period:
973 se->exec_start = rq_clock_task(rq_of(cfs_rq));
976 /**************************************************
977 * Scheduling class queueing methods:
980 #ifdef CONFIG_NUMA_BALANCING
982 * Approximate time to scan a full NUMA task in ms. The task scan period is
983 * calculated based on the task's virtual memory size and
984 * numa_balancing_scan_size.
986 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
987 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
989 /* Portion of address space to scan in MB */
990 unsigned int sysctl_numa_balancing_scan_size = 256;
992 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
993 unsigned int sysctl_numa_balancing_scan_delay = 1000;
995 static unsigned int task_nr_scan_windows(struct task_struct *p)
997 unsigned long rss = 0;
998 unsigned long nr_scan_pages;
1001 * Calculations based on RSS as non-present and empty pages are skipped
1002 * by the PTE scanner, and NUMA hinting faults should be trapped based on resident pages.
1005 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1006 rss = get_mm_rss(p->mm);
1008 rss = nr_scan_pages;
1010 rss = round_up(rss, nr_scan_pages);
1011 return rss / nr_scan_pages;
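/*
 * Worked example (a sketch, assuming 4KB pages and the default
 * numa_balancing_scan_size of 256MB): nr_scan_pages = 256 << (20 - 12)
 * = 65536 pages. A task with 1GB of RSS (262144 pages) rounds up to
 * 262144 and yields 262144 / 65536 = 4 scan windows, i.e. four scan
 * passes are needed to cover its resident memory.
 */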
1014 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1015 #define MAX_SCAN_WINDOW 2560
1017 static unsigned int task_scan_min(struct task_struct *p)
1019 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1020 unsigned int scan, floor;
1021 unsigned int windows = 1;
1023 if (scan_size < MAX_SCAN_WINDOW)
1024 windows = MAX_SCAN_WINDOW / scan_size;
1025 floor = 1000 / windows;
1027 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1028 return max_t(unsigned int, floor, scan);
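/*
 * Worked example (illustrative, with default tunables): windows =
 * MAX_SCAN_WINDOW / 256 = 10, so floor = 1000 / 10 = 100ms. For the
 * 1GB-RSS task above (4 scan windows), scan = 1000 / 4 = 250ms, so the
 * minimum period is max(100, 250) = 250ms. Larger tasks get shorter
 * periods, but never below the 100ms floor, which caps the scan rate at
 * MAX_SCAN_WINDOW MB/sec.
 */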
1031 static unsigned int task_scan_max(struct task_struct *p)
1033 unsigned int smin = task_scan_min(p);
1036 /* Watch for min being lower than max due to floor calculations */
1037 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1038 return max(smin, smax);
1041 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1043 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1044 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1047 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1049 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1050 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1056 spinlock_t lock; /* nr_tasks, tasks */
1060 struct rcu_head rcu;
1061 nodemask_t active_nodes;
1062 unsigned long total_faults;
1064 * Faults_cpu is used to decide whether memory should move
1065 * towards the CPU. As a consequence, these stats are weighted
1066 * more by CPU use than by memory faults.
1068 unsigned long *faults_cpu;
1069 unsigned long faults[0];
1072 /* Shared or private faults. */
1073 #define NR_NUMA_HINT_FAULT_TYPES 2
1075 /* Memory and CPU locality */
1076 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1078 /* Averaged statistics, and temporary buffers. */
1079 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1081 pid_t task_numa_group_id(struct task_struct *p)
1083 return p->numa_group ? p->numa_group->gid : 0;
1087 * The averaged statistics, shared & private, memory & cpu,
1088 * occupy the first half of the array. The second half of the
1089 * array is for current counters, which are averaged into the
1090 * first set by task_numa_placement.
1092 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1094 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
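/*
 * Layout example (a sketch; assumes NUMA_MEM == 0 in the
 * numa_faults_stats enum): on a 2-node system the numa_faults array has
 * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids = 8 * 2 = 16 entries.
 * task_faults_idx(NUMA_MEM, 1, 1) = 2 * (0 * 2 + 1) + 1 = 3, i.e. the
 * private memory-fault counter for node 1 sits at offset 3; the buffer
 * counters (NUMA_MEMBUF/NUMA_CPUBUF) occupy the second half of the array.
 */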
1097 static inline unsigned long task_faults(struct task_struct *p, int nid)
1099 if (!p->numa_faults)
1102 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1103 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1106 static inline unsigned long group_faults(struct task_struct *p, int nid)
1111 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1112 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1115 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1117 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1118 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1121 /* Handle placement on systems where not all nodes are directly connected. */
1122 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1123 int maxdist, bool task)
1125 unsigned long score = 0;
1129 * All nodes are directly connected, and the same distance
1130 * from each other. No need for fancy placement algorithms.
1132 if (sched_numa_topology_type == NUMA_DIRECT)
1136 * This code is called for each node, introducing N^2 complexity,
1137 * which should be ok given the number of nodes rarely exceeds 8.
1139 for_each_online_node(node) {
1140 unsigned long faults;
1141 int dist = node_distance(nid, node);
1144 * The furthest away nodes in the system are not interesting
1145 * for placement; nid was already counted.
1147 if (dist == sched_max_numa_distance || node == nid)
1151 * On systems with a backplane NUMA topology, compare groups
1152 * of nodes, and move tasks towards the group with the most
1153 * memory accesses. When comparing two nodes at distance
1154 * "hoplimit", only nodes closer by than "hoplimit" are part
1155 * of each group. Skip other nodes.
1157 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1161 /* Add up the faults from nearby nodes. */
1163 faults = task_faults(p, node);
1165 faults = group_faults(p, node);
1168 * On systems with a glueless mesh NUMA topology, there are
1169 * no fixed "groups of nodes". Instead, nodes that are not
1170 * directly connected bounce traffic through intermediate
1171 * nodes; a numa_group can occupy any set of nodes.
1172 * The further away a node is, the less the faults count.
1173 * This seems to result in good task placement.
1175 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1176 faults *= (sched_max_numa_distance - dist);
1177 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1187 * These return the fraction of accesses done by a particular task, or
1188 * task group, on a particular numa node. The group weight is given a
1189 * larger multiplier, in order to group tasks together that are almost
1190 * evenly spread out between numa nodes.
1192 static inline unsigned long task_weight(struct task_struct *p, int nid,
1195 unsigned long faults, total_faults;
1197 if (!p->numa_faults)
1200 total_faults = p->total_numa_faults;
1205 faults = task_faults(p, nid);
1206 faults += score_nearby_nodes(p, nid, dist, true);
1208 return 1000 * faults / total_faults;
1211 static inline unsigned long group_weight(struct task_struct *p, int nid,
1214 unsigned long faults, total_faults;
1219 total_faults = p->numa_group->total_faults;
1224 faults = group_faults(p, nid);
1225 faults += score_nearby_nodes(p, nid, dist, false);
1227 return 1000 * faults / total_faults;
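/*
 * Worked example (illustrative, for a NUMA_DIRECT topology where
 * score_nearby_nodes() contributes nothing): a task with
 * total_numa_faults = 2000, of which 600 were on nid, gets
 * task_weight = 1000 * 600 / 2000 = 300, i.e. the weights are
 * per-mille fractions of the task's (or group's) total fault activity.
 */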
1230 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1231 int src_nid, int dst_cpu)
1233 struct numa_group *ng = p->numa_group;
1234 int dst_nid = cpu_to_node(dst_cpu);
1235 int last_cpupid, this_cpupid;
1237 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1240 * Multi-stage node selection is used in conjunction with a periodic
1241 * migration fault to build a temporal task<->page relation. By using
1242 * a two-stage filter we remove short/unlikely relations.
1244 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1245 * a task's usage of a particular page (n_p) per total usage of this
1246 * page (n_t) (in a given time-span) to a probability.
1248 * Our periodic faults will sample this probability and getting the
1249 * same result twice in a row, given these samples are fully
1250 * independent, is then given by P(n)^2, provided our sample period
1251 * is sufficiently short compared to the usage pattern.
1253 * This quadratic squishes small probabilities, making it less likely we
1254 * act on an unlikely task<->page relation.
1256 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1257 if (!cpupid_pid_unset(last_cpupid) &&
1258 cpupid_to_nid(last_cpupid) != dst_nid)
1261 /* Always allow migrate on private faults */
1262 if (cpupid_match_pid(p, last_cpupid))
1265 /* A shared fault, but p->numa_group has not been set up yet. */
1270 * Do not migrate if the destination is not a node that
1271 * is actively used by this numa group.
1273 if (!node_isset(dst_nid, ng->active_nodes))
1277 * Source is a node that is not actively used by this
1278 * numa group, while the destination is. Migrate.
1280 if (!node_isset(src_nid, ng->active_nodes))
1284 * Both source and destination are nodes in active
1285 * use by this numa group. Maximize memory bandwidth
1286 * by migrating from more heavily used groups, to less
1287 * heavily used ones, spreading the load around.
1288 * Use a 1/4 hysteresis to avoid spurious page movement.
1290 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
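/*
 * Worked example of the 1/4 hysteresis (a sketch): with
 * group_faults(src) = 400, the page only migrates when
 * group_faults(dst) < 400 * 3/4 = 300, so dst = 280 migrates while
 * dst = 320 stays put, avoiding ping-ponging pages between two nodes
 * the group uses almost equally.
 */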
1293 static unsigned long weighted_cpuload(const int cpu);
1294 static unsigned long source_load(int cpu, int type);
1295 static unsigned long target_load(int cpu, int type);
1296 static unsigned long capacity_of(int cpu);
1297 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1299 /* Cached statistics for all CPUs within a node */
1301 unsigned long nr_running;
1304 /* Total compute capacity of CPUs on a node */
1305 unsigned long compute_capacity;
1307 /* Approximate capacity in terms of runnable tasks on a node */
1308 unsigned long task_capacity;
1309 int has_free_capacity;
1313 * XXX borrowed from update_sg_lb_stats
1315 static void update_numa_stats(struct numa_stats *ns, int nid)
1317 int smt, cpu, cpus = 0;
1318 unsigned long capacity;
1320 memset(ns, 0, sizeof(*ns));
1321 for_each_cpu(cpu, cpumask_of_node(nid)) {
1322 struct rq *rq = cpu_rq(cpu);
1324 ns->nr_running += rq->nr_running;
1325 ns->load += weighted_cpuload(cpu);
1326 ns->compute_capacity += capacity_of(cpu);
1332 * If we raced with hotplug and there are no CPUs left in our mask
1333 * the @ns structure is NULL'ed and task_numa_compare() will
1334 * not find this node attractive.
1336 * We'll either bail at !has_free_capacity, or we'll detect a huge
1337 * imbalance and bail there.
1342 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1343 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1344 capacity = cpus / smt; /* cores */
1346 ns->task_capacity = min_t(unsigned, capacity,
1347 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1348 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
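/*
 * Worked example (illustrative): a node with 8 logical CPUs whose SMT
 * siblings each report capacity_of() ~= 589 gives compute_capacity ~=
 * 4712, so smt = DIV_ROUND_UP(1024 * 8, 4712) = 2 and capacity = 8 / 2
 * = 4 cores; task_capacity = min(4, DIV_ROUND_CLOSEST(4712, 1024)) = 4,
 * and the node reports free capacity only while fewer than 4 tasks run.
 */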
1351 struct task_numa_env {
1352 struct task_struct *p;
1354 int src_cpu, src_nid;
1355 int dst_cpu, dst_nid;
1357 struct numa_stats src_stats, dst_stats;
1362 struct task_struct *best_task;
1367 static void task_numa_assign(struct task_numa_env *env,
1368 struct task_struct *p, long imp)
1371 put_task_struct(env->best_task);
1374 env->best_imp = imp;
1375 env->best_cpu = env->dst_cpu;
1378 static bool load_too_imbalanced(long src_load, long dst_load,
1379 struct task_numa_env *env)
1382 long orig_src_load, orig_dst_load;
1383 long src_capacity, dst_capacity;
1386 * The load is corrected for the CPU capacity available on each node.
1389 * ------------ vs ---------
1390 * src_capacity dst_capacity
1392 src_capacity = env->src_stats.compute_capacity;
1393 dst_capacity = env->dst_stats.compute_capacity;
1395 /* We care about the slope of the imbalance, not the direction. */
1396 if (dst_load < src_load)
1397 swap(dst_load, src_load);
1399 /* Is the difference below the threshold? */
1400 imb = dst_load * src_capacity * 100 -
1401 src_load * dst_capacity * env->imbalance_pct;
1406 * The imbalance is above the allowed threshold.
1407 * Compare it with the old imbalance.
1409 orig_src_load = env->src_stats.load;
1410 orig_dst_load = env->dst_stats.load;
1412 if (orig_dst_load < orig_src_load)
1413 swap(orig_dst_load, orig_src_load);
1415 old_imb = orig_dst_load * src_capacity * 100 -
1416 orig_src_load * dst_capacity * env->imbalance_pct;
1418 /* Would this change make things worse? */
1419 return (imb > old_imb);
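/*
 * Worked example (a sketch, assuming equal src/dst compute capacity and
 * the default NUMA imbalance_pct of 112): with src_load = 1000 and
 * dst_load = 1100, imb = 1100 * 100 - 1000 * 112 < 0, so the move is
 * within the 12% allowance. With dst_load = 1200 the imbalance exceeds
 * the threshold and is only tolerated if it is no worse than the
 * pre-existing imbalance (old_imb).
 */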
1423 * This checks if the overall compute and NUMA accesses of the system would
1424 * be improved if the source task were migrated to the target dst_cpu, taking
1425 * into account that it might be best if the task running on the dst_cpu were
1426 * exchanged with the source task.
1428 static void task_numa_compare(struct task_numa_env *env,
1429 long taskimp, long groupimp)
1431 struct rq *src_rq = cpu_rq(env->src_cpu);
1432 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1433 struct task_struct *cur;
1434 long src_load, dst_load;
1436 long imp = env->p->numa_group ? groupimp : taskimp;
1438 int dist = env->dist;
1439 bool assigned = false;
1443 raw_spin_lock_irq(&dst_rq->lock);
1446 * No need to move the exiting task or idle task.
1448 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1452 * The task_struct must be protected here to protect the
1453 * p->numa_faults access in the task_weight since the
1454 * numa_faults could already be freed in the following path:
1455 * finish_task_switch()
1456 * --> put_task_struct()
1457 * --> __put_task_struct()
1458 * --> task_numa_free()
1460 get_task_struct(cur);
1463 raw_spin_unlock_irq(&dst_rq->lock);
1466 * Because we have preemption enabled we can get migrated around and
1467 * end up trying to select ourselves (current == env->p) as a swap candidate.
1473 * "imp" is the fault differential for the source task between the
1474 * source and destination node. Calculate the total differential for
1475 * the source task and potential destination task. The more negative
1476 * the value is, the more remote accesses would be expected to
1477 * be incurred if the tasks were swapped.
1480 /* Skip this swap candidate if cannot move to the source cpu */
1481 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1485 * If dst and source tasks are in the same NUMA group, or not
1486 * in any group then look only at task weights.
1488 if (cur->numa_group == env->p->numa_group) {
1489 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1490 task_weight(cur, env->dst_nid, dist);
1492 * Add some hysteresis to prevent swapping the
1493 * tasks within a group over tiny differences.
1495 if (cur->numa_group)
1499 * Compare the group weights. If a task is all by
1500 * itself (not part of a group), use the task weight
1503 if (cur->numa_group)
1504 imp += group_weight(cur, env->src_nid, dist) -
1505 group_weight(cur, env->dst_nid, dist);
1507 imp += task_weight(cur, env->src_nid, dist) -
1508 task_weight(cur, env->dst_nid, dist);
1512 if (imp <= env->best_imp && moveimp <= env->best_imp)
1516 /* Is there capacity at our destination? */
1517 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1518 !env->dst_stats.has_free_capacity)
1524 /* Balance doesn't matter much if we're running a task per cpu */
1525 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1526 dst_rq->nr_running == 1)
1530 * In the overloaded case, try and keep the load balanced.
1533 load = task_h_load(env->p);
1534 dst_load = env->dst_stats.load + load;
1535 src_load = env->src_stats.load - load;
1537 if (moveimp > imp && moveimp > env->best_imp) {
1539 * If the improvement from just moving env->p direction is
1540 * better than swapping tasks around, check if a move is
1541 * possible. Store a slightly smaller score than moveimp,
1542 * so an actually idle CPU will win.
1544 if (!load_too_imbalanced(src_load, dst_load, env)) {
1546 put_task_struct(cur);
1552 if (imp <= env->best_imp)
1556 load = task_h_load(cur);
1561 if (load_too_imbalanced(src_load, dst_load, env))
1565 * One idle CPU per node is evaluated for a task numa move.
1566 * Call select_idle_sibling to maybe find a better one.
1569 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1574 task_numa_assign(env, cur, imp);
1578 * The dst_rq->curr isn't assigned. The protection for task_struct is
1581 if (cur && !assigned)
1582 put_task_struct(cur);
1585 static void task_numa_find_cpu(struct task_numa_env *env,
1586 long taskimp, long groupimp)
1590 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1591 /* Skip this CPU if the source task cannot migrate */
1592 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1596 task_numa_compare(env, taskimp, groupimp);
1600 /* Only move tasks to a NUMA node less busy than the current node. */
1601 static bool numa_has_capacity(struct task_numa_env *env)
1603 struct numa_stats *src = &env->src_stats;
1604 struct numa_stats *dst = &env->dst_stats;
1606 if (src->has_free_capacity && !dst->has_free_capacity)
1610 * Only consider a task move if the source has a higher load
1611 * than the destination, corrected for CPU capacity on each node.
1613 * src->load dst->load
1614 * --------------------- vs ---------------------
1615 * src->compute_capacity dst->compute_capacity
1617 if (src->load * dst->compute_capacity * env->imbalance_pct >
1619 dst->load * src->compute_capacity * 100)
1625 static int task_numa_migrate(struct task_struct *p)
1627 struct task_numa_env env = {
1630 .src_cpu = task_cpu(p),
1631 .src_nid = task_node(p),
1633 .imbalance_pct = 112,
1639 struct sched_domain *sd;
1640 unsigned long taskweight, groupweight;
1642 long taskimp, groupimp;
1645 * Pick the lowest SD_NUMA domain, as that would have the smallest
1646 * imbalance and would be the first to start moving tasks about.
1648 * And we want to avoid any moving of tasks about, as that would create
1649 * random movement of tasks -- counter the numa conditions we're trying to fix.
1653 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1655 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1659 * Cpusets can break the scheduler domain tree into smaller
1660 * balance domains, some of which do not cross NUMA boundaries.
1661 * Tasks that are "trapped" in such domains cannot be migrated
1662 * elsewhere, so there is no point in (re)trying.
1664 if (unlikely(!sd)) {
1665 p->numa_preferred_nid = task_node(p);
1669 env.dst_nid = p->numa_preferred_nid;
1670 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1671 taskweight = task_weight(p, env.src_nid, dist);
1672 groupweight = group_weight(p, env.src_nid, dist);
1673 update_numa_stats(&env.src_stats, env.src_nid);
1674 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1675 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1676 update_numa_stats(&env.dst_stats, env.dst_nid);
1678 /* Try to find a spot on the preferred nid. */
1679 if (numa_has_capacity(&env))
1680 task_numa_find_cpu(&env, taskimp, groupimp);
1683 * Look at other nodes in these cases:
1684 * - there is no space available on the preferred_nid
1685 * - the task is part of a numa_group that is interleaved across
1686 * multiple NUMA nodes; in order to better consolidate the group,
1687 * we need to check other locations.
1689 if (env.best_cpu == -1 || (p->numa_group &&
1690 nodes_weight(p->numa_group->active_nodes) > 1)) {
1691 for_each_online_node(nid) {
1692 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1695 dist = node_distance(env.src_nid, env.dst_nid);
1696 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1698 taskweight = task_weight(p, env.src_nid, dist);
1699 groupweight = group_weight(p, env.src_nid, dist);
1702 /* Only consider nodes where both task and groups benefit */
1703 taskimp = task_weight(p, nid, dist) - taskweight;
1704 groupimp = group_weight(p, nid, dist) - groupweight;
1705 if (taskimp < 0 && groupimp < 0)
1710 update_numa_stats(&env.dst_stats, env.dst_nid);
1711 if (numa_has_capacity(&env))
1712 task_numa_find_cpu(&env, taskimp, groupimp);
1717 * If the task is part of a workload that spans multiple NUMA nodes,
1718 * and is migrating into one of the workload's active nodes, remember
1719 * this node as the task's preferred numa node, so the workload can settle down.
1721 * A task that migrated to a second choice node will be better off
1722 * trying for a better one later. Do not set the preferred node here.
1724 if (p->numa_group) {
1725 if (env.best_cpu == -1)
1730 if (node_isset(nid, p->numa_group->active_nodes))
1731 sched_setnuma(p, env.dst_nid);
1734 /* No better CPU than the current one was found. */
1735 if (env.best_cpu == -1)
1739 * Reset the scan period if the task is being rescheduled on an
1740 * alternative node to recheck if the tasks is now properly placed.
1742 p->numa_scan_period = task_scan_min(p);
1744 if (env.best_task == NULL) {
1745 ret = migrate_task_to(p, env.best_cpu);
1747 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1751 ret = migrate_swap(p, env.best_task);
1753 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1754 put_task_struct(env.best_task);
1758 /* Attempt to migrate a task to a CPU on the preferred node. */
1759 static void numa_migrate_preferred(struct task_struct *p)
1761 unsigned long interval = HZ;
1763 /* This task has no NUMA fault statistics yet */
1764 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1767 /* Periodically retry migrating the task to the preferred node */
1768 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1769 p->numa_migrate_retry = jiffies + interval;
1771 /* Success if task is already running on preferred CPU */
1772 if (task_node(p) == p->numa_preferred_nid)
1775 /* Otherwise, try migrate to a CPU on the preferred node */
1776 task_numa_migrate(p);
1780 * Find the nodes on which the workload is actively running. We do this by
1781 * tracking the nodes from which NUMA hinting faults are triggered. This can
1782 * be different from the set of nodes where the workload's memory is currently located.
1785 * The bitmask is used to make smarter decisions on when to do NUMA page
1786 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1787 * are added when they cause over 6/16 of the maximum number of faults, but
1788 * only removed when they drop below 3/16.
1790 static void update_numa_active_node_mask(struct numa_group *numa_group)
1792 unsigned long faults, max_faults = 0;
1795 for_each_online_node(nid) {
1796 faults = group_faults_cpu(numa_group, nid);
1797 if (faults > max_faults)
1798 max_faults = faults;
1801 for_each_online_node(nid) {
1802 faults = group_faults_cpu(numa_group, nid);
1803 if (!node_isset(nid, numa_group->active_nodes)) {
1804 if (faults > max_faults * 6 / 16)
1805 node_set(nid, numa_group->active_nodes);
1806 } else if (faults < max_faults * 3 / 16)
1807 node_clear(nid, numa_group->active_nodes);
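/*
 * Worked example of the 6/16 / 3/16 hysteresis (illustrative): if the
 * busiest node recorded max_faults = 1600 CPU faults, a node is added to
 * active_nodes once it accounts for more than 600 faults and only
 * dropped again when it falls below 300, so nodes hovering around the
 * threshold do not flap in and out of the mask.
 */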
1812 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1813 * increments. The more local the fault statistics are, the higher the scan
1814 * period will be for the next scan window. If local/(local+remote) ratio is
1815 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1816 * the scan period will decrease. Aim for 70% local accesses.
1818 #define NUMA_PERIOD_SLOTS 10
1819 #define NUMA_PERIOD_THRESHOLD 7
1822 * Increase the scan period (slow down scanning) if the majority of
1823 * our memory is already on our local node, or if the majority of
1824 * the page accesses are shared with other processes.
1825 * Otherwise, decrease the scan period.
1827 static void update_task_scan_period(struct task_struct *p,
1828 unsigned long shared, unsigned long private)
1830 unsigned int period_slot;
1834 unsigned long remote = p->numa_faults_locality[0];
1835 unsigned long local = p->numa_faults_locality[1];
1838 * If there were no recorded hinting faults then either the task is
1839 * completely idle or all activity is in areas that are not of interest
1840 * to automatic numa balancing. Related to that, if there were failed
1841 * migrations then it implies we are migrating too quickly or the local
1842 * node is overloaded. In either case, scan more slowly.
1844 if (local + shared == 0 || p->numa_faults_locality[2]) {
1845 p->numa_scan_period = min(p->numa_scan_period_max,
1846 p->numa_scan_period << 1);
1848 p->mm->numa_next_scan = jiffies +
1849 msecs_to_jiffies(p->numa_scan_period);
1855 * Prepare to scale scan period relative to the current period.
1856 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1857 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1858 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1860 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1861 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1862 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1863 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1866 diff = slot * period_slot;
1868 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1871 * Scale scan rate increases based on sharing. There is an
1872 * inverse relationship between the degree of sharing and
1873 * the adjustment made to the scanning period. Broadly
1874 * speaking, the intent is that there is little point
1875 * scanning faster if shared accesses dominate, as it may
1876 * simply bounce migrations uselessly.
1878 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1879 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1882 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1883 task_scan_min(p), task_scan_max(p));
1884 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
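/*
 * Worked example (a sketch): with numa_scan_period = 1000ms,
 * period_slot = DIV_ROUND_UP(1000, 10) = 100. If 900 of 1000 hinting
 * faults were local, ratio = 9, slot = 9 - 7 = 2 and diff = 200ms.
 * With private = shared (say 50 each), the sharing ratio is
 * DIV_ROUND_UP(50 * 10, 101) = 5, so diff is halved to 100ms and the
 * next period becomes clamp(1100ms, ...): mostly-local access slows
 * scanning down, while heavy sharing damps the adjustment.
 */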
1888 * Get the fraction of time the task has been running since the last
1889 * NUMA placement cycle. The scheduler keeps similar statistics, but
1890 * decays those on a 32ms period, which is orders of magnitude off
1891 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1892 * stats only if the task is so new there are no NUMA statistics yet.
1894 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1896 u64 runtime, delta, now;
1897 /* Use the start of this time slice to avoid calculations. */
1898 now = p->se.exec_start;
1899 runtime = p->se.sum_exec_runtime;
1901 if (p->last_task_numa_placement) {
1902 delta = runtime - p->last_sum_exec_runtime;
1903 *period = now - p->last_task_numa_placement;
1905 /* Avoid time going backwards, prevent potential divide error: */
1906 if (unlikely((s64)*period < 0))
1909 delta = p->se.avg.load_sum / p->se.load.weight;
1910 *period = LOAD_AVG_MAX;
1913 p->last_sum_exec_runtime = runtime;
1914 p->last_task_numa_placement = now;
1920 * Determine the preferred nid for a task in a numa_group. This needs to
1921 * be done in a way that produces consistent results with group_weight,
1922 * otherwise workloads might not converge.
1924 static int preferred_group_nid(struct task_struct *p, int nid)
1929 /* Direct connections between all NUMA nodes. */
1930 if (sched_numa_topology_type == NUMA_DIRECT)
1934 * On a system with glueless mesh NUMA topology, group_weight
1935 * scores nodes according to the number of NUMA hinting faults on
1936 * both the node itself, and on nearby nodes.
1938 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1939 unsigned long score, max_score = 0;
1940 int node, max_node = nid;
1942 dist = sched_max_numa_distance;
1944 for_each_online_node(node) {
1945 score = group_weight(p, node, dist);
1946 if (score > max_score) {
1955 * Finding the preferred nid in a system with NUMA backplane
1956 * interconnect topology is more involved. The goal is to locate
1957 * tasks from numa_groups near each other in the system, and
1958 * untangle workloads from different sides of the system. This requires
1959 * searching down the hierarchy of node groups, recursively searching
1960 * inside the highest scoring group of nodes. The nodemask tricks
1961 * keep the complexity of the search down.
1963 nodes = node_online_map;
1964 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1965 unsigned long max_faults = 0;
1966 nodemask_t max_group = NODE_MASK_NONE;
1969 /* Are there nodes at this distance from each other? */
1970 if (!find_numa_distance(dist))
1973 for_each_node_mask(a, nodes) {
1974 unsigned long faults = 0;
1975 nodemask_t this_group;
1976 nodes_clear(this_group);
1978 /* Sum group's NUMA faults; includes a==b case. */
1979 for_each_node_mask(b, nodes) {
1980 if (node_distance(a, b) < dist) {
1981 faults += group_faults(p, b);
1982 node_set(b, this_group);
1983 node_clear(b, nodes);
1987 /* Remember the top group. */
1988 if (faults > max_faults) {
1989 max_faults = faults;
1990 max_group = this_group;
1992 * subtle: at the smallest distance there is
1993 * just one node left in each "group", the
1994 * winner is the preferred nid.
1999 /* Next round, evaluate the nodes within max_group. */
2007 static void task_numa_placement(struct task_struct *p)
2009 int seq, nid, max_nid = -1, max_group_nid = -1;
2010 unsigned long max_faults = 0, max_group_faults = 0;
2011 unsigned long fault_types[2] = { 0, 0 };
2012 unsigned long total_faults;
2013 u64 runtime, period;
2014 spinlock_t *group_lock = NULL;
2017 * The p->mm->numa_scan_seq field gets updated without
2018 * exclusive access. Use READ_ONCE() here to ensure
2019 * that the field is read in a single access:
2021 seq = READ_ONCE(p->mm->numa_scan_seq);
2022 if (p->numa_scan_seq == seq)
2024 p->numa_scan_seq = seq;
2025 p->numa_scan_period_max = task_scan_max(p);
2027 total_faults = p->numa_faults_locality[0] +
2028 p->numa_faults_locality[1];
2029 runtime = numa_get_avg_runtime(p, &period);
2031 /* If the task is part of a group prevent parallel updates to group stats */
2032 if (p->numa_group) {
2033 group_lock = &p->numa_group->lock;
2034 spin_lock_irq(group_lock);
2037 /* Find the node with the highest number of faults */
2038 for_each_online_node(nid) {
2039 /* Keep track of the offsets in numa_faults array */
2040 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2041 unsigned long faults = 0, group_faults = 0;
2044 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2045 long diff, f_diff, f_weight;
2047 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2048 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2049 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2050 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2052 /* Decay existing window, copy faults since last scan */
2053 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2054 fault_types[priv] += p->numa_faults[membuf_idx];
2055 p->numa_faults[membuf_idx] = 0;
2058 * Normalize the faults_from, so all tasks in a group
2059 * count according to CPU use, instead of by the raw
2060 * number of faults. Tasks with little runtime have
2061 * little over-all impact on throughput, and thus their
2062 * faults are less important.
2064 f_weight = div64_u64(runtime << 16, period + 1);
2065 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2067 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2068 p->numa_faults[cpubuf_idx] = 0;
2070 p->numa_faults[mem_idx] += diff;
2071 p->numa_faults[cpu_idx] += f_diff;
2072 faults += p->numa_faults[mem_idx];
2073 p->total_numa_faults += diff;
2074 if (p->numa_group) {
2076 * safe because we can only change our own group
2078 * mem_idx represents the offset for a given
2079 * nid and priv in a specific region because it
2080 * is at the beginning of the numa_faults array.
2082 p->numa_group->faults[mem_idx] += diff;
2083 p->numa_group->faults_cpu[mem_idx] += f_diff;
2084 p->numa_group->total_faults += diff;
2085 group_faults += p->numa_group->faults[mem_idx];
2089 if (faults > max_faults) {
2090 max_faults = faults;
2094 if (group_faults > max_group_faults) {
2095 max_group_faults = group_faults;
2096 max_group_nid = nid;
2100 update_task_scan_period(p, fault_types[0], fault_types[1]);
2102 if (p->numa_group) {
2103 update_numa_active_node_mask(p->numa_group);
2104 spin_unlock_irq(group_lock);
2105 max_nid = preferred_group_nid(p, max_group_nid);
2109 /* Set the new preferred node */
2110 if (max_nid != p->numa_preferred_nid)
2111 sched_setnuma(p, max_nid);
2113 if (task_node(p) != p->numa_preferred_nid)
2114 numa_migrate_preferred(p);
2118 static inline int get_numa_group(struct numa_group *grp)
2120 return atomic_inc_not_zero(&grp->refcount);
2123 static inline void put_numa_group(struct numa_group *grp)
2125 if (atomic_dec_and_test(&grp->refcount))
2126 kfree_rcu(grp, rcu);
2129 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2132 struct numa_group *grp, *my_grp;
2133 struct task_struct *tsk;
2135 int cpu = cpupid_to_cpu(cpupid);
2138 if (unlikely(!p->numa_group)) {
2139 unsigned int size = sizeof(struct numa_group) +
2140 4*nr_node_ids*sizeof(unsigned long);
2142 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2146 atomic_set(&grp->refcount, 1);
2147 spin_lock_init(&grp->lock);
2149 /* Second half of the array tracks nids where faults happen */
2150 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2153 node_set(task_node(current), grp->active_nodes);
2155 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2156 grp->faults[i] = p->numa_faults[i];
2158 grp->total_faults = p->total_numa_faults;
2161 rcu_assign_pointer(p->numa_group, grp);
2165 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2167 if (!cpupid_match_pid(tsk, cpupid))
2170 grp = rcu_dereference(tsk->numa_group);
2174 my_grp = p->numa_group;
2179 * Only join the other group if it's bigger; if we're the bigger group,
2180 * the other task will join us.
2182 if (my_grp->nr_tasks > grp->nr_tasks)
2186 * Tie-break on the grp address.
2188 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2191 /* Always join threads in the same process. */
2192 if (tsk->mm == current->mm)
2195 /* Simple filter to avoid false positives due to PID collisions */
2196 if (flags & TNF_SHARED)
2199 /* Update priv based on whether false sharing was detected */
2202 if (join && !get_numa_group(grp))
2210 BUG_ON(irqs_disabled());
2211 double_lock_irq(&my_grp->lock, &grp->lock);
2213 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2214 my_grp->faults[i] -= p->numa_faults[i];
2215 grp->faults[i] += p->numa_faults[i];
2217 my_grp->total_faults -= p->total_numa_faults;
2218 grp->total_faults += p->total_numa_faults;
2223 spin_unlock(&my_grp->lock);
2224 spin_unlock_irq(&grp->lock);
2226 rcu_assign_pointer(p->numa_group, grp);
2228 put_numa_group(my_grp);
2237 * Get rid of NUMA statistics associated with a task (either current or dead).
2238 * If @final is set, the task is dead and has reached refcount zero, so we can
2239 * safely free all relevant data structures. Otherwise, there might be
2240 * concurrent reads from places like load balancing and procfs, and we should
2241 * reset the data back to default state without freeing ->numa_faults.
2243 void task_numa_free(struct task_struct *p, bool final)
2245 struct numa_group *grp = p->numa_group;
2246 unsigned long *numa_faults = p->numa_faults;
2247 unsigned long flags;
2254 spin_lock_irqsave(&grp->lock, flags);
2255 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2256 grp->faults[i] -= p->numa_faults[i];
2257 grp->total_faults -= p->total_numa_faults;
2260 spin_unlock_irqrestore(&grp->lock, flags);
2261 RCU_INIT_POINTER(p->numa_group, NULL);
2262 put_numa_group(grp);
2266 p->numa_faults = NULL;
2269 p->total_numa_faults = 0;
2270 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2276 * Got a PROT_NONE fault for a page on @node.
2278 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2280 struct task_struct *p = current;
2281 bool migrated = flags & TNF_MIGRATED;
2282 int cpu_node = task_node(current);
2283 int local = !!(flags & TNF_FAULT_LOCAL);
2286 if (!static_branch_likely(&sched_numa_balancing))
2289 /* for example, ksmd faulting in a user's mm */
2293 /* Allocate buffer to track faults on a per-node basis */
2294 if (unlikely(!p->numa_faults)) {
2295 int size = sizeof(*p->numa_faults) *
2296 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2298 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2299 if (!p->numa_faults)
2302 p->total_numa_faults = 0;
2303 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2307 * First accesses are treated as private, otherwise consider accesses
2308 * to be private if the accessing pid has not changed
2310 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2313 priv = cpupid_match_pid(p, last_cpupid);
2314 if (!priv && !(flags & TNF_NO_GROUP))
2315 task_numa_group(p, last_cpupid, flags, &priv);
2319 * If a workload spans multiple NUMA nodes, a shared fault that
2320 * occurs wholly within the set of nodes that the workload is
2321 * actively using should be counted as local. This allows the
2322 * scan rate to slow down when a workload has settled down.
2324 if (!priv && !local && p->numa_group &&
2325 node_isset(cpu_node, p->numa_group->active_nodes) &&
2326 node_isset(mem_node, p->numa_group->active_nodes))
2329 task_numa_placement(p);
2332 * Retry task to preferred node migration periodically, in case it
2333 * previously failed, or the scheduler moved us.
2335 if (time_after(jiffies, p->numa_migrate_retry))
2336 numa_migrate_preferred(p);
2339 p->numa_pages_migrated += pages;
2340 if (flags & TNF_MIGRATE_FAIL)
2341 p->numa_faults_locality[2] += pages;
2343 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2344 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2345 p->numa_faults_locality[local] += pages;
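/*
 * Note: the NUMA_MEMBUF/NUMA_CPUBUF counters above only buffer faults
 * between scans; they are folded into the decayed NUMA_MEM/NUMA_CPU
 * statistics (and then cleared) by task_numa_placement(), as shown
 * earlier in this file.
 */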
2348 static void reset_ptenuma_scan(struct task_struct *p)
2351 * We only did a read acquisition of the mmap sem, so
2352 * p->mm->numa_scan_seq is written to without exclusive access
2353 * and the update is not guaranteed to be atomic. That's not
2354 * much of an issue though, since this is just used for
2355 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2356 * expensive, to avoid any form of compiler optimizations:
2358 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2359 p->mm->numa_scan_offset = 0;
2363 * The expensive part of numa migration is done from task_work context.
2364 * Triggered from task_tick_numa().
2366 void task_numa_work(struct callback_head *work)
2368 unsigned long migrate, next_scan, now = jiffies;
2369 struct task_struct *p = current;
2370 struct mm_struct *mm = p->mm;
2371 struct vm_area_struct *vma;
2372 unsigned long start, end;
2373 unsigned long nr_pte_updates = 0;
2374 long pages, virtpages;
2376 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2378 work->next = work; /* protect against double add */
2380 * Who cares about NUMA placement when they're dying.
2382 * NOTE: make sure not to dereference p->mm before this check,
2383 * exit_task_work() happens _after_ exit_mm() so we could be called
2384 * without p->mm even though we still had it when we enqueued this
2387 if (p->flags & PF_EXITING)
2390 if (!mm->numa_next_scan) {
2391 mm->numa_next_scan = now +
2392 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2396 * Enforce maximal scan/migration frequency..
2398 migrate = mm->numa_next_scan;
2399 if (time_before(now, migrate))
2402 if (p->numa_scan_period == 0) {
2403 p->numa_scan_period_max = task_scan_max(p);
2404 p->numa_scan_period = task_scan_min(p);
2407 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2408 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2412 * Delay this task enough that another task of this mm will likely win
2413 * the next time around.
2415 p->node_stamp += 2 * TICK_NSEC;
2417 start = mm->numa_scan_offset;
2418 pages = sysctl_numa_balancing_scan_size;
2419 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2420 virtpages = pages * 8; /* Scan up to this much virtual space */
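/*
 * Example (assuming the default 256MB scan size and 4KB pages):
 * pages = 256 << 8 = 65536, i.e. 256MB worth of PTE updates, and
 * virtpages = 524288, i.e. up to 2GB of virtual address space may be
 * walked in one scan pass before giving up.
 */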
2425 if (!down_read_trylock(&mm->mmap_sem))
2427 vma = find_vma(mm, start);
2429 reset_ptenuma_scan(p);
2433 for (; vma; vma = vma->vm_next) {
2434 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2435 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2440 * Shared library pages mapped by multiple processes are not
2441 * migrated as it is expected they are cache replicated. Avoid
2442 * hinting faults in read-only file-backed mappings or the vdso
2443 * as migrating the pages will be of marginal benefit.
2446 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2450 * Skip inaccessible VMAs to avoid any confusion between
2451 * PROT_NONE and NUMA hinting ptes
2453 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2457 start = max(start, vma->vm_start);
2458 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2459 end = min(end, vma->vm_end);
2460 nr_pte_updates = change_prot_numa(vma, start, end);
2463 * Try to scan sysctl_numa_balancing_scan_size worth of
2464 * hpages that have at least one present PTE that
2465 * is not already pte-numa. If the VMA contains
2466 * areas that are unused or already full of prot_numa
2467 * PTEs, scan up to virtpages, to skip through those
2471 pages -= (end - start) >> PAGE_SHIFT;
2472 virtpages -= (end - start) >> PAGE_SHIFT;
2475 if (pages <= 0 || virtpages <= 0)
2479 } while (end != vma->vm_end);
2484 * It is possible to reach the end of the VMA list but the last few
2485 * VMAs are not guaranteed to be migratable. If they are not, we
2486 * would find the !migratable VMA on the next scan but not reset the
2487 * scanner to the start so check it now.
2490 mm->numa_scan_offset = start;
2492 reset_ptenuma_scan(p);
2493 up_read(&mm->mmap_sem);
2497 * Drive the periodic memory faults..
2499 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2501 struct callback_head *work = &curr->numa_work;
2505 * We don't care about NUMA placement if we don't have memory.
2507 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2511 * Using runtime rather than walltime has the dual advantage that
2512 * we (mostly) drive the selection from busy threads and that the
2513 * task needs to have done some actual work before we bother with
2516 now = curr->se.sum_exec_runtime;
2517 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2519 if (now > curr->node_stamp + period) {
2520 if (!curr->node_stamp)
2521 curr->numa_scan_period = task_scan_min(curr);
2522 curr->node_stamp += period;
2524 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2525 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2526 task_work_add(curr, work, true);
2531 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2535 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2539 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2542 #endif /* CONFIG_NUMA_BALANCING */
2545 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2547 update_load_add(&cfs_rq->load, se->load.weight);
2548 if (!parent_entity(se))
2549 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2551 if (entity_is_task(se)) {
2552 struct rq *rq = rq_of(cfs_rq);
2554 account_numa_enqueue(rq, task_of(se));
2555 list_add(&se->group_node, &rq->cfs_tasks);
2558 cfs_rq->nr_running++;
2562 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2564 update_load_sub(&cfs_rq->load, se->load.weight);
2565 if (!parent_entity(se))
2566 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2567 if (entity_is_task(se)) {
2568 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2569 list_del_init(&se->group_node);
2571 cfs_rq->nr_running--;
2574 #ifdef CONFIG_FAIR_GROUP_SCHED
2576 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2578 long tg_weight, load, shares;
2581 * This really should be: cfs_rq->avg.load_avg, but instead we use
2582 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2583 * the shares for small weight interactive tasks.
2585 load = scale_load_down(cfs_rq->load.weight);
2587 tg_weight = atomic_long_read(&tg->load_avg);
2589 /* Ensure tg_weight >= load */
2590 tg_weight -= cfs_rq->tg_load_avg_contrib;
2593 shares = (tg->shares * load);
2595 shares /= tg_weight;
2597 if (shares < MIN_SHARES)
2598 shares = MIN_SHARES;
2599 if (shares > tg->shares)
2600 shares = tg->shares;
2604 # else /* CONFIG_SMP */
2605 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2609 # endif /* CONFIG_SMP */
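/*
 * Worked example for calc_cfs_shares() above (illustrative values only):
 * with tg->shares = 1024 and this cfs_rq carrying half of the group's
 * total weight (load = 512, tg_weight = 1024), shares = 1024 * 512 / 1024
 * = 512. The result is then clamped to the [MIN_SHARES, tg->shares]
 * range, so a cfs_rq with a negligible share of the group load still
 * receives MIN_SHARES.
 */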
2611 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2612 unsigned long weight)
2615 /* commit outstanding execution time */
2616 if (cfs_rq->curr == se)
2617 update_curr(cfs_rq);
2618 account_entity_dequeue(cfs_rq, se);
2621 update_load_set(&se->load, weight);
2624 account_entity_enqueue(cfs_rq, se);
2627 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2629 static void update_cfs_shares(struct sched_entity *se)
2631 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2632 struct task_group *tg;
2638 if (throttled_hierarchy(cfs_rq))
2644 if (likely(se->load.weight == tg->shares))
2647 shares = calc_cfs_shares(cfs_rq, tg);
2649 reweight_entity(cfs_rq_of(se), se, shares);
2652 #else /* CONFIG_FAIR_GROUP_SCHED */
2653 static inline void update_cfs_shares(struct sched_entity *se)
2656 #endif /* CONFIG_FAIR_GROUP_SCHED */
2659 u32 sched_get_wake_up_idle(struct task_struct *p)
2661 u32 enabled = p->flags & PF_WAKE_UP_IDLE;
2665 EXPORT_SYMBOL(sched_get_wake_up_idle);
2667 int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
2669 int enable = !!wake_up_idle;
2672 p->flags |= PF_WAKE_UP_IDLE;
2674 p->flags &= ~PF_WAKE_UP_IDLE;
2678 EXPORT_SYMBOL(sched_set_wake_up_idle);
2680 static const u32 runnable_avg_yN_inv[] = {
2681 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2682 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2683 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2684 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2685 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2686 0x85aac367, 0x82cd8698,
2690 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2691 * over-estimates when re-combining.
2693 static const u32 runnable_avg_yN_sum[] = {
2694 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2695 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2696 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2701 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2703 static __always_inline u64 decay_load(u64 val, u64 n)
2705 unsigned int local_n;
2709 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2712 /* after bounds checking we can collapse to 32-bit */
2716 * As y^PERIOD = 1/2, we can combine
2717 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2718 * With a look-up table which covers y^n (n<PERIOD)
2720 * To achieve constant time decay_load.
2722 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2723 val >>= local_n / LOAD_AVG_PERIOD;
2724 local_n %= LOAD_AVG_PERIOD;
2727 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
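/*
 * Example: decay_load(val, 40) with LOAD_AVG_PERIOD = 32 halves val once
 * (val >>= 40 / 32) and then multiplies by y^8 ~= 0.84 via the inverse
 * table, giving a combined factor of ~0.42, i.e. 0.5^(40/32) as intended.
 */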
2732 * For updates fully spanning n periods, the contribution to runnable
2733 * average will be: \Sum 1024*y^n
2735 * We can compute this reasonably efficiently by combining:
2736 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2738 static u32 __compute_runnable_contrib(u64 n)
2742 if (likely(n <= LOAD_AVG_PERIOD))
2743 return runnable_avg_yN_sum[n];
2744 else if (unlikely(n >= LOAD_AVG_MAX_N))
2745 return LOAD_AVG_MAX;
2747 /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
2749 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2750 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2752 n -= LOAD_AVG_PERIOD;
2753 } while (n > LOAD_AVG_PERIOD);
2755 contrib = decay_load(contrib, n);
2756 return contrib + runnable_avg_yN_sum[n];
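/*
 * Example: __compute_runnable_contrib(40) takes one pass through the loop
 * (contrib = runnable_avg_yN_sum[32] = 23371, n becomes 8), decays that
 * by y^8 and adds runnable_avg_yN_sum[8] = 7437, approximating
 * \Sum 1024*y^k for 1 <= k <= 40.
 */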
2759 #ifdef CONFIG_SCHED_HMP
2761 /* CPU selection flag */
2762 #define SBC_FLAG_PREV_CPU 0x1
2763 #define SBC_FLAG_BEST_CAP_CPU 0x2
2764 #define SBC_FLAG_CPU_COST 0x4
2765 #define SBC_FLAG_MIN_COST 0x8
2766 #define SBC_FLAG_IDLE_LEAST_LOADED 0x10
2767 #define SBC_FLAG_IDLE_CSTATE 0x20
2768 #define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
2769 #define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
2770 #define SBC_FLAG_CSTATE_LOAD 0x100
2771 #define SBC_FLAG_BEST_SIBLING 0x200
2772 #define SBC_FLAG_WAKER_CPU 0x400
2773 #define SBC_FLAG_PACK_TASK 0x800
2775 /* Cluster selection flag */
2776 #define SBC_FLAG_COLOC_CLUSTER 0x10000
2777 #define SBC_FLAG_WAKER_CLUSTER 0x20000
2778 #define SBC_FLAG_BACKUP_CLUSTER 0x40000
2779 #define SBC_FLAG_BOOST_CLUSTER 0x80000
2781 struct cpu_select_env {
2782 struct task_struct *p;
2783 struct related_thread_group *rtg;
2786 u8 need_waker_cluster:1;
2788 enum sched_boost_policy boost_policy;
2791 DECLARE_BITMAP(candidate_list, NR_CPUS);
2792 DECLARE_BITMAP(backup_list, NR_CPUS);
2796 u32 sbc_best_cluster_flag;
2797 struct cpumask search_cpus;
2800 struct cluster_cpu_stats {
2801 int best_idle_cpu, least_loaded_cpu;
2802 int best_capacity_cpu, best_cpu, best_sibling_cpu;
2803 int min_cost, best_sibling_cpu_cost;
2804 int best_cpu_wakeup_latency;
2805 u64 min_load, best_load, best_sibling_cpu_load;
2806 s64 highest_spare_capacity;
2810 * Should the task be woken to any available idle cpu?
2812 * Waking tasks to an idle cpu has mixed implications for both performance and
2813 * power. In many cases, the scheduler can't correctly estimate the impact of
2814 * using idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an
2815 * external kernel module to pass a strong hint to the scheduler that the task
2816 * in question should be woken to an idle cpu, generally to improve performance.
2818 static inline int wake_to_idle(struct task_struct *p)
2820 return (current->flags & PF_WAKE_UP_IDLE) ||
2821 (p->flags & PF_WAKE_UP_IDLE);
2824 static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
2828 total_load = env->task_load + env->cpu_load;
2830 if (total_load > sched_spill_load ||
2831 (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
2837 static int skip_cpu(int cpu, struct cpu_select_env *env)
2839 int tcpu = task_cpu(env->p);
2845 if (is_reserved(cpu))
2848 switch (env->reason) {
2850 skip = !idle_cpu(cpu);
2852 case IRQLOAD_MIGRATION:
2853 /* Purposely fall through */
2855 skip = (cpu == tcpu);
2863 acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
2870 tcpu = task_cpu(env->p);
2871 switch (env->reason) {
2873 return cluster->capacity > cpu_capacity(tcpu);
2875 case DOWN_MIGRATION:
2876 return cluster->capacity < cpu_capacity(tcpu);
2886 skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
2888 if (!test_bit(cluster->id, env->candidate_list))
2891 if (!acceptable_capacity(cluster, env)) {
2892 __clear_bit(cluster->id, env->candidate_list);
2899 static struct sched_cluster *
2900 select_least_power_cluster(struct cpu_select_env *env)
2902 struct sched_cluster *cluster;
2905 int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
2907 env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
2909 if (task_load_will_fit(env->p, env->task_load,
2910 cpu, env->boost_policy)) {
2911 env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
2913 if (env->boost_policy == SCHED_BOOST_NONE)
2914 return env->rtg->preferred_cluster;
2916 for_each_sched_cluster(cluster) {
2917 if (cluster != env->rtg->preferred_cluster) {
2918 __set_bit(cluster->id,
2920 __clear_bit(cluster->id,
2921 env->candidate_list);
2925 return env->rtg->preferred_cluster;
2929 * Since the task load does not fit on the preferred
2930 * cluster anymore, pretend that the task does not
2931 * have any preferred cluster. This allows the waking
2932 * task to get the appropriate CPU it needs as per the
2933 * non co-location placement policy without having to
2934 * wait until the preferred cluster is updated.
2939 for_each_sched_cluster(cluster) {
2940 if (!skip_cluster(cluster, env)) {
2941 int cpu = cluster_first_cpu(cluster);
2943 env->task_load = scale_load_to_cpu(task_load(env->p),
2945 if (task_load_will_fit(env->p, env->task_load, cpu,
2949 __set_bit(cluster->id, env->backup_list);
2950 __clear_bit(cluster->id, env->candidate_list);
2957 static struct sched_cluster *
2958 next_candidate(const unsigned long *list, int start, int end)
2962 cluster_id = find_next_bit(list, end, start);
2963 if (cluster_id >= end)
2966 return sched_cluster[cluster_id];
2970 update_spare_capacity(struct cluster_cpu_stats *stats,
2971 struct cpu_select_env *env, int cpu, int capacity,
2974 s64 spare_capacity = sched_ravg_window - cpu_load;
2976 if (spare_capacity > 0 &&
2977 (spare_capacity > stats->highest_spare_capacity ||
2978 (spare_capacity == stats->highest_spare_capacity &&
2979 ((!env->need_waker_cluster &&
2980 capacity > cpu_capacity(stats->best_capacity_cpu)) ||
2981 (env->need_waker_cluster &&
2982 cpu_rq(cpu)->nr_running <
2983 cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
2985 * If the sync waker is the only runnable task on its CPU, the
2986 * CPU's cr_avg is 0, so there is a high chance of placing the
2987 * wakee on the waker's CPU, which would likely preempt the waker.
2988 * That can lead to migration of the preempted waker. Place the
2989 * wakee on a truly idle CPU when possible, by checking
2990 * nr_running, to avoid such preemption.
2992 stats->highest_spare_capacity = spare_capacity;
2993 stats->best_capacity_cpu = cpu;
2997 static inline void find_backup_cluster(
2998 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3000 struct sched_cluster *next = NULL;
3002 struct cpumask search_cpus;
3004 while (!bitmap_empty(env->backup_list, num_clusters)) {
3005 next = next_candidate(env->backup_list, 0, num_clusters);
3006 __clear_bit(next->id, env->backup_list);
3008 cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
3009 for_each_cpu(i, &search_cpus) {
3010 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3011 sched_irqload(i), power_cost(i, task_load(env->p) +
3012 cpu_cravg_sync(i, env->sync)), 0);
3014 update_spare_capacity(stats, env, i, next->capacity,
3015 cpu_load_sync(i, env->sync));
3017 env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
3021 struct sched_cluster *
3022 next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
3023 struct cluster_cpu_stats *stats)
3025 struct sched_cluster *next = NULL;
3027 __clear_bit(cluster->id, env->candidate_list);
3029 if (env->rtg && preferred_cluster(cluster, env->p))
3033 if (bitmap_empty(env->candidate_list, num_clusters))
3036 next = next_candidate(env->candidate_list, 0, num_clusters);
3038 if (next->min_power_cost > stats->min_cost) {
3039 clear_bit(next->id, env->candidate_list);
3044 if (skip_cluster(next, env))
3049 env->task_load = scale_load_to_cpu(task_load(env->p),
3050 cluster_first_cpu(next));
3054 #ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
3055 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3056 struct cpu_select_env *env, int cpu_cost)
3059 int prev_cpu = env->prev_cpu;
3061 wakeup_latency = cpu_rq(cpu)->wakeup_latency;
3063 if (env->need_idle) {
3064 stats->min_cost = cpu_cost;
3065 if (idle_cpu(cpu)) {
3066 if (wakeup_latency < stats->best_cpu_wakeup_latency ||
3067 (wakeup_latency == stats->best_cpu_wakeup_latency &&
3069 stats->best_idle_cpu = cpu;
3070 stats->best_cpu_wakeup_latency = wakeup_latency;
3073 if (env->cpu_load < stats->min_load ||
3074 (env->cpu_load == stats->min_load &&
3076 stats->least_loaded_cpu = cpu;
3077 stats->min_load = env->cpu_load;
3084 if (cpu_cost < stats->min_cost) {
3085 stats->min_cost = cpu_cost;
3086 stats->best_cpu_wakeup_latency = wakeup_latency;
3087 stats->best_load = env->cpu_load;
3088 stats->best_cpu = cpu;
3089 env->sbc_best_flag = SBC_FLAG_CPU_COST;
3093 /* CPU cost is the same. Start breaking the tie by C-state */
3095 if (wakeup_latency > stats->best_cpu_wakeup_latency)
3098 if (wakeup_latency < stats->best_cpu_wakeup_latency) {
3099 stats->best_cpu_wakeup_latency = wakeup_latency;
3100 stats->best_load = env->cpu_load;
3101 stats->best_cpu = cpu;
3102 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
3106 /* C-state is the same. Use prev CPU to break the tie */
3107 if (cpu == prev_cpu) {
3108 stats->best_cpu = cpu;
3109 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
3113 if (stats->best_cpu != prev_cpu &&
3114 ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
3115 (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
3116 stats->best_load = env->cpu_load;
3117 stats->best_cpu = cpu;
3118 env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
3121 #else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3122 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3123 struct cpu_select_env *env, int cpu_cost)
3125 int prev_cpu = env->prev_cpu;
3127 if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
3128 if (stats->best_sibling_cpu_cost > cpu_cost ||
3129 (stats->best_sibling_cpu_cost == cpu_cost &&
3130 stats->best_sibling_cpu_load > env->cpu_load)) {
3131 stats->best_sibling_cpu_cost = cpu_cost;
3132 stats->best_sibling_cpu_load = env->cpu_load;
3133 stats->best_sibling_cpu = cpu;
3137 if ((cpu_cost < stats->min_cost) ||
3138 ((stats->best_cpu != prev_cpu &&
3139 stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
3140 if (env->need_idle) {
3141 if (idle_cpu(cpu)) {
3142 stats->min_cost = cpu_cost;
3143 stats->best_idle_cpu = cpu;
3146 stats->min_cost = cpu_cost;
3147 stats->min_load = env->cpu_load;
3148 stats->best_cpu = cpu;
3149 env->sbc_best_flag = SBC_FLAG_MIN_COST;
3153 #endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3155 static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3156 struct cpu_select_env *env)
3161 * We try to find the least loaded *busy* CPU irrespective
3162 * of the power cost.
3165 cpu_cost = cpu_min_power_cost(cpu);
3168 cpu_cost = power_cost(cpu, task_load(env->p) +
3169 cpu_cravg_sync(cpu, env->sync));
3171 if (cpu_cost <= stats->min_cost)
3172 __update_cluster_stats(cpu, stats, env, cpu_cost);
3175 static void find_best_cpu_in_cluster(struct sched_cluster *c,
3176 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3179 struct cpumask search_cpus;
3181 cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
3183 env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
3185 for_each_cpu(i, &search_cpus) {
3186 env->cpu_load = cpu_load_sync(i, env->sync);
3188 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3190 power_cost(i, task_load(env->p) +
3191 cpu_cravg_sync(i, env->sync)), 0);
3193 if (skip_cpu(i, env))
3196 update_spare_capacity(stats, env, i, c->capacity,
3200 * need_idle takes precedence over sched boost, but when both
3201 * are set, the idlest CPU within all the clusters is selected
3202 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the
3203 * big cluster is selected when boost_policy = BOOST_ON_BIG.
3205 if ((!env->need_idle &&
3206 env->boost_policy != SCHED_BOOST_NONE) ||
3207 env->need_waker_cluster ||
3208 sched_cpu_high_irqload(i) ||
3209 spill_threshold_crossed(env, cpu_rq(i)))
3212 update_cluster_stats(i, stats, env);
3216 static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
3218 stats->best_cpu = stats->best_idle_cpu = -1;
3219 stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
3220 stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
3221 stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
3222 stats->highest_spare_capacity = 0;
3223 stats->least_loaded_cpu = -1;
3224 stats->best_cpu_wakeup_latency = INT_MAX;
3225 /* No need to initialize stats->best_load */
3228 static inline bool env_has_special_flags(struct cpu_select_env *env)
3230 if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
3238 bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3241 struct task_struct *task = env->p;
3242 struct sched_cluster *cluster;
3244 if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
3247 prev_cpu = env->prev_cpu;
3248 if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
3251 if (task->ravg.mark_start - task->last_cpu_selected_ts >=
3252 sched_long_cpu_selection_threshold)
3256 * This function should be used by the task wakeup path only, as it
3257 * assumes p->last_switch_out_ts is the last sleep time.
3258 * p->last_switch_out_ts can denote the last preemption time as well as
3261 if (task->ravg.mark_start - task->last_switch_out_ts >=
3262 sched_short_sleep_task_threshold)
3265 env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
3266 cluster = cpu_rq(prev_cpu)->cluster;
3268 if (!task_load_will_fit(task, env->task_load, prev_cpu,
3269 sched_boost_policy())) {
3271 __set_bit(cluster->id, env->backup_list);
3272 __clear_bit(cluster->id, env->candidate_list);
3276 env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
3277 if (sched_cpu_high_irqload(prev_cpu) ||
3278 spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
3279 update_spare_capacity(stats, env, prev_cpu,
3280 cluster->capacity, env->cpu_load);
3281 cpumask_clear_cpu(prev_cpu, &env->search_cpus);
3289 wake_to_waker_cluster(struct cpu_select_env *env)
3292 task_load(current) > sched_big_waker_task_load &&
3293 task_load(env->p) < sched_small_wakee_task_load;
3297 bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
3299 return sysctl_sched_prefer_sync_wakee_to_waker &&
3300 cpu_rq(cpu)->nr_running == 1 &&
3301 cpumask_test_cpu(cpu, &env->search_cpus);
3305 cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
3307 return cpumask_intersects(&env->search_cpus, &cluster->cpus);
3310 /* return cheapest cpu that can fit this task */
3311 static int select_best_cpu(struct task_struct *p, int target, int reason,
3314 struct sched_cluster *cluster, *pref_cluster = NULL;
3315 struct cluster_cpu_stats stats;
3316 struct related_thread_group *grp;
3317 unsigned int sbc_flag = 0;
3318 int cpu = raw_smp_processor_id();
3321 struct cpu_select_env env = {
3324 .need_idle = wake_to_idle(p),
3325 .need_waker_cluster = 0,
3330 .sbc_best_cluster_flag = 0,
3334 env.boost_policy = task_sched_boost(p) ?
3335 sched_boost_policy() : SCHED_BOOST_NONE;
3337 bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
3338 bitmap_zero(env.backup_list, NR_CPUS);
3340 cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
3341 cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
3343 init_cluster_cpu_stats(&stats);
3344 special = env_has_special_flags(&env);
3348 grp = task_related_thread_group(p);
3350 if (grp && grp->preferred_cluster) {
3351 pref_cluster = grp->preferred_cluster;
3352 if (!cluster_allowed(&env, pref_cluster))
3353 clear_bit(pref_cluster->id, env.candidate_list);
3356 } else if (!special) {
3357 cluster = cpu_rq(cpu)->cluster;
3358 if (wake_to_waker_cluster(&env)) {
3359 if (bias_to_waker_cpu(&env, cpu)) {
3361 sbc_flag = SBC_FLAG_WAKER_CLUSTER |
3364 } else if (cluster_allowed(&env, cluster)) {
3365 env.need_waker_cluster = 1;
3366 bitmap_zero(env.candidate_list, NR_CPUS);
3367 __set_bit(cluster->id, env.candidate_list);
3368 env.sbc_best_cluster_flag =
3369 SBC_FLAG_WAKER_CLUSTER;
3371 } else if (bias_to_prev_cpu(&env, &stats)) {
3372 sbc_flag = SBC_FLAG_PREV_CPU;
3377 if (!special && is_short_burst_task(p)) {
3378 env.pack_task = true;
3379 sbc_flag = SBC_FLAG_PACK_TASK;
3382 cluster = select_least_power_cluster(&env);
3388 * 'cluster' now points to the minimum power cluster which can satisfy
3389 * the task's perf goals. Walk down the cluster list starting with that
3390 * cluster. For non-small tasks, skip clusters that don't have
3391 * mostly_idle/idle cpus.
3395 find_best_cpu_in_cluster(cluster, &env, &stats);
3397 } while ((cluster = next_best_cluster(cluster, &env, &stats)));
3399 if (env.need_idle) {
3400 if (stats.best_idle_cpu >= 0) {
3401 target = stats.best_idle_cpu;
3402 sbc_flag |= SBC_FLAG_IDLE_CSTATE;
3403 } else if (stats.least_loaded_cpu >= 0) {
3404 target = stats.least_loaded_cpu;
3405 sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
3407 } else if (stats.best_cpu >= 0) {
3408 if (stats.best_sibling_cpu >= 0 &&
3409 stats.best_cpu != task_cpu(p) &&
3410 stats.min_cost == stats.best_sibling_cpu_cost) {
3411 stats.best_cpu = stats.best_sibling_cpu;
3412 sbc_flag |= SBC_FLAG_BEST_SIBLING;
3414 sbc_flag |= env.sbc_best_flag;
3415 target = stats.best_cpu;
3417 if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
3423 * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
3424 * backup_list = little cluster, candidate_list = none and
3425 * stats->best_capacity_cpu points the best spare capacity
3426 * CPU among the CPUs in the big cluster.
3428 if (env.boost_policy == SCHED_BOOST_ON_BIG &&
3429 stats.best_capacity_cpu >= 0)
3430 sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
3432 find_backup_cluster(&env, &stats);
3434 if (stats.best_capacity_cpu >= 0) {
3435 target = stats.best_capacity_cpu;
3436 sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
3439 p->last_cpu_selected_ts = sched_ktime_clock();
3441 sbc_flag |= env.sbc_best_cluster_flag;
3443 trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
3444 env.reason, env.sync, env.need_idle, sbc_flag, target);
3448 #ifdef CONFIG_CFS_BANDWIDTH
3450 static inline struct task_group *next_task_group(struct task_group *tg)
3452 tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
3454 return (&tg->list == &task_groups) ? NULL : tg;
3457 /* Iterate over all cfs_rq in a cpu */
3458 #define for_each_cfs_rq(cfs_rq, tg, cpu) \
3459 for (tg = container_of(&task_groups, struct task_group, list); \
3460 ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
3462 void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
3464 struct task_group *tg;
3465 struct cfs_rq *cfs_rq;
3469 for_each_cfs_rq(cfs_rq, tg, cpu)
3470 reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
3475 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
3477 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3478 struct task_struct *p, int change_cra);
3479 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3480 struct task_struct *p, int change_cra);
3482 /* Add task's contribution to a cpu's HMP statistics */
3483 void _inc_hmp_sched_stats_fair(struct rq *rq,
3484 struct task_struct *p, int change_cra)
3486 struct cfs_rq *cfs_rq;
3487 struct sched_entity *se = &p->se;
3490 * Although the check below is not strictly required (as
3491 * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg, called
3492 * from inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
3493 * efficiency by short-circuiting the for_each_sched_entity() loop when
3494 * sched_disable_window_stats
3496 if (sched_disable_window_stats)
3499 for_each_sched_entity(se) {
3500 cfs_rq = cfs_rq_of(se);
3501 inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3502 if (cfs_rq_throttled(cfs_rq))
3506 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3508 inc_rq_hmp_stats(rq, p, change_cra);
3511 /* Remove task's contribution from a cpu's HMP statistics */
3513 _dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
3515 struct cfs_rq *cfs_rq;
3516 struct sched_entity *se = &p->se;
3518 /* See comment on efficiency in _inc_hmp_sched_stats_fair */
3519 if (sched_disable_window_stats)
3522 for_each_sched_entity(se) {
3523 cfs_rq = cfs_rq_of(se);
3524 dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3525 if (cfs_rq_throttled(cfs_rq))
3529 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3531 dec_rq_hmp_stats(rq, p, change_cra);
3534 static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3536 _inc_hmp_sched_stats_fair(rq, p, 1);
3539 static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3541 _dec_hmp_sched_stats_fair(rq, p, 1);
3544 static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3545 u32 new_task_load, u32 new_pred_demand)
3547 struct cfs_rq *cfs_rq;
3548 struct sched_entity *se = &p->se;
3549 s64 task_load_delta = (s64)new_task_load - task_load(p);
3550 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3552 for_each_sched_entity(se) {
3553 cfs_rq = cfs_rq_of(se);
3555 fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
3558 fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
3559 if (cfs_rq_throttled(cfs_rq))
3563 /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
3565 fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
3568 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3572 static int task_will_be_throttled(struct task_struct *p);
3574 #else /* CONFIG_CFS_BANDWIDTH */
3576 inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
3579 inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3581 inc_nr_big_task(&rq->hmp_stats, p);
3582 inc_cumulative_runnable_avg(&rq->hmp_stats, p);
3586 dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3588 dec_nr_big_task(&rq->hmp_stats, p);
3589 dec_cumulative_runnable_avg(&rq->hmp_stats, p);
3592 fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3593 u32 new_task_load, u32 new_pred_demand)
3595 s64 task_load_delta = (s64)new_task_load - task_load(p);
3596 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3598 fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
3600 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3603 static inline int task_will_be_throttled(struct task_struct *p)
3608 void _inc_hmp_sched_stats_fair(struct rq *rq,
3609 struct task_struct *p, int change_cra)
3611 inc_nr_big_task(&rq->hmp_stats, p);
3614 #endif /* CONFIG_CFS_BANDWIDTH */
3617 * Reset balance_interval at all sched_domain levels of given cpu, so that it
3620 static inline void reset_balance_interval(int cpu)
3622 struct sched_domain *sd;
3624 if (cpu >= nr_cpu_ids)
3628 for_each_domain(cpu, sd)
3629 sd->balance_interval = 0;
3634 * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal
3635 * cpu as per its demand or priority).
3637 * Returns the reason why the task needs to be migrated.
3639 static inline int migration_needed(struct task_struct *p, int cpu)
3642 struct related_thread_group *grp;
3644 if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
3647 /* No need to migrate task that is about to be throttled */
3648 if (task_will_be_throttled(p))
3651 if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
3652 cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
3653 return UP_MIGRATION;
3655 if (sched_cpu_high_irqload(cpu))
3656 return IRQLOAD_MIGRATION;
3658 nice = task_nice(p);
3660 grp = task_related_thread_group(p);
3662 * Don't assume higher capacity means higher power. If the task
3663 * is running on the power efficient CPU, avoid migrating it
3664 * to a lower capacity cluster.
3666 if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
3667 upmigrate_discouraged(p)) &&
3668 cpu_capacity(cpu) > min_capacity &&
3669 cpu_max_power_cost(cpu) == max_power_cost) {
3671 return DOWN_MIGRATION;
3674 if (!task_will_fit(p, cpu)) {
3676 return UP_MIGRATION;
3684 kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
3686 unsigned long flags;
3689 /* Invoke active balance to force migrate currently running task */
3690 raw_spin_lock_irqsave(&rq->lock, flags);
3691 if (!rq->active_balance) {
3692 rq->active_balance = 1;
3693 rq->push_cpu = new_cpu;
3698 raw_spin_unlock_irqrestore(&rq->lock, flags);
3703 static DEFINE_RAW_SPINLOCK(migration_lock);
3705 static bool do_migration(int reason, int new_cpu, int cpu)
3707 if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
3708 && same_cluster(new_cpu, cpu))
3711 /* Inter cluster high irqload migrations are OK */
3712 return new_cpu != cpu;
3716 * Check if currently running task should be migrated to a better cpu.
3718 * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
3720 void check_for_migration(struct rq *rq, struct task_struct *p)
3722 int cpu = cpu_of(rq), new_cpu;
3723 int active_balance = 0, reason;
3725 reason = migration_needed(p, cpu);
3729 raw_spin_lock(&migration_lock);
3730 new_cpu = select_best_cpu(p, cpu, reason, 0);
3732 if (do_migration(reason, new_cpu, cpu)) {
3733 active_balance = kick_active_balance(rq, p, new_cpu);
3735 mark_reserved(new_cpu);
3738 raw_spin_unlock(&migration_lock);
3741 stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
3742 &rq->active_balance_work);
3745 #ifdef CONFIG_CFS_BANDWIDTH
3747 static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
3749 cfs_rq->hmp_stats.nr_big_tasks = 0;
3750 cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
3751 cfs_rq->hmp_stats.pred_demands_sum = 0;
3754 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3755 struct task_struct *p, int change_cra)
3757 inc_nr_big_task(&cfs_rq->hmp_stats, p);
3759 inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3762 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3763 struct task_struct *p, int change_cra)
3765 dec_nr_big_task(&cfs_rq->hmp_stats, p);
3767 dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3770 static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3771 struct cfs_rq *cfs_rq)
3773 stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
3774 stats->cumulative_runnable_avg +=
3775 cfs_rq->hmp_stats.cumulative_runnable_avg;
3776 stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
3779 static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3780 struct cfs_rq *cfs_rq)
3782 stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
3783 stats->cumulative_runnable_avg -=
3784 cfs_rq->hmp_stats.cumulative_runnable_avg;
3785 stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
3787 BUG_ON(stats->nr_big_tasks < 0 ||
3788 (s64)stats->cumulative_runnable_avg < 0);
3789 BUG_ON((s64)stats->pred_demands_sum < 0);
3792 #else /* CONFIG_CFS_BANDWIDTH */
3794 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3795 struct task_struct *p, int change_cra) { }
3797 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3798 struct task_struct *p, int change_cra) { }
3800 #endif /* CONFIG_CFS_BANDWIDTH */
3802 #else /* CONFIG_SCHED_HMP */
3804 static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
3806 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3807 struct task_struct *p, int change_cra) { }
3809 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3810 struct task_struct *p, int change_cra) { }
3812 #define dec_throttled_cfs_rq_hmp_stats(...)
3813 #define inc_throttled_cfs_rq_hmp_stats(...)
3815 #endif /* CONFIG_SCHED_HMP */
3817 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
3818 #error "load tracking assumes 2^10 as unit"
3821 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
3824 * We can represent the historical contribution to runnable average as the
3825 * coefficients of a geometric series. To do this we sub-divide our runnable
3826 * history into segments of approximately 1ms (1024us); label the segment that
3827 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3829 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3831 * (now) (~1ms ago) (~2ms ago)
3833 * Let u_i denote the fraction of p_i that the entity was runnable.
3835 * We then designate the fractions u_i as our coefficients, yielding the
3836 * following representation of historical load:
3837 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3839 * We choose y based on the width of a reasonable scheduling period, fixing:
3842 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3843 * approximately half as much as the contribution to load within the last ms
3846 * When a period "rolls over" and we have a new u_0`, multiplying the previous
3847 * sum again by y is sufficient to update:
3848 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3849 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
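 *
 * As a quick sanity check on the constants used below: with y^32 = 1/2,
 * y ~= 0.97857, so a fully runnable 1024us period contributes 1024 in the
 * current window, ~1002 one period later and ~512 after 32 periods, which
 * matches the runnable_avg_yN_sum[]/runnable_avg_yN_inv[] tables above.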
3851 static __always_inline int
3852 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
3853 unsigned long weight, int running, struct cfs_rq *cfs_rq)
3855 u64 delta, scaled_delta, periods;
3857 unsigned int delta_w, scaled_delta_w, decayed = 0;
3858 unsigned long scale_freq, scale_cpu;
3860 delta = now - sa->last_update_time;
3862 * This should only happen when time goes backwards, which it
3863 * unfortunately does during sched clock init when we swap over to TSC.
3865 if ((s64)delta < 0) {
3866 sa->last_update_time = now;
3871 * Use 1024ns as the unit of measurement since it's a reasonable
3872 * approximation of 1us and fast to compute.
3877 sa->last_update_time = now;
3879 scale_freq = arch_scale_freq_capacity(NULL, cpu);
3880 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3881 trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
3883 /* delta_w is the amount already accumulated against our next period */
3884 delta_w = sa->period_contrib;
3885 if (delta + delta_w >= 1024) {
3888 /* whatever is left over for the next period starts from scratch; we don't know it yet */
3889 sa->period_contrib = 0;
3892 * Now that we know we're crossing a period boundary, figure
3893 * out how much from delta we need to complete the current
3894 * period and accrue it.
3896 delta_w = 1024 - delta_w;
3897 scaled_delta_w = cap_scale(delta_w, scale_freq);
3899 sa->load_sum += weight * scaled_delta_w;
3901 cfs_rq->runnable_load_sum +=
3902 weight * scaled_delta_w;
3906 sa->util_sum += scaled_delta_w * scale_cpu;
3910 /* Figure out how many additional periods this update spans */
3911 periods = delta / 1024;
3914 sa->load_sum = decay_load(sa->load_sum, periods + 1);
3916 cfs_rq->runnable_load_sum =
3917 decay_load(cfs_rq->runnable_load_sum, periods + 1);
3919 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
3921 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
3922 contrib = __compute_runnable_contrib(periods);
3923 contrib = cap_scale(contrib, scale_freq);
3925 sa->load_sum += weight * contrib;
3927 cfs_rq->runnable_load_sum += weight * contrib;
3930 sa->util_sum += contrib * scale_cpu;
3933 /* Remainder of delta accrued against u_0` */
3934 scaled_delta = cap_scale(delta, scale_freq);
3936 sa->load_sum += weight * scaled_delta;
3938 cfs_rq->runnable_load_sum += weight * scaled_delta;
3942 sa->util_sum += scaled_delta * scale_cpu;
3944 sa->period_contrib += delta;
3947 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
3949 cfs_rq->runnable_load_avg =
3950 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
3952 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
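/*
 * Note (informal): load_sum/util_sum saturate at the geometric series
 * limit LOAD_AVG_MAX (~47742 for 1024us steps with y^32 = 1/2), so the
 * divisions above make load_avg converge towards the entity's weight, and
 * util_avg towards the CPU capacity, when the entity is always running.
 */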
3959 * Signed add and clamp on underflow.
3961 * Explicitly do a load-store to ensure the intermediate value never hits
3962 * memory. This allows lockless observations without ever seeing the negative
3965 #define add_positive(_ptr, _val) do { \
3966 typeof(_ptr) ptr = (_ptr); \
3967 typeof(_val) val = (_val); \
3968 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3972 if (val < 0 && res > var) \
3975 WRITE_ONCE(*ptr, res); \
3978 #ifdef CONFIG_FAIR_GROUP_SCHED
3980 * update_tg_load_avg - update the tg's load avg
3981 * @cfs_rq: the cfs_rq whose avg changed
3982 * @force: update regardless of how small the difference
3984 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3985 * However, because tg->load_avg is a global value there are performance
3988 * In order to avoid having to look at the other cfs_rq's, we use a
3989 * differential update where we store the last value we propagated. This in
3990 * turn allows skipping updates if the differential is 'small'.
3992 * Updating tg's load_avg is necessary before update_cfs_share() (which is
3993 * done) and effective_load() (which is not done because it is too costly).
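 *
 * With the 1/64 threshold used below, the global tg->load_avg is only
 * touched once this cfs_rq's load_avg has drifted by more than roughly
 * 1.5% from the value it last contributed, bounding cross-CPU cacheline
 * traffic on the shared counter.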
3995 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3997 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4000 * No need to update load_avg for root_task_group as it is not used.
4002 if (cfs_rq->tg == &root_task_group)
4005 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4006 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4007 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4012 * Called within set_task_rq() right before setting a task's cpu. The
4013 * caller only guarantees p->pi_lock is held; no other assumptions,
4014 * including the state of rq->lock, should be made.
4016 void set_task_rq_fair(struct sched_entity *se,
4017 struct cfs_rq *prev, struct cfs_rq *next)
4019 if (!sched_feat(ATTACH_AGE_LOAD))
4023 * We are supposed to update the task to "current" time, so that it is up to
4024 * date and ready to go to the new CPU/cfs_rq. But we have difficulty
4025 * getting what the current time is, so simply throw away the out-of-date
4026 * time. This results in the wakee task being less decayed, but giving
4027 * the wakee more load is not a bad thing.
4029 if (se->avg.last_update_time && prev) {
4030 u64 p_last_update_time;
4031 u64 n_last_update_time;
4033 #ifndef CONFIG_64BIT
4034 u64 p_last_update_time_copy;
4035 u64 n_last_update_time_copy;
4038 p_last_update_time_copy = prev->load_last_update_time_copy;
4039 n_last_update_time_copy = next->load_last_update_time_copy;
4043 p_last_update_time = prev->avg.last_update_time;
4044 n_last_update_time = next->avg.last_update_time;
4046 } while (p_last_update_time != p_last_update_time_copy ||
4047 n_last_update_time != n_last_update_time_copy);
4049 p_last_update_time = prev->avg.last_update_time;
4050 n_last_update_time = next->avg.last_update_time;
4052 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
4053 &se->avg, 0, 0, NULL);
4054 se->avg.last_update_time = n_last_update_time;
4058 /* Take into account change of utilization of a child task group */
4060 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
4062 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4063 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
4065 /* Nothing to update */
4069 /* Set new sched_entity's utilization */
4070 se->avg.util_avg = gcfs_rq->avg.util_avg;
4071 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
4073 /* Update parent cfs_rq utilization */
4074 add_positive(&cfs_rq->avg.util_avg, delta);
4075 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
4078 /* Take into account change of load of a child task group */
4080 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
4082 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4083 long delta, load = gcfs_rq->avg.load_avg;
4086 * If the load of group cfs_rq is null, the load of the
4087 * sched_entity will also be null so we can skip the formula
4092 /* Get tg's load and ensure tg_load > 0 */
4093 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
4095 /* Ensure tg_load >= load and is updated with the current load */
4096 tg_load -= gcfs_rq->tg_load_avg_contrib;
4100 * We need to compute a correction term in the case that the
4101 * task group is consuming more CPU than a task of equal
4102 * weight. A task with a weight equal to tg->shares will have
4103 * a load less than or equal to scale_load_down(tg->shares).
4104 * Similarly, the sched_entities that represent the task group
4105 * at parent level, can't have a load higher than
4106 * scale_load_down(tg->shares). And the Sum of sched_entities'
4107 * load must be <= scale_load_down(tg->shares).
4109 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
4110 /* scale gcfs_rq's load into tg's shares */
4111 load *= scale_load_down(gcfs_rq->tg->shares);
4116 delta = load - se->avg.load_avg;
4118 /* Nothing to update */
4122 /* Set new sched_entity's load */
4123 se->avg.load_avg = load;
4124 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
4126 /* Update parent cfs_rq load */
4127 add_positive(&cfs_rq->avg.load_avg, delta);
4128 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
4131 * If the sched_entity is already enqueued, we also have to update the
4132 * runnable load avg.
4135 /* Update parent cfs_rq runnable_load_avg */
4136 add_positive(&cfs_rq->runnable_load_avg, delta);
4137 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
4141 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
4143 cfs_rq->propagate_avg = 1;
4146 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
4148 struct cfs_rq *cfs_rq = group_cfs_rq(se);
4150 if (!cfs_rq->propagate_avg)
4153 cfs_rq->propagate_avg = 0;
4157 /* Update task and its cfs_rq load average */
4158 static inline int propagate_entity_load_avg(struct sched_entity *se)
4160 struct cfs_rq *cfs_rq;
4162 if (entity_is_task(se))
4165 if (!test_and_clear_tg_cfs_propagate(se))
4168 cfs_rq = cfs_rq_of(se);
4170 set_tg_cfs_propagate(cfs_rq);
4172 update_tg_cfs_util(cfs_rq, se);
4173 update_tg_cfs_load(cfs_rq, se);
4178 #else /* CONFIG_FAIR_GROUP_SCHED */
4180 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
4182 static inline int propagate_entity_load_avg(struct sched_entity *se)
4187 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
4189 #endif /* CONFIG_FAIR_GROUP_SCHED */
4191 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
4193 if (&this_rq()->cfs == cfs_rq) {
4195 * There are a few boundary cases this might miss but it should
4196 * get called often enough that it should (hopefully) not be
4197 * a real problem -- added to that it only calls on the local
4198 * CPU, so if we enqueue remotely we'll miss an update, but
4199 * the next tick/schedule should update.
4201 * It will not get called when we go idle, because the idle
4202 * thread is a different class (!fair), nor will the utilization
4203 * number include things like RT tasks.
4205 * As is, the util number is not freq-invariant (we'd have to
4206 * implement arch_scale_freq_capacity() for that).
4210 cpufreq_update_util(rq_of(cfs_rq), 0);
4214 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
4217 * Unsigned subtract and clamp on underflow.
4219 * Explicitly do a load-store to ensure the intermediate value never hits
4220 * memory. This allows lockless observations without ever seeing the negative
4223 #define sub_positive(_ptr, _val) do { \
4224 typeof(_ptr) ptr = (_ptr); \
4225 typeof(*ptr) val = (_val); \
4226 typeof(*ptr) res, var = READ_ONCE(*ptr); \
4230 WRITE_ONCE(*ptr, res); \
4234 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4235 * @now: current time, as per cfs_rq_clock_task()
4236 * @cfs_rq: cfs_rq to update
4237 * @update_freq: should we call cfs_rq_util_change() or will the call do so
4239 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4240 * avg. The immediate corollary is that all (fair) tasks must be attached, see
4241 * post_init_entity_util_avg().
4243 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4245 * Returns true if the load decayed or we removed load.
4247 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4248 * call update_tg_load_avg() when this function returns true.
4251 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4253 struct sched_avg *sa = &cfs_rq->avg;
4254 int decayed, removed = 0, removed_util = 0;
4256 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
4257 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
4258 sub_positive(&sa->load_avg, r);
4259 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
4261 set_tg_cfs_propagate(cfs_rq);
4264 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
4265 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
4266 sub_positive(&sa->util_avg, r);
4267 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
4269 set_tg_cfs_propagate(cfs_rq);
4272 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
4273 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
4275 #ifndef CONFIG_64BIT
4277 cfs_rq->load_last_update_time_copy = sa->last_update_time;
4280 /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
4281 if (cfs_rq == &rq_of(cfs_rq)->cfs)
4282 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
4284 if (update_freq && (decayed || removed_util))
4285 cfs_rq_util_change(cfs_rq);
4287 return decayed || removed;
4291 * Optional action to be done while updating the load average
4293 #define UPDATE_TG 0x1
4294 #define SKIP_AGE_LOAD 0x2
4296 /* Update task and its cfs_rq load average */
4297 static inline void update_load_avg(struct sched_entity *se, int flags)
4299 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4300 u64 now = cfs_rq_clock_task(cfs_rq);
4301 int cpu = cpu_of(rq_of(cfs_rq));
4306 * Track task load average for carrying it to the new CPU after migration, and
4307 * track group sched_entity load average for the task_h_load calc in migration
4309 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
4310 __update_load_avg(now, cpu, &se->avg,
4311 se->on_rq * scale_load_down(se->load.weight),
4312 cfs_rq->curr == se, NULL);
4315 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
4316 decayed |= propagate_entity_load_avg(se);
4318 if (decayed && (flags & UPDATE_TG))
4319 update_tg_load_avg(cfs_rq, 0);
4321 if (entity_is_task(se)) {
4322 #ifdef CONFIG_SCHED_WALT
4323 ptr = (void *)&(task_of(se)->ravg);
4325 trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
4330 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4331 * @cfs_rq: cfs_rq to attach to
4332 * @se: sched_entity to attach
4334 * Must call update_cfs_rq_load_avg() before this, since we rely on
4335 * cfs_rq->avg.last_update_time being current.
4337 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4339 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4340 cfs_rq->avg.load_avg += se->avg.load_avg;
4341 cfs_rq->avg.load_sum += se->avg.load_sum;
4342 cfs_rq->avg.util_avg += se->avg.util_avg;
4343 cfs_rq->avg.util_sum += se->avg.util_sum;
4344 set_tg_cfs_propagate(cfs_rq);
4346 cfs_rq_util_change(cfs_rq);
4350 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4351 * @cfs_rq: cfs_rq to detach from
4352 * @se: sched_entity to detach
4354 * Must call update_cfs_rq_load_avg() before this, since we rely on
4355 * cfs_rq->avg.last_update_time being current.
4357 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4360 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
4361 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
4362 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4363 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4364 set_tg_cfs_propagate(cfs_rq);
4366 cfs_rq_util_change(cfs_rq);
4369 /* Add the load generated by se into cfs_rq's load average */
4371 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4373 struct sched_avg *sa = &se->avg;
4375 cfs_rq->runnable_load_avg += sa->load_avg;
4376 cfs_rq->runnable_load_sum += sa->load_sum;
4378 if (!sa->last_update_time) {
4379 attach_entity_load_avg(cfs_rq, se);
4380 update_tg_load_avg(cfs_rq, 0);
4384 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
4386 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4388 cfs_rq->runnable_load_avg =
4389 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
4390 cfs_rq->runnable_load_sum =
4391 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
4394 #ifndef CONFIG_64BIT
4395 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4397 u64 last_update_time_copy;
4398 u64 last_update_time;
4401 last_update_time_copy = cfs_rq->load_last_update_time_copy;
4403 last_update_time = cfs_rq->avg.last_update_time;
4404 } while (last_update_time != last_update_time_copy);
4406 return last_update_time;
4409 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4411 return cfs_rq->avg.last_update_time;
4416 * Synchronize entity load avg of dequeued entity without locking the rq.
4419 void sync_entity_load_avg(struct sched_entity *se)
4421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4422 u64 last_update_time;
4424 last_update_time = cfs_rq_last_update_time(cfs_rq);
4425 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
4429 * Task first catches up with cfs_rq, and then subtracts
4430 * itself from the cfs_rq (task must be off the queue now).
4432 void remove_entity_load_avg(struct sched_entity *se)
4434 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4437 * tasks cannot exit without having gone through wake_up_new_task() ->
4438 * post_init_entity_util_avg() which will have added things to the
4439 * cfs_rq, so we can remove unconditionally.
4441 * Similarly for groups, they will have passed through
4442 * post_init_entity_util_avg() before unregister_sched_fair_group()
4446 sync_entity_load_avg(se);
4447 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
4448 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
4452 * Update the rq's load with the elapsed running time before entering
4453 * idle. If the last scheduled task is not a CFS task, idle_enter will
4454 * be the only way to update the runnable statistic.
4456 void idle_enter_fair(struct rq *this_rq)
4461 * Update the rq's load with the elapsed idle time before a task is
4462 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
4463 * be the only way to update the runnable statistic.
4465 void idle_exit_fair(struct rq *this_rq)
4469 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
4471 return cfs_rq->runnable_load_avg;
4474 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4476 return cfs_rq->avg.load_avg;
4479 static int idle_balance(struct rq *this_rq);
4481 #else /* CONFIG_SMP */
4484 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4489 #define UPDATE_TG 0x0
4490 #define SKIP_AGE_LOAD 0x0
4492 static inline void update_load_avg(struct sched_entity *se, int not_used1){}
4494 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4496 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4497 static inline void remove_entity_load_avg(struct sched_entity *se) {}
4500 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4502 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4504 static inline int idle_balance(struct rq *rq)
4509 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4510 struct task_struct *p, int change_cra) { }
4512 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4513 struct task_struct *p, int change_cra) { }
4515 #endif /* CONFIG_SMP */
4517 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
4519 #ifdef CONFIG_SCHEDSTATS
4520 struct task_struct *tsk = NULL;
4522 if (entity_is_task(se))
4525 if (se->statistics.sleep_start) {
4526 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
4531 if (unlikely(delta > se->statistics.sleep_max))
4532 se->statistics.sleep_max = delta;
4534 se->statistics.sleep_start = 0;
4535 se->statistics.sum_sleep_runtime += delta;
4538 account_scheduler_latency(tsk, delta >> 10, 1);
4539 trace_sched_stat_sleep(tsk, delta);
4542 if (se->statistics.block_start) {
4543 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
4548 if (unlikely(delta > se->statistics.block_max))
4549 se->statistics.block_max = delta;
4551 se->statistics.block_start = 0;
4552 se->statistics.sum_sleep_runtime += delta;
4555 if (tsk->in_iowait) {
4556 se->statistics.iowait_sum += delta;
4557 se->statistics.iowait_count++;
4558 trace_sched_stat_iowait(tsk, delta);
4561 trace_sched_stat_blocked(tsk, delta);
4562 trace_sched_blocked_reason(tsk);
4565 * Blocking time is in units of nanosecs, so shift by
4566 * 20 to get a milliseconds-range estimation of the
4567 * amount of time that the task spent sleeping:
4569 if (unlikely(prof_on == SLEEP_PROFILING)) {
4570 profile_hits(SLEEP_PROFILING,
4571 (void *)get_wchan(tsk),
4574 account_scheduler_latency(tsk, delta >> 10, 0);
4580 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4582 #ifdef CONFIG_SCHED_DEBUG
4583 s64 d = se->vruntime - cfs_rq->min_vruntime;
4588 if (d > 3*sysctl_sched_latency)
4589 schedstat_inc(cfs_rq, nr_spread_over);
4594 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4596 u64 vruntime = cfs_rq->min_vruntime;
4599 * The 'current' period is already promised to the current tasks,
4600 * however the extra weight of the new task will slow them down a
4601 * little; place the new task so that it fits in the slot that
4602 * stays open at the end.
4604 if (initial && sched_feat(START_DEBIT))
4605 vruntime += sched_vslice(cfs_rq, se);
4607 /* sleeps up to a single latency don't count. */
4609 unsigned long thresh = sysctl_sched_latency;
4612 * Halve their sleep time's effect, to allow
4613 * for a gentler effect of sleepers:
4615 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4621 /* ensure we never gain time by being placed backwards. */
4622 se->vruntime = max_vruntime(se->vruntime, vruntime);
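/*
 * For illustration: with a 6ms sched latency and GENTLE_FAIR_SLEEPERS set,
 * 'thresh' above is halved to ~3ms, so a waking sleeper is placed at most
 * ~3ms of vruntime behind min_vruntime no matter how long it slept.
 */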
4625 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4628 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4631 * Update the normalized vruntime before updating min_vruntime
4632 * through calling update_curr().
4634 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
4635 se->vruntime += cfs_rq->min_vruntime;
4638 * Update run-time statistics of the 'current'.
4640 update_curr(cfs_rq);
4641 update_load_avg(se, UPDATE_TG);
4642 enqueue_entity_load_avg(cfs_rq, se);
4643 update_cfs_shares(se);
4644 account_entity_enqueue(cfs_rq, se);
4646 if (flags & ENQUEUE_WAKEUP) {
4647 place_entity(cfs_rq, se, 0);
4648 enqueue_sleeper(cfs_rq, se);
4651 update_stats_enqueue(cfs_rq, se);
4652 check_spread(cfs_rq, se);
4653 if (se != cfs_rq->curr)
4654 __enqueue_entity(cfs_rq, se);
4657 if (cfs_rq->nr_running == 1) {
4658 list_add_leaf_cfs_rq(cfs_rq);
4659 check_enqueue_throttle(cfs_rq);
4663 static void __clear_buddies_last(struct sched_entity *se)
4665 for_each_sched_entity(se) {
4666 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4667 if (cfs_rq->last != se)
4670 cfs_rq->last = NULL;
4674 static void __clear_buddies_next(struct sched_entity *se)
4676 for_each_sched_entity(se) {
4677 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4678 if (cfs_rq->next != se)
4681 cfs_rq->next = NULL;
4685 static void __clear_buddies_skip(struct sched_entity *se)
4687 for_each_sched_entity(se) {
4688 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4689 if (cfs_rq->skip != se)
4692 cfs_rq->skip = NULL;
4696 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4698 if (cfs_rq->last == se)
4699 __clear_buddies_last(se);
4701 if (cfs_rq->next == se)
4702 __clear_buddies_next(se);
4704 if (cfs_rq->skip == se)
4705 __clear_buddies_skip(se);
4708 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4711 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4714 * Update run-time statistics of the 'current'.
4716 update_curr(cfs_rq);
4719 * When dequeuing a sched_entity, we must:
4720 * - Update loads to have both entity and cfs_rq synced with now.
4721 * - Subtract its load from the cfs_rq->runnable_avg.
4722 * - Subtract its previous weight from cfs_rq->load.weight.
4723 * - For group entity, update its weight to reflect the new share
4724 * of its group cfs_rq.
4726 update_load_avg(se, UPDATE_TG);
4727 dequeue_entity_load_avg(cfs_rq, se);
4729 update_stats_dequeue(cfs_rq, se);
4730 if (flags & DEQUEUE_SLEEP) {
4731 #ifdef CONFIG_SCHEDSTATS
4732 if (entity_is_task(se)) {
4733 struct task_struct *tsk = task_of(se);
4735 if (tsk->state & TASK_INTERRUPTIBLE)
4736 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
4737 if (tsk->state & TASK_UNINTERRUPTIBLE)
4738 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
4743 clear_buddies(cfs_rq, se);
4745 if (se != cfs_rq->curr)
4746 __dequeue_entity(cfs_rq, se);
4748 account_entity_dequeue(cfs_rq, se);
4751 * Normalize the entity after updating the min_vruntime because the
4752 * update can refer to the ->curr item and we need to reflect this
4753 * movement in our normalized position.
4755 if (!(flags & DEQUEUE_SLEEP))
4756 se->vruntime -= cfs_rq->min_vruntime;
4758 /* return excess runtime on last dequeue */
4759 return_cfs_rq_runtime(cfs_rq);
4761 update_min_vruntime(cfs_rq);
4762 update_cfs_shares(se);
4766 * Preempt the current task with a newly woken task if needed:
4769 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4771 unsigned long ideal_runtime, delta_exec;
4772 struct sched_entity *se;
4775 ideal_runtime = sched_slice(cfs_rq, curr);
4776 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4777 if (delta_exec > ideal_runtime) {
4778 resched_curr(rq_of(cfs_rq));
4780 * The current task ran long enough, ensure it doesn't get
4781 * re-elected due to buddy favours.
4783 clear_buddies(cfs_rq, curr);
4788 * Ensure that a task that missed wakeup preemption by a
4789 * narrow margin doesn't have to wait for a full slice.
4790 * This also mitigates buddy induced latencies under load.
4792 if (delta_exec < sysctl_sched_min_granularity)
4795 se = __pick_first_entity(cfs_rq);
4796 delta = curr->vruntime - se->vruntime;
4801 if (delta > ideal_runtime)
4802 resched_curr(rq_of(cfs_rq));
4806 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4808 /* 'current' is not kept within the tree. */
4811 * Any task has to be enqueued before it gets to execute on
4812 * a CPU. So account for the time it spent waiting on the runqueue.
4815 update_stats_wait_end(cfs_rq, se);
4816 __dequeue_entity(cfs_rq, se);
4817 update_load_avg(se, UPDATE_TG);
4820 update_stats_curr_start(cfs_rq, se);
4822 #ifdef CONFIG_SCHEDSTATS
4824 * Track our maximum slice length, if the CPU's load is at
4825 * least twice that of our own weight (i.e. don't track it
4826 * when there are only lesser-weight tasks around):
4828 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4829 se->statistics.slice_max = max(se->statistics.slice_max,
4830 se->sum_exec_runtime - se->prev_sum_exec_runtime);
4833 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4837 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4840 * Pick the next process, keeping these things in mind, in this order:
4841 * 1) keep things fair between processes/task groups
4842 * 2) pick the "next" process, since someone really wants that to run
4843 * 3) pick the "last" process, for cache locality
4844 * 4) do not run the "skip" process, if something else is available
4846 static struct sched_entity *
4847 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4849 struct sched_entity *left = __pick_first_entity(cfs_rq);
4850 struct sched_entity *se;
4853 * If curr is set we have to see if it's left of the leftmost entity
4854 * still in the tree, provided there was anything in the tree at all.
4856 if (!left || (curr && entity_before(curr, left)))
4859 se = left; /* ideally we run the leftmost entity */
4862 * Avoid running the skip buddy, if running something else can
4863 * be done without getting too unfair.
4865 if (cfs_rq->skip == se) {
4866 struct sched_entity *second;
4869 second = __pick_first_entity(cfs_rq);
4871 second = __pick_next_entity(se);
4872 if (!second || (curr && entity_before(curr, second)))
4876 if (second && wakeup_preempt_entity(second, left) < 1)
4881 * Prefer last buddy, try to return the CPU to a preempted task.
4883 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4887 * Someone really wants this to run. If it's not unfair, run it.
4889 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4892 clear_buddies(cfs_rq, se);
4897 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4899 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4902 * If still on the runqueue then deactivate_task()
4903 * was not called and update_curr() has to be done:
4906 update_curr(cfs_rq);
4908 /* throttle cfs_rqs exceeding runtime */
4909 check_cfs_rq_runtime(cfs_rq);
4911 check_spread(cfs_rq, prev);
4913 update_stats_wait_start(cfs_rq, prev);
4914 /* Put 'current' back into the tree. */
4915 __enqueue_entity(cfs_rq, prev);
4916 /* in !on_rq case, update occurred at dequeue */
4917 update_load_avg(prev, 0);
4919 cfs_rq->curr = NULL;
4923 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4926 * Update run-time statistics of the 'current'.
4928 update_curr(cfs_rq);
4931 * Ensure that runnable average is periodically updated.
4933 update_load_avg(curr, UPDATE_TG);
4934 update_cfs_shares(curr);
4936 #ifdef CONFIG_SCHED_HRTICK
4938 * queued ticks are scheduled to match the slice, so don't bother
4939 * validating it and just reschedule.
4942 resched_curr(rq_of(cfs_rq));
4946 * don't let the period tick interfere with the hrtick preemption
4948 if (!sched_feat(DOUBLE_TICK) &&
4949 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4953 if (cfs_rq->nr_running > 1)
4954 check_preempt_tick(cfs_rq, curr);
4958 /**************************************************
4959 * CFS bandwidth control machinery
4962 #ifdef CONFIG_CFS_BANDWIDTH
4964 #ifdef HAVE_JUMP_LABEL
4965 static struct static_key __cfs_bandwidth_used;
4967 static inline bool cfs_bandwidth_used(void)
4969 return static_key_false(&__cfs_bandwidth_used);
4972 void cfs_bandwidth_usage_inc(void)
4974 static_key_slow_inc(&__cfs_bandwidth_used);
4977 void cfs_bandwidth_usage_dec(void)
4979 static_key_slow_dec(&__cfs_bandwidth_used);
4981 #else /* HAVE_JUMP_LABEL */
4982 static bool cfs_bandwidth_used(void)
4987 void cfs_bandwidth_usage_inc(void) {}
4988 void cfs_bandwidth_usage_dec(void) {}
4989 #endif /* HAVE_JUMP_LABEL */
4992 * default period for cfs group bandwidth.
4993 * default: 0.1s, units: nanoseconds
4995 static inline u64 default_cfs_period(void)
4997 return 100000000ULL;
5000 static inline u64 sched_cfs_bandwidth_slice(void)
5002 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
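/*
 * Rough example, assuming the usual sysctl_sched_cfs_bandwidth_slice default
 * of 5000us: each refill moves 5ms of runtime from the global (tg) pool into
 * the requesting cfs_rq's local pool.
 */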
5006 * Replenish runtime according to assigned quota and update expiration time.
5007 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
5008 * additional synchronization around rq->lock.
5010 * requires cfs_b->lock
5012 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5016 if (cfs_b->quota == RUNTIME_INF)
5019 now = sched_clock_cpu(smp_processor_id());
5020 cfs_b->runtime = cfs_b->quota;
5021 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
5024 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5026 return &tg->cfs_bandwidth;
5029 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
5030 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5032 if (unlikely(cfs_rq->throttle_count))
5033 return cfs_rq->throttled_clock_task;
5035 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
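/*
 * In other words: while throttled the task clock stays pinned at
 * throttled_clock_task, and once running again the accumulated throttled
 * time is subtracted, so entities are neither charged nor decayed for time
 * they were not allowed to run.
 */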
5038 /* returns 0 on failure to allocate runtime */
5039 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5041 struct task_group *tg = cfs_rq->tg;
5042 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
5043 u64 amount = 0, min_amount, expires;
5045 /* note: this is a positive sum as runtime_remaining <= 0 */
5046 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
5048 raw_spin_lock(&cfs_b->lock);
5049 if (cfs_b->quota == RUNTIME_INF)
5050 amount = min_amount;
5052 start_cfs_bandwidth(cfs_b);
5054 if (cfs_b->runtime > 0) {
5055 amount = min(cfs_b->runtime, min_amount);
5056 cfs_b->runtime -= amount;
5060 expires = cfs_b->runtime_expires;
5061 raw_spin_unlock(&cfs_b->lock);
5063 cfs_rq->runtime_remaining += amount;
5065 * we may have advanced our local expiration to account for allowed
5066 * spread between our sched_clock and the one on which runtime was issued.
5069 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
5070 cfs_rq->runtime_expires = expires;
5072 return cfs_rq->runtime_remaining > 0;
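/*
 * Worked example of the slice math above: with the local pool 2ms in debt
 * (runtime_remaining == -2ms) and a 5ms slice, min_amount is 7ms; if the
 * global pool can cover it, runtime_remaining ends up at exactly one full
 * slice (+5ms).
 */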
5076 * Note: This depends on the synchronization provided by sched_clock and the
5077 * fact that rq->clock snapshots this value.
5079 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5081 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5083 /* if the deadline is ahead of our clock, nothing to do */
5084 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
5087 if (cfs_rq->runtime_remaining < 0)
5091 * If the local deadline has passed we have to consider the
5092 * possibility that our sched_clock is 'fast' and the global deadline
5093 * has not truly expired.
5095 * Fortunately we can determine whether this is the case by checking
5096 * whether the global deadline has advanced. It is valid to compare
5097 * cfs_b->runtime_expires without any locks since we only care about
5098 * exact equality, so a partial write will still work.
5101 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
5102 /* extend local deadline, drift is bounded above by 2 ticks */
5103 cfs_rq->runtime_expires += TICK_NSEC;
5105 /* global deadline is ahead, expiration has passed */
5106 cfs_rq->runtime_remaining = 0;
5110 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5112 /* dock delta_exec before expiring quota (as it could span periods) */
5113 cfs_rq->runtime_remaining -= delta_exec;
5114 expire_cfs_rq_runtime(cfs_rq);
5116 if (likely(cfs_rq->runtime_remaining > 0))
5120 * if we're unable to extend our runtime we resched so that the active
5121 * hierarchy can be throttled
5123 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5124 resched_curr(rq_of(cfs_rq));
5127 static __always_inline
5128 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5130 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5133 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5136 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5138 return cfs_bandwidth_used() && cfs_rq->throttled;
5141 #ifdef CONFIG_SCHED_HMP
5143 * Check if task is part of a hierarchy where some cfs_rq does not have any runtime left.
5146 * We can't rely on throttled_hierarchy() to do this test, as
5147 * cfs_rq->throttle_count will not be updated yet when this function is called
5148 * from scheduler_tick()
5150 static int task_will_be_throttled(struct task_struct *p)
5152 struct sched_entity *se = &p->se;
5153 struct cfs_rq *cfs_rq;
5155 if (!cfs_bandwidth_used())
5158 for_each_sched_entity(se) {
5159 cfs_rq = cfs_rq_of(se);
5160 if (!cfs_rq->runtime_enabled)
5162 if (cfs_rq->runtime_remaining <= 0)
5170 /* check whether cfs_rq, or any parent, is throttled */
5171 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5173 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5177 * Ensure that neither of the group entities corresponding to src_cpu or
5178 * dest_cpu are members of a throttled hierarchy when performing group
5179 * load-balance operations.
5181 static inline int throttled_lb_pair(struct task_group *tg,
5182 int src_cpu, int dest_cpu)
5184 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5186 src_cfs_rq = tg->cfs_rq[src_cpu];
5187 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5189 return throttled_hierarchy(src_cfs_rq) ||
5190 throttled_hierarchy(dest_cfs_rq);
5193 /* updated child weight may affect parent so we have to do this bottom up */
5194 static int tg_unthrottle_up(struct task_group *tg, void *data)
5196 struct rq *rq = data;
5197 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5199 cfs_rq->throttle_count--;
5201 if (!cfs_rq->throttle_count) {
5202 /* adjust cfs_rq_clock_task() */
5203 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
5204 cfs_rq->throttled_clock_task;
5211 static int tg_throttle_down(struct task_group *tg, void *data)
5213 struct rq *rq = data;
5214 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5216 /* group is entering throttled state, stop time */
5217 if (!cfs_rq->throttle_count)
5218 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5219 cfs_rq->throttle_count++;
5224 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
5226 struct rq *rq = rq_of(cfs_rq);
5227 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5228 struct sched_entity *se;
5229 long task_delta, dequeue = 1;
5232 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5234 /* freeze hierarchy runnable averages while throttled */
5236 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5239 task_delta = cfs_rq->h_nr_running;
5240 for_each_sched_entity(se) {
5241 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5242 /* throttled entity or throttle-on-deactivate */
5247 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5248 qcfs_rq->h_nr_running -= task_delta;
5249 dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
5251 if (qcfs_rq->load.weight)
5256 sub_nr_running(rq, task_delta);
5257 dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
5260 cfs_rq->throttled = 1;
5261 cfs_rq->throttled_clock = rq_clock(rq);
5262 raw_spin_lock(&cfs_b->lock);
5263 empty = list_empty(&cfs_b->throttled_cfs_rq);
5266 * Add to the _head_ of the list, so that an already-started
5267 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
5268 * not running, add to the tail so that later runqueues don't get starved.
5270 if (cfs_b->distribute_running)
5271 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5273 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5276 * If we're the first throttled task, make sure the bandwidth timer is running.
5280 start_cfs_bandwidth(cfs_b);
5282 raw_spin_unlock(&cfs_b->lock);
5284 /* Log effect on hmp stats after throttling */
5285 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5286 sched_irqload(cpu_of(rq)),
5287 power_cost(cpu_of(rq), 0),
5288 cpu_temp(cpu_of(rq)));
5291 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5293 struct rq *rq = rq_of(cfs_rq);
5294 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5295 struct sched_entity *se;
5298 struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
5300 se = cfs_rq->tg->se[cpu_of(rq)];
5302 cfs_rq->throttled = 0;
5304 update_rq_clock(rq);
5306 raw_spin_lock(&cfs_b->lock);
5307 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5308 list_del_rcu(&cfs_rq->throttled_list);
5309 raw_spin_unlock(&cfs_b->lock);
5311 /* update hierarchical throttle state */
5312 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5314 if (!cfs_rq->load.weight)
5317 task_delta = cfs_rq->h_nr_running;
5318 for_each_sched_entity(se) {
5322 cfs_rq = cfs_rq_of(se);
5324 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
5325 cfs_rq->h_nr_running += task_delta;
5326 inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
5328 if (cfs_rq_throttled(cfs_rq))
5333 add_nr_running(rq, task_delta);
5334 inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
5337 /* determine whether we need to wake up potentially idle cpu */
5338 if (rq->curr == rq->idle && rq->cfs.nr_running)
5341 /* Log effect on hmp stats after un-throttling */
5342 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5343 sched_irqload(cpu_of(rq)),
5344 power_cost(cpu_of(rq), 0),
5345 cpu_temp(cpu_of(rq)));
5348 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
5349 u64 remaining, u64 expires)
5351 struct cfs_rq *cfs_rq;
5353 u64 starting_runtime = remaining;
5356 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5358 struct rq *rq = rq_of(cfs_rq);
5360 raw_spin_lock(&rq->lock);
5361 if (!cfs_rq_throttled(cfs_rq))
5364 runtime = -cfs_rq->runtime_remaining + 1;
5365 if (runtime > remaining)
5366 runtime = remaining;
5367 remaining -= runtime;
5369 cfs_rq->runtime_remaining += runtime;
5370 cfs_rq->runtime_expires = expires;
5372 /* we check whether we're throttled above */
5373 if (cfs_rq->runtime_remaining > 0)
5374 unthrottle_cfs_rq(cfs_rq);
5377 raw_spin_unlock(&rq->lock);
5384 return starting_runtime - remaining;
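/*
 * To summarize the loop above: each throttled cfs_rq on the list is topped
 * up just past zero (runtime = -runtime_remaining + 1) and unthrottled,
 * until the budget passed in as 'remaining' runs out; the return value is
 * how much of that budget was actually handed out.
 */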
5388 * Responsible for refilling a task_group's bandwidth and unthrottling its
5389 * cfs_rqs as appropriate. If there has been no activity within the last
5390 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5391 * used to track this state.
5393 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5395 u64 runtime, runtime_expires;
5398 /* no need to continue the timer with no bandwidth constraint */
5399 if (cfs_b->quota == RUNTIME_INF)
5400 goto out_deactivate;
5402 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5403 cfs_b->nr_periods += overrun;
5406 * idle depends on !throttled (for the case of a large deficit), and if
5407 * we're going inactive then everything else can be deferred
5409 if (cfs_b->idle && !throttled)
5410 goto out_deactivate;
5412 __refill_cfs_bandwidth_runtime(cfs_b);
5415 /* mark as potentially idle for the upcoming period */
5420 /* account preceding periods in which throttling occurred */
5421 cfs_b->nr_throttled += overrun;
5423 runtime_expires = cfs_b->runtime_expires;
5426 * This check is repeated as we are holding onto the new bandwidth while
5427 * we unthrottle. This can potentially race with an unthrottled group
5428 * trying to acquire new bandwidth from the global pool. This can result
5429 * in us over-using our runtime if it is all used during this loop, but
5430 * only by limited amounts in that extreme case.
5432 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
5433 runtime = cfs_b->runtime;
5434 cfs_b->distribute_running = 1;
5435 raw_spin_unlock(&cfs_b->lock);
5436 /* we can't nest cfs_b->lock while distributing bandwidth */
5437 runtime = distribute_cfs_runtime(cfs_b, runtime,
5439 raw_spin_lock(&cfs_b->lock);
5441 cfs_b->distribute_running = 0;
5442 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5444 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5448 * While we are ensured activity in the period following an
5449 * unthrottle, this also covers the case in which the new bandwidth is
5450 * insufficient to cover the existing bandwidth deficit. (Forcing the
5451 * timer to remain active while there are any throttled entities.)
5461 /* a cfs_rq won't donate quota below this amount */
5462 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5463 /* minimum remaining period time to redistribute slack quota */
5464 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5465 /* how long we wait to gather additional slack before distributing */
5466 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
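/*
 * Rough sketch of how these three constants interact (see the slack path
 * below): on its final dequeue a cfs_rq keeps min_cfs_rq_runtime (1ms) and
 * returns the rest to the global pool; if the pool then exceeds a slice and
 * something is throttled, the slack timer is armed for
 * cfs_bandwidth_slack_period (5ms), unless a period refresh is due within
 * min_bandwidth_expiration (2ms) anyway.
 */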
5469 * Are we near the end of the current quota period?
5471 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
5472 * hrtimer base being cleared by hrtimer_start. In the case of
5473 * migrate_hrtimers, base is never cleared, so we are fine.
5475 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5477 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5480 /* if the call-back is running a quota refresh is already occurring */
5481 if (hrtimer_callback_running(refresh_timer))
5484 /* is a quota refresh about to occur? */
5485 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5486 if (remaining < min_expire)
5492 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5494 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5496 /* if there's a quota refresh soon don't bother with slack */
5497 if (runtime_refresh_within(cfs_b, min_left))
5500 hrtimer_start(&cfs_b->slack_timer,
5501 ns_to_ktime(cfs_bandwidth_slack_period),
5505 /* we know any runtime found here is valid as update_curr() precedes return */
5506 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5508 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5509 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5511 if (slack_runtime <= 0)
5514 raw_spin_lock(&cfs_b->lock);
5515 if (cfs_b->quota != RUNTIME_INF &&
5516 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
5517 cfs_b->runtime += slack_runtime;
5519 /* we are under rq->lock, defer unthrottling using a timer */
5520 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5521 !list_empty(&cfs_b->throttled_cfs_rq))
5522 start_cfs_slack_bandwidth(cfs_b);
5524 raw_spin_unlock(&cfs_b->lock);
5526 /* even if it's not valid for return we don't want to try again */
5527 cfs_rq->runtime_remaining -= slack_runtime;
5530 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5532 if (!cfs_bandwidth_used())
5535 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5538 __return_cfs_rq_runtime(cfs_rq);
5542 * This is done with a timer (instead of inline with bandwidth return) since
5543 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5545 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5547 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5550 /* confirm we're still not at a refresh boundary */
5551 raw_spin_lock(&cfs_b->lock);
5552 if (cfs_b->distribute_running) {
5553 raw_spin_unlock(&cfs_b->lock);
5557 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5558 raw_spin_unlock(&cfs_b->lock);
5562 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5563 runtime = cfs_b->runtime;
5565 expires = cfs_b->runtime_expires;
5567 cfs_b->distribute_running = 1;
5569 raw_spin_unlock(&cfs_b->lock);
5574 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
5576 raw_spin_lock(&cfs_b->lock);
5577 if (expires == cfs_b->runtime_expires)
5578 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5579 cfs_b->distribute_running = 0;
5580 raw_spin_unlock(&cfs_b->lock);
5584 * When a group wakes up we want to make sure that its quota is not already
5585 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
5586 * runtime as update_curr() throttling cannot trigger until it's on-rq.
5588 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5590 if (!cfs_bandwidth_used())
5593 /* Synchronize hierarchical throttle counter: */
5594 if (unlikely(!cfs_rq->throttle_uptodate)) {
5595 struct rq *rq = rq_of(cfs_rq);
5596 struct cfs_rq *pcfs_rq;
5597 struct task_group *tg;
5599 cfs_rq->throttle_uptodate = 1;
5601 /* Get closest up-to-date node, because leaves go first: */
5602 for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
5603 pcfs_rq = tg->cfs_rq[cpu_of(rq)];
5604 if (pcfs_rq->throttle_uptodate)
5608 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5609 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5613 /* an active group must be handled by the update_curr()->put() path */
5614 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5617 /* ensure the group is not already throttled */
5618 if (cfs_rq_throttled(cfs_rq))
5621 /* update runtime allocation */
5622 account_cfs_rq_runtime(cfs_rq, 0);
5623 if (cfs_rq->runtime_remaining <= 0)
5624 throttle_cfs_rq(cfs_rq);
5627 /* conditionally throttle active cfs_rq's from put_prev_entity() */
5628 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5630 if (!cfs_bandwidth_used())
5633 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5637 * it's possible for a throttled entity to be forced into a running
5638 * state (e.g. set_curr_task); in this case we're finished.
5640 if (cfs_rq_throttled(cfs_rq))
5643 throttle_cfs_rq(cfs_rq);
5647 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5649 struct cfs_bandwidth *cfs_b =
5650 container_of(timer, struct cfs_bandwidth, slack_timer);
5652 do_sched_cfs_slack_timer(cfs_b);
5654 return HRTIMER_NORESTART;
5657 extern const u64 max_cfs_quota_period;
5659 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5661 struct cfs_bandwidth *cfs_b =
5662 container_of(timer, struct cfs_bandwidth, period_timer);
5667 raw_spin_lock(&cfs_b->lock);
5669 overrun = hrtimer_forward_now(timer, cfs_b->period);
5674 u64 new, old = ktime_to_ns(cfs_b->period);
5676 new = (old * 147) / 128; /* ~115% */
5677 new = min(new, max_cfs_quota_period);
5679 cfs_b->period = ns_to_ktime(new);
5681 /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
5682 cfs_b->quota *= new;
5683 cfs_b->quota = div64_u64(cfs_b->quota, old);
5685 pr_warn_ratelimited(
5686 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
5688 div_u64(new, NSEC_PER_USEC),
5689 div_u64(cfs_b->quota, NSEC_PER_USEC));
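/*
 * For illustration: a 1ms period becomes 1ms * 147/128 ~= 1.15ms and the
 * quota is scaled by the same 147/128 factor, so the permitted bandwidth
 * (quota/period) is unchanged while the timer fires less often.
 */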
5691 /* reset count so we don't come right back in here */
5695 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5698 cfs_b->period_active = 0;
5699 raw_spin_unlock(&cfs_b->lock);
5701 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5704 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5706 raw_spin_lock_init(&cfs_b->lock);
5708 cfs_b->quota = RUNTIME_INF;
5709 cfs_b->period = ns_to_ktime(default_cfs_period());
5711 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5712 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5713 cfs_b->period_timer.function = sched_cfs_period_timer;
5714 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5715 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5716 cfs_b->distribute_running = 0;
5719 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5721 cfs_rq->runtime_enabled = 0;
5722 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5723 init_cfs_rq_hmp_stats(cfs_rq);
5726 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5728 lockdep_assert_held(&cfs_b->lock);
5730 if (!cfs_b->period_active) {
5731 cfs_b->period_active = 1;
5732 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5733 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5737 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5739 /* init_cfs_bandwidth() was not called */
5740 if (!cfs_b->throttled_cfs_rq.next)
5743 hrtimer_cancel(&cfs_b->period_timer);
5744 hrtimer_cancel(&cfs_b->slack_timer);
5747 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5749 struct cfs_rq *cfs_rq;
5751 for_each_leaf_cfs_rq(rq, cfs_rq) {
5752 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
5754 raw_spin_lock(&cfs_b->lock);
5755 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5756 raw_spin_unlock(&cfs_b->lock);
5760 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5762 struct cfs_rq *cfs_rq;
5764 for_each_leaf_cfs_rq(rq, cfs_rq) {
5765 if (!cfs_rq->runtime_enabled)
5769 * clock_task is not advancing so we just need to make sure
5770 * there's some valid quota amount
5772 cfs_rq->runtime_remaining = 1;
5774 * Offline rq is schedulable till cpu is completely disabled
5775 * in take_cpu_down(), so we prevent new cfs throttling here.
5777 cfs_rq->runtime_enabled = 0;
5779 if (cfs_rq_throttled(cfs_rq))
5780 unthrottle_cfs_rq(cfs_rq);
5784 #else /* CONFIG_CFS_BANDWIDTH */
5785 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5787 return rq_clock_task(rq_of(cfs_rq));
5790 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5791 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5792 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5793 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5795 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5800 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5805 static inline int throttled_lb_pair(struct task_group *tg,
5806 int src_cpu, int dest_cpu)
5811 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5813 #ifdef CONFIG_FAIR_GROUP_SCHED
5814 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5817 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5821 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5822 static inline void update_runtime_enabled(struct rq *rq) {}
5823 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5825 #endif /* CONFIG_CFS_BANDWIDTH */
5827 /**************************************************
5828 * CFS operations on tasks:
5831 #ifdef CONFIG_SCHED_HRTICK
5832 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5834 struct sched_entity *se = &p->se;
5835 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5837 WARN_ON(task_rq(p) != rq);
5839 if (rq->cfs.h_nr_running > 1) {
5840 u64 slice = sched_slice(cfs_rq, se);
5841 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5842 s64 delta = slice - ran;
5849 hrtick_start(rq, delta);
5854 * called from enqueue/dequeue and updates the hrtick when the
5855 * current task is from our class.
5857 static void hrtick_update(struct rq *rq)
5859 struct task_struct *curr = rq->curr;
5861 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5864 hrtick_start_fair(rq, curr);
5866 #else /* !CONFIG_SCHED_HRTICK */
5868 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5872 static inline void hrtick_update(struct rq *rq)
5878 static bool __cpu_overutilized(int cpu, int delta);
5879 static bool cpu_overutilized(int cpu);
5880 unsigned long boosted_cpu_util(int cpu);
5882 #define boosted_cpu_util(cpu) cpu_util_freq(cpu)
5886 * The enqueue_task method is called before nr_running is
5887 * increased. Here we update the fair scheduling stats and
5888 * then put the task into the rbtree:
5891 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5893 struct cfs_rq *cfs_rq;
5894 struct sched_entity *se = &p->se;
5896 int task_new = flags & ENQUEUE_WAKEUP_NEW;
5900 * If in_iowait is set, the code below may not trigger any cpufreq
5901 * utilization updates, so do it here explicitly with the IOWAIT flag
5905 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
5907 for_each_sched_entity(se) {
5910 cfs_rq = cfs_rq_of(se);
5911 enqueue_entity(cfs_rq, se, flags);
5914 * end evaluation on encountering a throttled cfs_rq
5916 * note: in the case of encountering a throttled cfs_rq we will
5917 * post the final h_nr_running increment below.
5919 if (cfs_rq_throttled(cfs_rq))
5921 cfs_rq->h_nr_running++;
5922 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5924 flags = ENQUEUE_WAKEUP;
5927 for_each_sched_entity(se) {
5928 cfs_rq = cfs_rq_of(se);
5929 cfs_rq->h_nr_running++;
5930 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5932 if (cfs_rq_throttled(cfs_rq))
5935 update_load_avg(se, UPDATE_TG);
5936 update_cfs_shares(se);
5940 add_nr_running(rq, 1);
5941 inc_rq_hmp_stats(rq, p, 1);
5947 * Update SchedTune accounting.
5949 * We do it before updating the CPU capacity to ensure the
5950 * boost value of the current task is accounted for in the
5951 * selection of the OPP.
5953 * We do it also in the case where we enqueue a throttled task;
5954 * we could argue that a throttled task should not boost a CPU; however:
5956 * a) properly implementing CPU boosting considering throttled
5957 * tasks would greatly increase the complexity of the solution
5958 * b) it's not easy to quantify the benefits introduced by
5959 * such a more complex solution.
5960 * Thus, for the time being we go for the simple solution and boost
5961 * also for throttled RQs.
5963 schedtune_enqueue_task(p, cpu_of(rq));
5965 if (energy_aware() && !se) {
5966 if (!task_new && !rq->rd->overutilized &&
5967 cpu_overutilized(rq->cpu)) {
5968 rq->rd->overutilized = true;
5969 trace_sched_overutilized(true);
5973 #endif /* CONFIG_SMP */
5977 static void set_next_buddy(struct sched_entity *se);
5980 * The dequeue_task method is called before nr_running is
5981 * decreased. We remove the task from the rbtree and
5982 * update the fair scheduling stats:
5984 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5986 struct cfs_rq *cfs_rq;
5987 struct sched_entity *se = &p->se;
5988 int task_sleep = flags & DEQUEUE_SLEEP;
5990 for_each_sched_entity(se) {
5991 cfs_rq = cfs_rq_of(se);
5992 dequeue_entity(cfs_rq, se, flags);
5995 * end evaluation on encountering a throttled cfs_rq
5997 * note: in the case of encountering a throttled cfs_rq we will
5998 * post the final h_nr_running decrement below.
6000 if (cfs_rq_throttled(cfs_rq))
6002 cfs_rq->h_nr_running--;
6003 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
6005 /* Don't dequeue parent if it has other entities besides us */
6006 if (cfs_rq->load.weight) {
6007 /* Avoid re-evaluating load for this entity: */
6008 se = parent_entity(se);
6010 * Bias pick_next to pick a task from this cfs_rq, as
6011 * p is sleeping when it is within its sched_slice.
6013 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
6017 flags |= DEQUEUE_SLEEP;
6020 for_each_sched_entity(se) {
6021 cfs_rq = cfs_rq_of(se);
6022 cfs_rq->h_nr_running--;
6023 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
6025 if (cfs_rq_throttled(cfs_rq))
6028 update_load_avg(se, UPDATE_TG);
6029 update_cfs_shares(se);
6033 sub_nr_running(rq, 1);
6034 dec_rq_hmp_stats(rq, p, 1);
6040 * Update SchedTune accounting
6042 * We do it before updating the CPU capacity to ensure the
6043 * boost value of the current task is accounted for in the
6044 * selection of the OPP.
6046 schedtune_dequeue_task(p, cpu_of(rq));
6048 #endif /* CONFIG_SMP */
6056 * per rq 'load' array crap; XXX kill this.
6060 * The exact cpuload at various idx values, calculated at every tick would be
6061 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
6063 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
6064 * on nth tick when cpu may be busy, then we have:
6065 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6066 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
6068 * decay_load_missed() below does efficient calculation of
6069 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6070 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
6072 * The calculation is approximated on a 128 point scale.
6073 * degrade_zero_ticks is the number of ticks after which load at any
6074 * particular idx is approximated to be zero.
6075 * degrade_factor is a precomputed table, a row for each load idx.
6076 * Each column corresponds to degradation factor for a power of two ticks,
6077 * based on 128 point scale.
6079 * row 2, col 3 (=12) says that the degradation at load idx 2 after
6080 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
6082 * With this power of 2 load factors, we can degrade the load n times
6083 * by looking at 1 bits in n and doing as many mult/shift instead of
6084 * n mult/shifts needed by the exact degradation.
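/*
 * Worked example, using the table below: at load idx 2 with 5 missed ticks
 * (binary 101), load is scaled by degrade_factor[2][0] * degrade_factor[2][2]
 * = (96/128) * (40/128) ~= 0.234, close to the exact (3/4)^5 ~= 0.237.
 */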
6086 #define DEGRADE_SHIFT 7
6087 static const unsigned char
6088 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
6089 static const unsigned char
6090 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
6091 {0, 0, 0, 0, 0, 0, 0, 0},
6092 {64, 32, 8, 0, 0, 0, 0, 0},
6093 {96, 72, 40, 12, 1, 0, 0},
6094 {112, 98, 75, 43, 15, 1, 0},
6095 {120, 112, 98, 76, 45, 16, 2} };
6098 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
6099 * case is when the CPU was idle, so we just decay the old load without
6100 * adding any new load.
6102 static unsigned long
6103 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
6107 if (!missed_updates)
6110 if (missed_updates >= degrade_zero_ticks[idx])
6114 return load >> missed_updates;
6116 while (missed_updates) {
6117 if (missed_updates % 2)
6118 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
6120 missed_updates >>= 1;
6127 * Update rq->cpu_load[] statistics. This function is usually called every
6128 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
6129 * every tick. We fix it up based on jiffies.
6131 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
6132 unsigned long pending_updates)
6136 this_rq->nr_load_updates++;
6138 /* Update our load: */
6139 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
6140 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
6141 unsigned long old_load, new_load;
6143 /* scale is effectively 1 << i now, and >> i divides by scale */
6145 old_load = this_rq->cpu_load[i];
6146 old_load = decay_load_missed(old_load, pending_updates - 1, i);
6147 new_load = this_load;
6149 * Round up the averaging division if load is increasing. This
6150 * prevents us from getting stuck on 9 if the load is 10, for example.
6153 if (new_load > old_load)
6154 new_load += scale - 1;
6156 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
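/*
 * Tiny example of that rounding, at idx 1 (scale 2): old_load 9 and
 * new_load 10 would average to (9 + 10) >> 1 = 9 forever; bumping new_load
 * to 11 gives (9 + 11) >> 1 = 10, so the average can actually reach the
 * higher load.
 */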
6159 sched_avg_update(this_rq);
6162 /* Used instead of source_load when we know the type == 0 */
6163 static unsigned long weighted_cpuload(const int cpu)
6165 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
6168 #ifdef CONFIG_NO_HZ_COMMON
6170 * There is no sane way to deal with nohz on smp when using jiffies because the
6171 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
6172 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
6174 * Therefore we cannot use the delta approach from the regular tick since that
6175 * would seriously skew the load calculation. However we'll make do for those
6176 * updates happening while idle (nohz_idle_balance) or coming out of idle
6177 * (tick_nohz_idle_exit).
6179 * This means we might still be one tick off for nohz periods.
6183 * Called from nohz_idle_balance() to update the load ratings before doing the idle balance.
6186 static void update_idle_cpu_load(struct rq *this_rq)
6188 unsigned long curr_jiffies = READ_ONCE(jiffies);
6189 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6190 unsigned long pending_updates;
6193 * bail if there's load or we're actually up-to-date.
6195 if (load || curr_jiffies == this_rq->last_load_update_tick)
6198 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6199 this_rq->last_load_update_tick = curr_jiffies;
6201 __update_cpu_load(this_rq, load, pending_updates);
6205 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
6207 void update_cpu_load_nohz(void)
6209 struct rq *this_rq = this_rq();
6210 unsigned long curr_jiffies = READ_ONCE(jiffies);
6211 unsigned long pending_updates;
6213 if (curr_jiffies == this_rq->last_load_update_tick)
6216 raw_spin_lock(&this_rq->lock);
6217 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6218 if (pending_updates) {
6219 this_rq->last_load_update_tick = curr_jiffies;
6221 * We were idle, this means load 0, the current load might be
6222 * !0 due to remote wakeups and the sort.
6224 __update_cpu_load(this_rq, 0, pending_updates);
6226 raw_spin_unlock(&this_rq->lock);
6228 #endif /* CONFIG_NO_HZ */
6231 * Called from scheduler_tick()
6233 void update_cpu_load_active(struct rq *this_rq)
6235 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6237 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
6239 this_rq->last_load_update_tick = jiffies;
6240 __update_cpu_load(this_rq, load, 1);
6244 * Return a low guess at the load of a migration-source cpu weighted
6245 * according to the scheduling class and "nice" value.
6247 * We want to under-estimate the load of migration sources, to
6248 * balance conservatively.
6250 static unsigned long source_load(int cpu, int type)
6252 struct rq *rq = cpu_rq(cpu);
6253 unsigned long total = weighted_cpuload(cpu);
6255 if (type == 0 || !sched_feat(LB_BIAS))
6258 return min(rq->cpu_load[type-1], total);
6262 * Return a high guess at the load of a migration-target cpu weighted
6263 * according to the scheduling class and "nice" value.
6265 static unsigned long target_load(int cpu, int type)
6267 struct rq *rq = cpu_rq(cpu);
6268 unsigned long total = weighted_cpuload(cpu);
6270 if (type == 0 || !sched_feat(LB_BIAS))
6273 return max(rq->cpu_load[type-1], total);
6277 static unsigned long cpu_avg_load_per_task(int cpu)
6279 struct rq *rq = cpu_rq(cpu);
6280 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
6281 unsigned long load_avg = weighted_cpuload(cpu);
6284 return load_avg / nr_running;
6289 static void record_wakee(struct task_struct *p)
6292 * Rough decay (wiping) for cost saving, don't worry
6293 * about the boundary; a really active task won't care
6296 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6297 current->wakee_flips >>= 1;
6298 current->wakee_flip_decay_ts = jiffies;
6301 if (current->last_wakee != p) {
6302 current->last_wakee = p;
6303 current->wakee_flips++;
6307 static void task_waking_fair(struct task_struct *p)
6309 struct sched_entity *se = &p->se;
6310 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6313 #ifndef CONFIG_64BIT
6314 u64 min_vruntime_copy;
6317 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6319 min_vruntime = cfs_rq->min_vruntime;
6320 } while (min_vruntime != min_vruntime_copy);
6322 min_vruntime = cfs_rq->min_vruntime;
6325 se->vruntime -= min_vruntime;
6329 #ifdef CONFIG_FAIR_GROUP_SCHED
6331 * effective_load() calculates the load change as seen from the root_task_group
6333 * Adding load to a group doesn't make a group heavier, but can cause movement
6334 * of group shares between cpus. Assuming the shares were perfectly aligned one
6335 * can calculate the shift in shares.
6337 * Calculate the effective load difference if @wl is added (subtracted) to @tg
6338 * on this @cpu and results in a total addition (subtraction) of @wg to the
6339 * total group weight.
6341 * Given a runqueue weight distribution (rw_i) we can compute a shares
6342 * distribution (s_i) using:
6344 * s_i = rw_i / \Sum rw_j (1)
6346 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
6347 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
6348 * shares distribution (s_i):
6350 * rw_i = { 2, 4, 1, 0 }
6351 * s_i = { 2/7, 4/7, 1/7, 0 }
6353 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
6354 * task used to run on and the CPU the waker is running on), we need to
6355 * compute the effect of waking a task on either CPU and, in case of a sync
6356 * wakeup, compute the effect of the current task going to sleep.
6358 * So for a change of @wl to the local @cpu with an overall group weight change
6359 * of @wl we can compute the new shares distribution (s'_i) using:
6361 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
6363 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
6364 * differences in waking a task to CPU 0. The additional task changes the
6365 * weight and shares distributions like:
6367 * rw'_i = { 3, 4, 1, 0 }
6368 * s'_i = { 3/8, 4/8, 1/8, 0 }
6370 * We can then compute the difference in effective weight by using:
6372 * dw_i = S * (s'_i - s_i) (3)
6374 * Where 'S' is the group weight as seen by its parent.
6376 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
6377 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
6378 * 4/7) times the weight of the group.
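/*
 * Checking the arithmetic of that example: 3/8 - 2/7 = (21 - 16)/56 = 5/56
 * and 4/8 - 4/7 = (28 - 32)/56 = -4/56, matching the figures quoted above.
 */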
6380 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6382 struct sched_entity *se = tg->se[cpu];
6384 if (!tg->parent) /* the trivial, non-cgroup case */
6387 for_each_sched_entity(se) {
6388 struct cfs_rq *cfs_rq = se->my_q;
6389 long W, w = cfs_rq_load_avg(cfs_rq);
6394 * W = @wg + \Sum rw_j
6396 W = wg + atomic_long_read(&tg->load_avg);
6398 /* Ensure \Sum rw_j >= rw_i */
6399 W -= cfs_rq->tg_load_avg_contrib;
6408 * wl = S * s'_i; see (2)
6411 wl = (w * (long)tg->shares) / W;
6416 * Per the above, wl is the new se->load.weight value; since
6417 * those are clipped to [MIN_SHARES, ...) do so now. See
6418 * calc_cfs_shares().
6420 if (wl < MIN_SHARES)
6424 * wl = dw_i = S * (s'_i - s_i); see (3)
6426 wl -= se->avg.load_avg;
6429 * Recursively apply this logic to all parent groups to compute
6430 * the final effective load change on the root group. Since
6431 * only the @tg group gets extra weight, all parent groups can
6432 * only redistribute existing shares. @wl is the shift in shares
6433 * resulting from this level per the above.
6442 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6450 * Returns the current capacity of cpu after applying both
6451 * cpu and freq scaling.
6453 unsigned long capacity_curr_of(int cpu)
6455 return cpu_rq(cpu)->cpu_capacity_orig *
6456 arch_scale_freq_capacity(NULL, cpu)
6457 >> SCHED_CAPACITY_SHIFT;
6461 struct sched_group *sg_top;
6462 struct sched_group *sg_cap;
6470 struct task_struct *task;
6484 static int cpu_util_wake(int cpu, struct task_struct *p);
6487 * __cpu_norm_util() returns the cpu util relative to a specific capacity,
6488 * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
6489 * energy calculations.
6491 * Since util is a scale-invariant utilization defined as:
6493 * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
6495 * the normalized util can be found using the specific capacity.
6497 * capacity = capacity_orig * curr_freq/max_freq
6499 * norm_util = running_time/time ~ util/capacity
6501 static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
6503 if (util >= capacity)
6504 return SCHED_CAPACITY_SCALE;
6506 return (util << SCHED_CAPACITY_SHIFT)/capacity;
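/*
 * Example: util 256 on a capacity of 512 gives (256 << SCHED_CAPACITY_SHIFT)
 * / 512 = 512, i.e. a 50% busy ratio expressed on the 1024-point
 * SCHED_CAPACITY_SCALE.
 */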
6509 static unsigned long group_max_util(struct energy_env *eenv)
6511 unsigned long max_util = 0;
6515 for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
6516 util = cpu_util_wake(cpu, eenv->task);
6519 * If we are looking at the target CPU specified by the eenv,
6520 * then we should add the (estimated) utilization of the task
6521 * assuming we will wake it up on that CPU.
6523 if (unlikely(cpu == eenv->trg_cpu))
6524 util += eenv->util_delta;
6526 max_util = max(max_util, util);
6533 * group_norm_util() returns the approximated group util relative to its
6534 * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
6535 * in energy calculations.
6537 * Since task executions may or may not overlap in time in the group the true
6538 * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
6539 * when iterating over all CPUs in the group.
6540 * The latter estimate is used as it leads to a more pessimistic energy
6541 * estimate (more busy).
6544 long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
6546 unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
6547 unsigned long util, util_sum = 0;
6550 for_each_cpu(cpu, sched_group_cpus(sg)) {
6551 util = cpu_util_wake(cpu, eenv->task);
6554 * If we are looking at the target CPU specified by the eenv,
6555 * then we should add the (estimated) utilization of the task
6556 * assuming we will wake it up on that CPU.
6558 if (unlikely(cpu == eenv->trg_cpu))
6559 util += eenv->util_delta;
6561 util_sum += __cpu_norm_util(util, capacity);
6564 return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
6567 static int find_new_capacity(struct energy_env *eenv,
6568 const struct sched_group_energy * const sge)
6570 int idx, max_idx = sge->nr_cap_states - 1;
6571 unsigned long util = group_max_util(eenv);
6573 /* default is max_cap if we don't find a match */
6574 eenv->cap_idx = max_idx;
6576 for (idx = 0; idx < sge->nr_cap_states; idx++) {
6577 if (sge->cap_states[idx].cap >= util) {
6578 eenv->cap_idx = idx;
6583 return eenv->cap_idx;
6586 static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
6588 int i, state = INT_MAX;
6589 int src_in_grp, dst_in_grp;
6592 /* Find the shallowest idle state in the sched group. */
6593 for_each_cpu(i, sched_group_cpus(sg))
6594 state = min(state, idle_get_state_idx(cpu_rq(i)));
6596 /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
6599 src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
6600 dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
6601 if (src_in_grp == dst_in_grp) {
6602 /* both CPUs under consideration are in the same group or not in
6603 * either group, so migration should leave the idle state the same.
6609 * Try to estimate if a deeper idle state is
6610 * achievable when we move the task.
6612 for_each_cpu(i, sched_group_cpus(sg)) {
6613 grp_util += cpu_util_wake(i, eenv->task);
6614 if (unlikely(i == eenv->trg_cpu))
6615 grp_util += eenv->util_delta;
6619 ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
6620 /* after moving, this group is at most partly
6621 * occupied, so it should have some idle time.
6623 int max_idle_state_idx = sg->sge->nr_idle_states - 2;
6624 int new_state = grp_util * max_idle_state_idx;
6626 /* group will have no util, use lowest state */
6627 new_state = max_idle_state_idx + 1;
6629 /* for partially idle, linearly map util to idle
6630 * states, excluding the lowest one. This does not
6631 * correspond to the state we expect to enter in
6632 * reality, but gives an indication of what might happen.
6634 new_state = min(max_idle_state_idx, (int)
6635 (new_state / sg->sgc->max_capacity));
6636 new_state = max_idle_state_idx - new_state;
6640 /* After moving, the group will be fully occupied
6641 * so assume it will not be idle at all.
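/*
 * A rough numeric walk-through of the linear mapping above, with assumed
 * values (nr_idle_states = 3, hence max_idle_state_idx = 1, and
 * sgc->max_capacity = 1024):
 *
 *   grp_util = 0     ->  state = max_idle_state_idx + 1 = 2 (deepest listed)
 *   grp_util = 512   ->  512 * 1 / 1024 = 0   ->  state = 1 - 0 = 1
 *   grp_util = 1024  ->  min(1, 1024 / 1024)  ->  state = 1 - 1 = 0
 *
 * i.e. the more utilization the group is left with after the move, the
 * shallower the idle state we guess it can reach.
 */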
6650 * sched_group_energy(): Computes the absolute energy consumption of cpus
6651 * belonging to the sched_group including shared resources shared only by
6652 * members of the group. Iterates over all cpus in the hierarchy below the
6653 * sched_group starting from the bottom working its way up before going to
6654 * the next cpu until all cpus are covered at all levels. The current
6655 * implementation is likely to gather the same util statistics multiple times.
6656 * This can probably be done in a faster but more complex way.
6657 * Note: sched_group_energy() may fail when racing with sched_domain updates.
6659 static int sched_group_energy(struct energy_env *eenv)
6661 struct cpumask visit_cpus;
6662 u64 total_energy = 0;
6665 WARN_ON(!eenv->sg_top->sge);
6667 cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
6668 /* If a cpu is hotplugged in while we are in this function,
6669 * it does not appear in the existing visit_cpus mask
6670 * which came from the sched_group pointer of the
6671 * sched_domain pointed at by sd_ea for either the prev
6672 * or next cpu and was dereferenced in __energy_diff.
6673 * Since we will dereference sd_scs later as we iterate
6674 * through the CPUs we expect to visit, new CPUs can
6675 * be present which are not in the visit_cpus mask.
6676 * Guard this with cpu_count.
6678 cpu_count = cpumask_weight(&visit_cpus);
6680 while (!cpumask_empty(&visit_cpus)) {
6681 struct sched_group *sg_shared_cap = NULL;
6682 int cpu = cpumask_first(&visit_cpus);
6683 struct sched_domain *sd;
6686 * Is the group utilization affected by cpus outside this
6688 * This sd may have groups with cpus which were not present
6689 * when we took visit_cpus.
6691 sd = rcu_dereference(per_cpu(sd_scs, cpu));
6693 if (sd && sd->parent)
6694 sg_shared_cap = sd->parent->groups;
6696 for_each_domain(cpu, sd) {
6697 struct sched_group *sg = sd->groups;
6699 /* Has this sched_domain already been visited? */
6700 if (sd->child && group_first_cpu(sg) != cpu)
6704 unsigned long group_util;
6705 int sg_busy_energy, sg_idle_energy;
6706 int cap_idx, idle_idx;
6708 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
6709 eenv->sg_cap = sg_shared_cap;
6713 cap_idx = find_new_capacity(eenv, sg->sge);
6715 if (sg->group_weight == 1) {
6716 /* Remove capacity of src CPU (before task move) */
6717 if (eenv->trg_cpu == eenv->src_cpu &&
6718 cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
6719 eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
6720 eenv->cap.delta -= eenv->cap.before;
6722 /* Add capacity of dst CPU (after task move) */
6723 if (eenv->trg_cpu == eenv->dst_cpu &&
6724 cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
6725 eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
6726 eenv->cap.delta += eenv->cap.after;
6730 idle_idx = group_idle_state(eenv, sg);
6731 group_util = group_norm_util(eenv, sg);
6733 sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
6734 sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
6735 * sg->sge->idle_states[idle_idx].power);
6737 total_energy += sg_busy_energy + sg_idle_energy;
6741 * cpu_count here is the number of
6742 * cpus we expect to visit in this
6743 * calculation. If we race against
6744 * hotplug, we can have extra cpus
6745 * added to the groups we are
6746 * iterating which do not appear in
6747 * the visit_cpus mask. In that case
6748 * we are not able to calculate energy
6749 * without restarting so we will bail
6750 * out and use prev_cpu this time.
6754 cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
6758 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
6761 } while (sg = sg->next, sg != sd->groups);
6765 * If we raced with hotplug and got an sd NULL-pointer,
6766 * returning a wrong energy estimation is better than
6767 * entering an infinite loop.
6768 * Specifically: If a cpu is unplugged after we took
6769 * the visit_cpus mask, it no longer has an sd_scs
6770 * pointer, so when we dereference it, we get NULL.
6772 if (cpumask_test_cpu(cpu, &visit_cpus))
6775 cpumask_clear_cpu(cpu, &visit_cpus);
6779 eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
6783 static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
6785 return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
6788 static inline unsigned long task_util(struct task_struct *p);
6791 * energy_diff(): Estimate the energy impact of changing the utilization
6792 * distribution. eenv specifies the change: utilisation amount, source, and
6793 * destination cpu. Source or destination cpu may be -1 in which case the
6794 * utilization is removed from or added to the system (e.g. task wake-up). If
6795 * both are specified, the utilization is migrated.
6797 static inline int __energy_diff(struct energy_env *eenv)
6799 struct sched_domain *sd;
6800 struct sched_group *sg;
6801 int sd_cpu = -1, energy_before = 0, energy_after = 0;
6804 struct energy_env eenv_before = {
6805 .util_delta = task_util(eenv->task),
6806 .src_cpu = eenv->src_cpu,
6807 .dst_cpu = eenv->dst_cpu,
6808 .trg_cpu = eenv->src_cpu,
6809 .nrg = { 0, 0, 0, 0},
6814 if (eenv->src_cpu == eenv->dst_cpu)
6817 sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
6818 sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
6821 return 0; /* Error */
6826 if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
6827 eenv_before.sg_top = eenv->sg_top = sg;
6829 if (sched_group_energy(&eenv_before))
6830 return 0; /* Invalid result abort */
6831 energy_before += eenv_before.energy;
6833 /* Keep track of SRC cpu (before) capacity */
6834 eenv->cap.before = eenv_before.cap.before;
6835 eenv->cap.delta = eenv_before.cap.delta;
6837 if (sched_group_energy(eenv))
6838 return 0; /* Invalid result abort */
6839 energy_after += eenv->energy;
6841 } while (sg = sg->next, sg != sd->groups);
6843 eenv->nrg.before = energy_before;
6844 eenv->nrg.after = energy_after;
6845 eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
6847 #ifndef CONFIG_SCHED_TUNE
6848 trace_sched_energy_diff(eenv->task,
6849 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6850 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6851 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6852 eenv->nrg.delta, eenv->payoff);
6855 * Dead-zone margin preventing too many migrations.
6858 margin = eenv->nrg.before >> 6; /* ~1.56% */
6860 diff = eenv->nrg.after - eenv->nrg.before;
6862 eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
6864 return eenv->nrg.diff;
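/*
 * Example of the dead-zone above (numbers assumed): with nrg.before = 6400
 * the margin is 6400 >> 6 = 100 (~1.56%). An estimated delta of, say, 60
 * energy units falls inside the margin, so nrg.diff is reported as 0 and
 * the candidate move is treated as energy-neutral instead of justifying a
 * migration.
 */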
6867 #ifdef CONFIG_SCHED_TUNE
6869 struct target_nrg schedtune_target_nrg;
6871 #ifdef CONFIG_CGROUP_SCHEDTUNE
6872 extern bool schedtune_initialized;
6873 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6876 * System energy normalization
6877 * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
6878 * corresponding to the specified energy variation.
6881 normalize_energy(int energy_diff)
6885 #ifdef CONFIG_CGROUP_SCHEDTUNE
6886 /* during early setup, we don't know the extents */
6887 if (unlikely(!schedtune_initialized))
6888 return energy_diff < 0 ? -1 : 1 ;
6889 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6891 #ifdef CONFIG_SCHED_DEBUG
6895 /* Check for boundaries */
6896 max_delta = schedtune_target_nrg.max_power;
6897 max_delta -= schedtune_target_nrg.min_power;
6898 WARN_ON(abs(energy_diff) >= max_delta);
6902 /* Do scaling using positive numbers to increase the range */
6903 normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
6905 /* Scale by energy magnitude */
6906 normalized_nrg <<= SCHED_CAPACITY_SHIFT;
6908 /* Normalize on max energy for target platform */
6909 normalized_nrg = reciprocal_divide(
6910 normalized_nrg, schedtune_target_nrg.rdiv);
6912 return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
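/*
 * Worked example (values assumed; rdiv is taken to encode division by the
 * platform's max_power - min_power range): for energy_diff = 50 and a
 * range of 800,
 *
 *   normalized = (50 << SCHED_CAPACITY_SHIFT) / 800 = 51200 / 800 = 64
 *
 * with the sign of the original diff put back on the result.
 */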
6916 energy_diff(struct energy_env *eenv)
6918 int boost = schedtune_task_boost(eenv->task);
6921 /* Compute "absolute" energy diff */
6922 __energy_diff(eenv);
6924 /* Return energy diff when boost margin is 0 */
6926 trace_sched_energy_diff(eenv->task,
6927 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6928 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6929 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6930 0, -eenv->nrg.diff);
6931 return eenv->nrg.diff;
6934 /* Compute normalized energy diff */
6935 nrg_delta = normalize_energy(eenv->nrg.diff);
6936 eenv->nrg.delta = nrg_delta;
6938 eenv->payoff = schedtune_accept_deltas(
6943 trace_sched_energy_diff(eenv->task,
6944 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6945 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6946 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6947 eenv->nrg.delta, eenv->payoff);
6950 * When SchedTune is enabled, the energy_diff() function will return
6951 * the computed energy payoff value. Since the energy_diff() return
6952 * value is expected to be negative by its callers, this evaluation
6953 * function returns a negative value each time the evaluation returns a
6954 * positive payoff, which is the condition for the acceptance of
6955 * a scheduling decision.
6957 return -eenv->payoff;
6959 #else /* CONFIG_SCHED_TUNE */
6960 #define energy_diff(eenv) __energy_diff(eenv)
6964 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
6965 * A waker of many should wake a different task than the one last awakened
6966 * at a frequency roughly N times higher than one of its wakees. In order
6967 * to determine whether we should let the load spread vs consolidating to
6968 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
6969 * partner, and a factor of llc_size higher frequency in the other. With
6970 * both conditions met, we can be relatively sure that the relationship is
6971 * non-monogamous, with partner count exceeding socket size. Waker/wakee
6972 * being client/server, worker/dispatcher, interrupt source or whatever is
6973 * irrelevant; the spread criterion is simply that the apparent partner count exceeds the socket size.
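/*
 * Worked example of the flip heuristic (numbers assumed, llc_size = 4): a
 * dispatcher with wakee_flips = 40 waking a worker with wakee_flips = 6
 * satisfies both slave >= llc_size (6 >= 4) and master >= slave * llc_size
 * (40 >= 24), so the wakeup is treated as "wide" and we prefer spreading
 * over pulling the wakee onto the waker's LLC. A 1:1 pair with low flip
 * counts fails the first test and remains eligible for affine wakeups.
 */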
6975 static int wake_wide(struct task_struct *p, int sibling_count_hint)
6977 unsigned int master = current->wakee_flips;
6978 unsigned int slave = p->wakee_flips;
6979 int llc_size = this_cpu_read(sd_llc_size);
6981 if (sibling_count_hint >= llc_size)
6985 swap(master, slave);
6986 if (slave < llc_size || master < slave * llc_size)
6991 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6992 int prev_cpu, int sync)
6994 s64 this_load, load;
6995 s64 this_eff_load, prev_eff_load;
6997 struct task_group *tg;
6998 unsigned long weight;
7002 this_cpu = smp_processor_id();
7003 load = source_load(prev_cpu, idx);
7004 this_load = target_load(this_cpu, idx);
7007 * If sync wakeup then subtract the (maximum possible)
7008 * effect of the currently running task from the load
7009 * of the current CPU:
7012 tg = task_group(current);
7013 weight = current->se.avg.load_avg;
7015 this_load += effective_load(tg, this_cpu, -weight, -weight);
7016 load += effective_load(tg, prev_cpu, 0, -weight);
7020 weight = p->se.avg.load_avg;
7023 * In low-load situations, where prev_cpu is idle and this_cpu is idle
7024 * due to the sync cause above having dropped this_load to 0, we'll
7025 * always have an imbalance, but there's really nothing you can do
7026 * about that, so that's good too.
7028 * Otherwise check if either cpus are near enough in load to allow this
7029 * task to be woken on this_cpu.
7031 this_eff_load = 100;
7032 this_eff_load *= capacity_of(prev_cpu);
7034 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
7035 prev_eff_load *= capacity_of(this_cpu);
7037 if (this_load > 0) {
7038 this_eff_load *= this_load +
7039 effective_load(tg, this_cpu, weight, weight);
7041 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
7044 balanced = this_eff_load <= prev_eff_load;
7046 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
7051 schedstat_inc(sd, ttwu_move_affine);
7052 schedstat_inc(p, se.statistics.nr_wakeups_affine);
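/*
 * Sketch of the comparison above with assumed numbers (equal CPU
 * capacities of 1024, imbalance_pct = 125, effective_load() group terms
 * ignored):
 *
 *   this_eff_load = 100 * 1024 * this_load
 *   prev_eff_load = (100 + 25/2) * 1024 * prev_load = 112 * 1024 * prev_load
 *
 * so this_load = 220 vs prev_load = 200 gives 22000 <= 22400, i.e.
 * "balanced", and the affine wakeup onto this_cpu is allowed.
 */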
7057 static inline unsigned long task_util(struct task_struct *p)
7059 return p->se.avg.util_avg;
7062 static inline unsigned long boosted_task_util(struct task_struct *task);
7064 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
7066 unsigned long capacity = capacity_of(cpu);
7068 util += boosted_task_util(p);
7070 return (capacity * 1024) > (util * capacity_margin);
7073 static inline bool task_fits_max(struct task_struct *p, int cpu)
7075 unsigned long capacity = capacity_of(cpu);
7076 unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7078 if (capacity == max_capacity)
7081 if (capacity * capacity_margin > max_capacity * 1024)
7084 return __task_fits(p, cpu, 0);
7087 static bool __cpu_overutilized(int cpu, int delta)
7089 return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
7092 static bool cpu_overutilized(int cpu)
7094 return __cpu_overutilized(cpu, 0);
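/*
 * Worked example of the margin checks above, assuming capacity_margin =
 * 1280 (i.e. requiring ~25% headroom; the actual value is defined
 * elsewhere in this file):
 *
 *   capacity = 512, util = 400:  512 * 1024 = 524288 > 400 * 1280 = 512000  -> fits
 *   capacity = 512, util = 420:  524288 < 420 * 1280 = 537600               -> overutilized
 */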
7097 #ifdef CONFIG_SCHED_TUNE
7099 struct reciprocal_value schedtune_spc_rdiv;
7102 schedtune_margin(unsigned long signal, long boost)
7104 long long margin = 0;
7107 * Signal proportional compensation (SPC)
7109 * The Boost (B) value is used to compute a Margin (M) which is
7110 * proportional to the complement of the original Signal (S):
7111 * M = B * (SCHED_CAPACITY_SCALE - S)
7112 * The obtained M could be used by the caller to "boost" S.
7115 margin = SCHED_CAPACITY_SCALE - signal;
7118 margin = -signal * boost;
7120 margin = reciprocal_divide(margin, schedtune_spc_rdiv);
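/*
 * Worked example for the boost >= 0 case above (values assumed; boost is
 * a percentage and schedtune_spc_rdiv is taken to encode division by 100):
 *
 *   S = 256, B = 50:  M = 50 * (1024 - 256) / 100 = 384
 *
 * so a boosted task with util 256 is treated as if it had util
 * 256 + 384 = 640, nudging placement and OPP selection towards higher
 * capacity.
 */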
7128 schedtune_cpu_margin(unsigned long util, int cpu)
7130 int boost = schedtune_cpu_boost(cpu);
7135 return schedtune_margin(util, boost);
7139 schedtune_task_margin(struct task_struct *task)
7141 int boost = schedtune_task_boost(task);
7148 util = task_util(task);
7149 margin = schedtune_margin(util, boost);
7154 #else /* CONFIG_SCHED_TUNE */
7157 schedtune_cpu_margin(unsigned long util, int cpu)
7163 schedtune_task_margin(struct task_struct *task)
7168 #endif /* CONFIG_SCHED_TUNE */
7171 boosted_cpu_util(int cpu)
7173 unsigned long util = cpu_util_freq(cpu);
7174 long margin = schedtune_cpu_margin(util, cpu);
7176 trace_sched_boost_cpu(cpu, util, margin);
7178 return util + margin;
7181 static inline unsigned long
7182 boosted_task_util(struct task_struct *task)
7184 unsigned long util = task_util(task);
7185 long margin = schedtune_task_margin(task);
7187 trace_sched_boost_task(task, util, margin);
7189 return util + margin;
7192 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
7194 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
7198 * find_idlest_group finds and returns the least busy CPU group within the
7201 * Assumes p is allowed on at least one CPU in sd.
7203 static struct sched_group *
7204 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
7205 int this_cpu, int sd_flag)
7207 struct sched_group *idlest = NULL, *group = sd->groups;
7208 struct sched_group *most_spare_sg = NULL;
7209 unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
7210 unsigned long most_spare = 0, this_spare = 0;
7211 int load_idx = sd->forkexec_idx;
7212 int imbalance = 100 + (sd->imbalance_pct-100)/2;
7214 if (sd_flag & SD_BALANCE_WAKE)
7215 load_idx = sd->wake_idx;
7218 unsigned long load, avg_load, spare_cap, max_spare_cap;
7222 /* Skip over this group if it has no CPUs allowed */
7223 if (!cpumask_intersects(sched_group_cpus(group),
7224 tsk_cpus_allowed(p)))
7227 local_group = cpumask_test_cpu(this_cpu,
7228 sched_group_cpus(group));
7231 * Tally up the load of all CPUs in the group and find
7232 * the group containing the CPU with most spare capacity.
7237 for_each_cpu(i, sched_group_cpus(group)) {
7238 /* Bias balancing toward cpus of our domain */
7240 load = source_load(i, load_idx);
7242 load = target_load(i, load_idx);
7246 spare_cap = capacity_spare_wake(i, p);
7248 if (spare_cap > max_spare_cap)
7249 max_spare_cap = spare_cap;
7252 /* Adjust by relative CPU capacity of the group */
7253 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
7256 this_load = avg_load;
7257 this_spare = max_spare_cap;
7259 if (avg_load < min_load) {
7260 min_load = avg_load;
7264 if (most_spare < max_spare_cap) {
7265 most_spare = max_spare_cap;
7266 most_spare_sg = group;
7269 } while (group = group->next, group != sd->groups);
7272 * The cross-over point between using spare capacity or least load
7273 * is too conservative for high utilization tasks on partially
7274 * utilized systems if we require spare_capacity > task_util(p),
7275 * so we allow for some task stuffing by using
7276 * spare_capacity > task_util(p)/2.
7278 * Spare capacity can't be used for fork because the utilization has
7279 * not been set yet, we must first select a rq to compute the initial
7282 if (sd_flag & SD_BALANCE_FORK)
7285 if (this_spare > task_util(p) / 2 &&
7286 imbalance*this_spare > 100*most_spare)
7288 else if (most_spare > task_util(p) / 2)
7289 return most_spare_sg;
7292 if (!idlest || 100*this_load < imbalance*min_load)
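/*
 * Worked example of the spare-capacity crossover above (numbers assumed,
 * imbalance = 112): with task_util(p) = 200, this_spare = 150 and
 * most_spare = 160, the local group wins because 150 > 200/2 and
 * 112 * 150 = 16800 > 100 * 160 = 16000; only a clearly larger remote
 * spare capacity (or a much lower min_load) overrides the local choice.
 */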
7298 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
7301 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7303 unsigned long load, min_load = ULONG_MAX;
7304 unsigned int min_exit_latency = UINT_MAX;
7305 u64 latest_idle_timestamp = 0;
7306 int least_loaded_cpu = this_cpu;
7307 int shallowest_idle_cpu = -1;
7310 /* Check if we have any choice: */
7311 if (group->group_weight == 1)
7312 return cpumask_first(sched_group_cpus(group));
7314 /* Traverse only the allowed CPUs */
7315 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
7317 struct rq *rq = cpu_rq(i);
7318 struct cpuidle_state *idle = idle_get_state(rq);
7319 if (idle && idle->exit_latency < min_exit_latency) {
7321 * We give priority to a CPU whose idle state
7322 * has the smallest exit latency irrespective
7323 * of any idle timestamp.
7325 min_exit_latency = idle->exit_latency;
7326 latest_idle_timestamp = rq->idle_stamp;
7327 shallowest_idle_cpu = i;
7328 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7329 rq->idle_stamp > latest_idle_timestamp) {
7331 * If equal or no active idle state, then
7332 * the most recently idled CPU might have
7335 latest_idle_timestamp = rq->idle_stamp;
7336 shallowest_idle_cpu = i;
7338 } else if (shallowest_idle_cpu == -1) {
7339 load = weighted_cpuload(i);
7340 if (load < min_load || (load == min_load && i == this_cpu)) {
7342 least_loaded_cpu = i;
7347 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7350 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
7351 int cpu, int prev_cpu, int sd_flag)
7354 int wu = sd_flag & SD_BALANCE_WAKE;
7358 schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
7359 schedstat_inc(this_rq(), eas_stats.cas_attempts);
7362 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
7366 struct sched_group *group;
7367 struct sched_domain *tmp;
7371 schedstat_inc(sd, eas_stats.cas_attempts);
7373 if (!(sd->flags & sd_flag)) {
7378 group = find_idlest_group(sd, p, cpu, sd_flag);
7384 new_cpu = find_idlest_group_cpu(group, p, cpu);
7385 if (new_cpu == cpu) {
7386 /* Now try balancing at a lower domain level of cpu */
7391 /* Now try balancing at a lower domain level of new_cpu */
7392 cpu = cas_cpu = new_cpu;
7393 weight = sd->span_weight;
7395 for_each_domain(cpu, tmp) {
7396 if (weight <= tmp->span_weight)
7398 if (tmp->flags & sd_flag)
7401 /* while loop will break here if sd == NULL */
7404 if (wu && (cas_cpu >= 0)) {
7405 schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
7406 schedstat_inc(this_rq(), eas_stats.cas_count);
7413 * Try and locate an idle CPU in the sched_domain.
7415 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7417 struct sched_domain *sd;
7418 struct sched_group *sg;
7419 int best_idle_cpu = -1;
7420 int best_idle_cstate = INT_MAX;
7421 unsigned long best_idle_capacity = ULONG_MAX;
7423 schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
7424 schedstat_inc(this_rq(), eas_stats.sis_attempts);
7426 if (!sysctl_sched_cstate_aware) {
7427 if (idle_cpu(target)) {
7428 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
7429 schedstat_inc(this_rq(), eas_stats.sis_idle);
7434 * If the previous cpu is cache affine and idle, don't be stupid.
7436 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
7437 schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
7438 schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
7443 if (!(current->flags & PF_WAKE_UP_IDLE) &&
7444 !(p->flags & PF_WAKE_UP_IDLE))
7448 * Otherwise, iterate the domains and find an eligible idle cpu.
7450 sd = rcu_dereference(per_cpu(sd_llc, target));
7451 for_each_lower_domain(sd) {
7455 if (!cpumask_intersects(sched_group_cpus(sg),
7456 tsk_cpus_allowed(p)))
7459 if (sysctl_sched_cstate_aware) {
7460 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7461 int idle_idx = idle_get_state_idx(cpu_rq(i));
7462 unsigned long new_usage = boosted_task_util(p);
7463 unsigned long capacity_orig = capacity_orig_of(i);
7465 if (new_usage > capacity_orig || !idle_cpu(i))
7468 if (i == target && new_usage <= capacity_curr_of(target)) {
7469 schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
7470 schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
7471 schedstat_inc(sd, eas_stats.sis_suff_cap);
7475 if (idle_idx < best_idle_cstate &&
7476 capacity_orig <= best_idle_capacity) {
7478 best_idle_cstate = idle_idx;
7479 best_idle_capacity = capacity_orig;
7483 for_each_cpu(i, sched_group_cpus(sg)) {
7484 if (i == target || !idle_cpu(i))
7488 target = cpumask_first_and(sched_group_cpus(sg),
7489 tsk_cpus_allowed(p));
7490 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
7491 schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
7492 schedstat_inc(sd, eas_stats.sis_idle_cpu);
7497 } while (sg != sd->groups);
7500 if (best_idle_cpu >= 0)
7501 target = best_idle_cpu;
7504 schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
7505 schedstat_inc(this_rq(), eas_stats.sis_count);
7511 * cpu_util_wake: Compute cpu utilization with any contributions from
7512 * the waking task p removed. check_for_migration() looks for a better CPU for
7513 * rq->curr. For that case we should return cpu util with contributions from
7514 * currently running task p removed.
7516 static int cpu_util_wake(int cpu, struct task_struct *p)
7518 unsigned long util, capacity;
7520 #ifdef CONFIG_SCHED_WALT
7522 * WALT does not decay idle tasks in the same manner
7523 * as PELT, so it makes little sense to subtract task
7524 * utilization from cpu utilization. Instead just use
7525 * cpu_util for this case.
7527 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7528 p->state == TASK_WAKING)
7529 return cpu_util(cpu);
7531 /* Task has no contribution or is new */
7532 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
7533 return cpu_util(cpu);
7535 capacity = capacity_orig_of(cpu);
7536 util = max_t(long, cpu_util(cpu) - task_util(p), 0);
7538 return (util >= capacity) ? capacity : util;
7541 static int start_cpu(bool boosted)
7543 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7545 return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
7548 static inline int find_best_target(struct task_struct *p, int *backup_cpu,
7549 bool boosted, bool prefer_idle)
7551 unsigned long best_idle_min_cap_orig = ULONG_MAX;
7552 unsigned long min_util = boosted_task_util(p);
7553 unsigned long target_capacity = ULONG_MAX;
7554 unsigned long min_wake_util = ULONG_MAX;
7555 unsigned long target_max_spare_cap = 0;
7556 unsigned long best_active_util = ULONG_MAX;
7557 int best_idle_cstate = INT_MAX;
7558 struct sched_domain *sd;
7559 struct sched_group *sg;
7560 int best_active_cpu = -1;
7561 int best_idle_cpu = -1;
7562 int target_cpu = -1;
7567 schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
7568 schedstat_inc(this_rq(), eas_stats.fbt_attempts);
7570 /* Find start CPU based on boost value */
7571 cpu = start_cpu(boosted);
7573 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
7574 schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
7578 /* Find SD for the start CPU */
7579 sd = rcu_dereference(per_cpu(sd_ea, cpu));
7581 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
7582 schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
7586 /* Scan CPUs in all SDs */
7589 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7590 unsigned long capacity_curr = capacity_curr_of(i);
7591 unsigned long capacity_orig = capacity_orig_of(i);
7592 unsigned long wake_util, new_util;
7597 if (walt_cpu_high_irqload(i))
7601 * p's blocked utilization is still accounted for on prev_cpu
7602 * so prev_cpu will receive a negative bias due to the double
7603 * accounting. However, the blocked utilization may be zero.
7605 wake_util = cpu_util_wake(i, p);
7606 new_util = wake_util + task_util(p);
7609 * Ensure minimum capacity to grant the required boost.
7610 * The target CPU can be already at a capacity level higher
7611 * than the one required to boost the task.
7613 new_util = max(min_util, new_util);
7614 if (new_util > capacity_orig)
7618 * Case A) Latency sensitive tasks
7620 * Unconditionally favoring tasks that prefer idle CPU to
7624 * - an idle CPU, whatever its idle_state is, since
7625 * the first CPUs we explore are more likely to be
7626 * reserved for latency sensitive tasks.
7627 * - a non idle CPU where the task fits in its current
7628 * capacity and has the maximum spare capacity.
7629 * - a non idle CPU with lower contention from other
7630 * tasks and running at the lowest possible OPP.
7632 * The last two goals try to favor a non idle CPU
7633 * where the task can run as if it is "almost alone".
7634 * A maximum spare capacity CPU is favoured since
7635 * the task already fits into that CPU's capacity
7636 * without waiting for an OPP chance.
7638 * The following code path is the only one in the CPUs
7639 * exploration loop which is always used by
7640 * prefer_idle tasks. It exits the loop with either a
7641 * best_active_cpu or a target_cpu which should
7642 * represent an optimal choice for latency sensitive
7648 * Case A.1: IDLE CPU
7649 * Return the first IDLE CPU we find.
7652 schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
7653 schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
7655 trace_sched_find_best_target(p,
7656 prefer_idle, min_util,
7658 best_active_cpu, i);
7664 * Case A.2: Target ACTIVE CPU
7665 * Favor CPUs with max spare capacity.
7667 if ((capacity_curr > new_util) &&
7668 (capacity_orig - new_util > target_max_spare_cap)) {
7669 target_max_spare_cap = capacity_orig - new_util;
7673 if (target_cpu != -1)
7678 * Case A.3: Backup ACTIVE CPU
7680 * - lower utilization due to other tasks
7681 * - lower utilization with the task in
7683 if (wake_util > min_wake_util)
7685 if (new_util > best_active_util)
7687 min_wake_util = wake_util;
7688 best_active_util = new_util;
7689 best_active_cpu = i;
7696 * For non latency sensitive tasks, skip CPUs that
7697 * will be overutilized by moving the task there.
7699 * The goal here is to remain in EAS mode as long as
7700 * possible at least for !prefer_idle tasks.
7702 if ((new_util * capacity_margin) >
7703 (capacity_orig * SCHED_CAPACITY_SCALE))
7707 * Case B) Non latency sensitive tasks on IDLE CPUs.
7709 * Find an optimal backup IDLE CPU for non latency
7713 * - minimizing the capacity_orig,
7714 * i.e. preferring LITTLE CPUs
7715 * - favoring shallowest idle states
7716 * i.e. avoid waking up deep-idle CPUs
7718 * The following code path is used by non latency
7719 * sensitive tasks if IDLE CPUs are available. If at
7720 * least one such CPU is available, it sets the
7721 * best_idle_cpu to the most suitable idle CPU to be
7724 * If idle CPUs are available, favour these CPUs to
7725 * improve performance by spreading tasks.
7726 * Indeed, the energy_diff() computed by the caller
7727 * will take care to ensure the minimization of energy
7728 * consumption without affecting performance.
7731 int idle_idx = idle_get_state_idx(cpu_rq(i));
7733 /* Select idle CPU with lower cap_orig */
7734 if (capacity_orig > best_idle_min_cap_orig)
7738 * Skip CPUs in deeper idle state, but only
7739 * if they are also less energy efficient.
7740 * IOW, prefer a deep IDLE LITTLE CPU vs a
7741 * shallow idle big CPU.
7743 if (sysctl_sched_cstate_aware &&
7744 best_idle_cstate <= idle_idx)
7747 /* Keep track of best idle CPU */
7748 best_idle_min_cap_orig = capacity_orig;
7749 best_idle_cstate = idle_idx;
7755 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
7757 * Pack tasks in the most energy efficient capacities.
7759 * This task packing strategy prefers more energy
7760 * efficient CPUs (i.e. pack on smaller maximum
7761 * capacity CPUs) while also trying to spread tasks to
7762 * run them all at the lower OPP.
7764 * This assumes for example that it's more energy
7765 * efficient to run two tasks on two CPUs at a lower
7766 * OPP than packing both on a single CPU but running
7767 * that CPU at a higher OPP.
7769 * Thus, this case keeps track of the CPU with the
7770 * smallest maximum capacity and highest spare maximum
7774 /* Favor CPUs with smaller capacity */
7775 if (capacity_orig > target_capacity)
7778 /* Favor CPUs with maximum spare capacity */
7779 if ((capacity_orig - new_util) < target_max_spare_cap)
7782 target_max_spare_cap = capacity_orig - new_util;
7783 target_capacity = capacity_orig;
7787 } while (sg = sg->next, sg != sd->groups);
7790 * For non latency sensitive tasks, cases B and C in the previous loop,
7791 * we pick the best IDLE CPU only if we were not able to find a target
7794 * Policies priorities:
7796 * - prefer_idle tasks:
7798 * a) IDLE CPU available, we return immediately
7799 * b) ACTIVE CPU where task fits and has the largest maximum spare
7800 * capacity (i.e. target_cpu)
7801 * c) ACTIVE CPU with less contention due to other tasks
7802 * (i.e. best_active_cpu)
7804 * - NON prefer_idle tasks:
7806 * a) ACTIVE CPU: target_cpu
7807 * b) IDLE CPU: best_idle_cpu
7809 if (target_cpu == -1)
7810 target_cpu = prefer_idle
7814 *backup_cpu = prefer_idle
7818 trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
7819 best_idle_cpu, best_active_cpu,
7822 schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
7823 schedstat_inc(this_rq(), eas_stats.fbt_count);
7829 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
7830 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
7832 * In that case WAKE_AFFINE doesn't make sense and we'll let
7833 * BALANCE_WAKE sort things out.
7835 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
7837 long min_cap, max_cap;
7839 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
7840 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7842 /* Minimum capacity is close to max, no need to abort wake_affine */
7843 if (max_cap - min_cap < max_cap >> 3)
7846 /* Bring task utilization in sync with prev_cpu */
7847 sync_entity_load_avg(&p->se);
7849 return min_cap * 1024 < task_util(p) * capacity_margin;
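/*
 * Worked example (values assumed, capacity_margin = 1280): on an
 * asymmetric system with min_cap = 512 and max_cap = 1024 the gap of 512
 * exceeds max_cap >> 3 = 128, so the utilization check applies. A task
 * with util 350 still passes (350 * 1280 = 448000 < 512 * 1024 = 524288)
 * and wake_affine is kept; util 450 fails (576000 > 524288) and we fall
 * back to the capacity-aware slow path.
 */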
7852 static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
7854 struct sched_domain *sd;
7855 int target_cpu = prev_cpu, tmp_target, tmp_backup;
7856 bool boosted, prefer_idle;
7858 schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
7859 schedstat_inc(this_rq(), eas_stats.secb_attempts);
7861 if (sysctl_sched_sync_hint_enable && sync) {
7862 int cpu = smp_processor_id();
7864 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
7865 schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
7866 schedstat_inc(this_rq(), eas_stats.secb_sync);
7872 #ifdef CONFIG_CGROUP_SCHEDTUNE
7873 boosted = schedtune_task_boost(p) > 0;
7874 prefer_idle = schedtune_prefer_idle(p) > 0;
7876 boosted = get_sysctl_sched_cfs_boost() > 0;
7880 sync_entity_load_avg(&p->se);
7882 sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
7883 /* Find a cpu with sufficient capacity */
7884 tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
7888 if (tmp_target >= 0) {
7889 target_cpu = tmp_target;
7890 if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
7891 schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
7892 schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
7897 if (target_cpu != prev_cpu) {
7899 struct energy_env eenv = {
7900 .util_delta = task_util(p),
7901 .src_cpu = prev_cpu,
7902 .dst_cpu = target_cpu,
7904 .trg_cpu = target_cpu,
7908 #ifdef CONFIG_SCHED_WALT
7909 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7910 p->state == TASK_WAKING)
7911 delta = task_util(p);
7913 /* Not enough spare capacity on previous cpu */
7914 if (__cpu_overutilized(prev_cpu, delta)) {
7915 schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
7916 schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
7920 if (energy_diff(&eenv) >= 0) {
7921 /* No energy saving for target_cpu, try backup */
7922 target_cpu = tmp_backup;
7923 eenv.dst_cpu = target_cpu;
7924 eenv.trg_cpu = target_cpu;
7925 if (tmp_backup < 0 ||
7926 tmp_backup == prev_cpu ||
7927 energy_diff(&eenv) >= 0) {
7928 schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
7929 schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
7930 target_cpu = prev_cpu;
7935 schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
7936 schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
7940 schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
7941 schedstat_inc(this_rq(), eas_stats.secb_count);
7950 * select_task_rq_fair: Select target runqueue for the waking task in domains
7951 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
7952 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
7954 * Balances load by selecting the idlest cpu in the idlest group, or under
7955 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
7957 * Returns the target cpu number.
7959 * preempt must be disabled.
7962 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7963 int sibling_count_hint)
7965 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
7966 int cpu = smp_processor_id();
7967 int new_cpu = prev_cpu;
7968 int want_affine = 0;
7969 int sync = wake_flags & WF_SYNC;
7971 #ifdef CONFIG_SCHED_HMP
7972 return select_best_cpu(p, prev_cpu, 0, sync);
7975 if (sd_flag & SD_BALANCE_WAKE) {
7977 want_affine = !wake_wide(p, sibling_count_hint) &&
7978 !wake_cap(p, cpu, prev_cpu) &&
7979 cpumask_test_cpu(cpu, &p->cpus_allowed);
7982 if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
7983 return select_energy_cpu_brute(p, prev_cpu, sync);
7986 for_each_domain(cpu, tmp) {
7987 if (!(tmp->flags & SD_LOAD_BALANCE))
7991 * If both cpu and prev_cpu are part of this domain,
7992 * cpu is a valid SD_WAKE_AFFINE target.
7994 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
7995 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8000 if (tmp->flags & sd_flag)
8002 else if (!want_affine)
8007 sd = NULL; /* Prefer wake_affine over balance flags */
8008 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
8012 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
8014 * We're going to need the task's util for capacity_spare_wake
8015 * in find_idlest_group. Sync it up to prev_cpu's
8018 sync_entity_load_avg(&p->se);
8022 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
8023 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8026 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
8034 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
8035 * cfs_rq_of(p) references at time of call are still valid and identify the
8036 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
8037 * other assumptions, including the state of rq->lock, should be made.
8039 static void migrate_task_rq_fair(struct task_struct *p)
8042 * We are supposed to update the task to "current" time, then it's up to date
8043 * and ready to go to a new CPU/cfs_rq. But we have difficulty in getting
8044 * what the current time is, so simply throw away the out-of-date time. This
8045 * will result in the wakee task being less decayed, but giving the wakee more
8046 * load does not sound bad.
8048 remove_entity_load_avg(&p->se);
8050 /* Tell new CPU we are migrated */
8051 p->se.avg.last_update_time = 0;
8053 /* We have migrated, no longer consider this task hot */
8054 p->se.exec_start = 0;
8057 static void task_dead_fair(struct task_struct *p)
8059 remove_entity_load_avg(&p->se);
8062 #define task_fits_max(p, cpu) true
8063 #endif /* CONFIG_SMP */
8065 static unsigned long
8066 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
8068 unsigned long gran = sysctl_sched_wakeup_granularity;
8071 * Since it's curr that is running now, convert the gran from real-time
8072 * to virtual-time in its units.
8074 * By using 'se' instead of 'curr' we penalize light tasks, so
8075 * they get preempted easier. That is, if 'se' < 'curr' then
8076 * the resulting gran will be larger, therefore penalizing the
8077 * lighter, if otoh 'se' > 'curr' then the resulting gran will
8078 * be smaller, again penalizing the lighter task.
8080 * This is especially important for buddies when the leftmost
8081 * task is higher priority than the buddy.
8083 return calc_delta_fair(gran, se);
8087 * Should 'se' preempt 'curr'.
8101 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
8103 s64 gran, vdiff = curr->vruntime - se->vruntime;
8108 gran = wakeup_gran(curr, se);
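/*
 * Worked example (values assumed): for a nice-0 wakee calc_delta_fair()
 * leaves the granularity unchanged, so with a 1ms wakeup granularity a
 * vruntime lead of vdiff = 1.5ms lets the wakee preempt (vdiff > gran),
 * while vdiff = 0.5ms does not, avoiding over-scheduling. A lighter
 * (higher nice) wakee gets a larger scaled gran and therefore needs a
 * bigger vruntime lead before it may preempt.
 */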
8115 static void set_last_buddy(struct sched_entity *se)
8117 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8120 for_each_sched_entity(se)
8121 cfs_rq_of(se)->last = se;
8124 static void set_next_buddy(struct sched_entity *se)
8126 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8129 for_each_sched_entity(se)
8130 cfs_rq_of(se)->next = se;
8133 static void set_skip_buddy(struct sched_entity *se)
8135 for_each_sched_entity(se)
8136 cfs_rq_of(se)->skip = se;
8140 * Preempt the current task with a newly woken task if needed:
8142 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
8144 struct task_struct *curr = rq->curr;
8145 struct sched_entity *se = &curr->se, *pse = &p->se;
8146 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8147 int scale = cfs_rq->nr_running >= sched_nr_latency;
8148 int next_buddy_marked = 0;
8150 if (unlikely(se == pse))
8154 * This is possible from callers such as attach_tasks(), in which we
8155 * unconditionally check_preempt_curr() after an enqueue (which may have
8156 * led to a throttle). This both saves work and prevents false
8157 * next-buddy nomination below.
8159 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8162 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
8163 set_next_buddy(pse);
8164 next_buddy_marked = 1;
8168 * We can come here with TIF_NEED_RESCHED already set from new task
8171 * Note: this also catches the edge-case of curr being in a throttled
8172 * group (e.g. via set_curr_task), since update_curr() (in the
8173 * enqueue of curr) will have resulted in resched being set. This
8174 * prevents us from potentially nominating it as a false LAST_BUDDY
8177 if (test_tsk_need_resched(curr))
8180 /* Idle tasks are by definition preempted by non-idle tasks. */
8181 if (unlikely(curr->policy == SCHED_IDLE) &&
8182 likely(p->policy != SCHED_IDLE))
8186 * Batch and idle tasks do not preempt non-idle tasks (their preemption
8187 * is driven by the tick):
8189 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8192 find_matching_se(&se, &pse);
8193 update_curr(cfs_rq_of(se));
8195 if (wakeup_preempt_entity(se, pse) == 1) {
8197 * Bias pick_next to pick the sched entity that is
8198 * triggering this preemption.
8200 if (!next_buddy_marked)
8201 set_next_buddy(pse);
8210 * Only set the backward buddy when the current task is still
8211 * on the rq. This can happen when a wakeup gets interleaved
8212 * with schedule on the ->pre_schedule() or idle_balance()
8213 * point, either of which can drop the rq lock.
8215 * Also, during early boot the idle thread is in the fair class,
8216 * for obvious reasons it's a bad idea to schedule back to it.
8218 if (unlikely(!se->on_rq || curr == rq->idle))
8221 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
8225 static struct task_struct *
8226 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
8228 struct cfs_rq *cfs_rq = &rq->cfs;
8229 struct sched_entity *se;
8230 struct task_struct *p;
8234 #ifdef CONFIG_FAIR_GROUP_SCHED
8235 if (!cfs_rq->nr_running)
8238 if (prev->sched_class != &fair_sched_class)
8242 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8243 * likely that a next task is from the same cgroup as the current.
8245 * Therefore attempt to avoid putting and setting the entire cgroup
8246 * hierarchy, only change the part that actually changes.
8250 struct sched_entity *curr = cfs_rq->curr;
8253 * Since we got here without doing put_prev_entity() we also
8254 * have to consider cfs_rq->curr. If it is still a runnable
8255 * entity, update_curr() will update its vruntime, otherwise
8256 * forget we've ever seen it.
8260 update_curr(cfs_rq);
8265 * This call to check_cfs_rq_runtime() will do the
8266 * throttle and dequeue its entity in the parent(s).
8267 * Therefore the 'simple' nr_running test will indeed
8270 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8274 se = pick_next_entity(cfs_rq, curr);
8275 cfs_rq = group_cfs_rq(se);
8281 * Since we haven't yet done put_prev_entity and if the selected task
8282 * is a different task than we started out with, try and touch the
8283 * least amount of cfs_rqs.
8286 struct sched_entity *pse = &prev->se;
8288 while (!(cfs_rq = is_same_group(se, pse))) {
8289 int se_depth = se->depth;
8290 int pse_depth = pse->depth;
8292 if (se_depth <= pse_depth) {
8293 put_prev_entity(cfs_rq_of(pse), pse);
8294 pse = parent_entity(pse);
8296 if (se_depth >= pse_depth) {
8297 set_next_entity(cfs_rq_of(se), se);
8298 se = parent_entity(se);
8302 put_prev_entity(cfs_rq, pse);
8303 set_next_entity(cfs_rq, se);
8306 if (hrtick_enabled(rq))
8307 hrtick_start_fair(rq, p);
8309 rq->misfit_task = !task_fits_max(p, rq->cpu);
8316 if (!cfs_rq->nr_running)
8319 put_prev_task(rq, prev);
8322 se = pick_next_entity(cfs_rq, NULL);
8323 set_next_entity(cfs_rq, se);
8324 cfs_rq = group_cfs_rq(se);
8329 if (hrtick_enabled(rq))
8330 hrtick_start_fair(rq, p);
8332 rq->misfit_task = !task_fits_max(p, rq->cpu);
8337 rq->misfit_task = 0;
8339 * This is OK, because current is on_cpu, which avoids it being picked
8340 * for load-balance and preemption/IRQs are still disabled avoiding
8341 * further scheduler activity on it and we're being very careful to
8342 * re-start the picking loop.
8344 lockdep_unpin_lock(&rq->lock);
8345 new_tasks = idle_balance(rq);
8346 lockdep_pin_lock(&rq->lock);
8348 * Because idle_balance() releases (and re-acquires) rq->lock, it is
8349 * possible for any higher priority task to appear. In that case we
8350 * must re-start the pick_next_entity() loop.
8362 * Account for a descheduled task:
8364 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8366 struct sched_entity *se = &prev->se;
8367 struct cfs_rq *cfs_rq;
8369 for_each_sched_entity(se) {
8370 cfs_rq = cfs_rq_of(se);
8371 put_prev_entity(cfs_rq, se);
8376 * sched_yield() is very simple
8378 * The magic of dealing with the ->skip buddy is in pick_next_entity.
8380 static void yield_task_fair(struct rq *rq)
8382 struct task_struct *curr = rq->curr;
8383 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8384 struct sched_entity *se = &curr->se;
8387 * Are we the only task in the tree?
8389 if (unlikely(rq->nr_running == 1))
8392 clear_buddies(cfs_rq, se);
8394 if (curr->policy != SCHED_BATCH) {
8395 update_rq_clock(rq);
8397 * Update run-time statistics of the 'current'.
8399 update_curr(cfs_rq);
8401 * Tell update_rq_clock() that we've just updated,
8402 * so we don't do microscopic update in schedule()
8403 * and double the fastpath cost.
8405 rq_clock_skip_update(rq, true);
8411 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
8413 struct sched_entity *se = &p->se;
8415 /* throttled hierarchies are not runnable */
8416 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8419 /* Tell the scheduler that we'd really like pse to run next. */
8422 yield_task_fair(rq);
8428 /**************************************************
8429 * Fair scheduling class load-balancing methods.
8433 * The purpose of load-balancing is to achieve the same basic fairness the
8434 * per-cpu scheduler provides, namely provide a proportional amount of compute
8435 * time to each task. This is expressed in the following equation:
8437 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8439 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
8440 * W_i,0 is defined as:
8442 * W_i,0 = \Sum_j w_i,j (2)
8444 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
8445 * is derived from the nice value as per prio_to_weight[].
8447 * The weight average is an exponential decay average of the instantaneous
8450 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8452 * C_i is the compute capacity of cpu i, typically it is the
8453 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8454 * can also include other factors [XXX].
8456 * To achieve this balance we define a measure of imbalance which follows
8457 * directly from (1):
8459 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
8461 * We then move tasks around to minimize the imbalance. In the continuous
8462 * function space it is obvious this converges, in the discrete case we get
8463 * a few fun cases generally called infeasible weight scenarios.
8466 * - infeasible weights;
8467 * - local vs global optima in the discrete case. ]
8472 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8473 * for all i,j solution, we create a tree of cpus that follows the hardware
8474 * topology where each level pairs two lower groups (or better). This results
8475 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
8476 * tree to only the first of the previous level and we decrease the frequency
8477 * of load-balance at each level inv. proportional to the number of cpus in
8483 *   \Sum_{i=0}^{log_2 n} { 1/2^i * n/2^i * 2^i } = O(n)               (5)
8485 *                            |       |       `- size of each group
8486 *                            |       `- number of cpus doing load-balance
8488 *                            `- frequency of balancing at that level; the sum runs over all levels
8490 * Coupled with a limit on how many tasks we can migrate every balance pass,
8491 * this makes (5) the runtime complexity of the balancer.
8493 * An important property here is that each CPU is still (indirectly) connected
8494 * to every other cpu in at most O(log n) steps:
8496 * The adjacency matrix of the resulting graph is given by:
8499 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
8502 * And you'll find that:
8504 * A^(log_2 n)_i,j != 0 for all i,j (7)
8506 * Showing there's indeed a path between every cpu in at most O(log n) steps.
8507 * The task movement gives a factor of O(m), giving a convergence complexity
8510 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
8515 * In order to avoid CPUs going idle while there's still work to do, new idle
8516 * balancing is more aggressive and has the newly idle cpu iterate up the domain
8517 * tree itself instead of relying on other CPUs to bring it work.
8519 * This adds some complexity to both (5) and (8) but it reduces the total idle
8527 * Cgroups make a horror show out of (2), instead of a simple sum we get:
8530 *   W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k)                        (9)
8535 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
8537 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
8539 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
8542 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8543 * rewrite all of this once again.]
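/*
 * A quick numeric check of (5) above for n = 8 cpus: each level-i term is
 * (1/2^i) * (n/2^i) * 2^i = n/2^i, so the sum is 8 + 4 + 2 + 1 = 15 < 2n,
 * confirming the O(n) cost per full balance pass claimed above.
 */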
8546 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8548 enum fbq_type { regular, remote, all };
8557 #define LBF_ALL_PINNED 0x01
8558 #define LBF_NEED_BREAK 0x02
8559 #define LBF_DST_PINNED 0x04
8560 #define LBF_SOME_PINNED 0x08
8561 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
8562 #define LBF_IGNORE_BIG_TASKS 0x100
8563 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
8564 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
8567 struct sched_domain *sd;
8575 struct cpumask *dst_grpmask;
8577 enum cpu_idle_type idle;
8579 unsigned int src_grp_nr_running;
8580 /* The set of CPUs under consideration for load-balancing */
8581 struct cpumask *cpus;
8582 unsigned int busiest_grp_capacity;
8583 unsigned int busiest_nr_running;
8588 unsigned int loop_break;
8589 unsigned int loop_max;
8591 enum fbq_type fbq_type;
8592 enum group_type busiest_group_type;
8593 struct list_head tasks;
8594 enum sched_boost_policy boost_policy;
8598 * Is this task likely cache-hot:
8600 static int task_hot(struct task_struct *p, struct lb_env *env)
8604 lockdep_assert_held(&env->src_rq->lock);
8606 if (p->sched_class != &fair_sched_class)
8609 if (unlikely(p->policy == SCHED_IDLE))
8613 * Buddy candidates are cache hot:
8615 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8616 (&p->se == cfs_rq_of(&p->se)->next ||
8617 &p->se == cfs_rq_of(&p->se)->last))
8620 if (sysctl_sched_migration_cost == -1)
8622 if (sysctl_sched_migration_cost == 0)
8625 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
8627 return delta < (s64)sysctl_sched_migration_cost;
8630 #ifdef CONFIG_NUMA_BALANCING
8632 * Returns 1, if task migration degrades locality
8633 * Returns 0, if task migration improves locality i.e migration preferred.
8634 * Returns 0, if task migration improves locality, i.e. migration is preferred.
8636 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8638 struct numa_group *numa_group = rcu_dereference(p->numa_group);
8639 unsigned long src_faults, dst_faults;
8640 int src_nid, dst_nid;
8642 if (!static_branch_likely(&sched_numa_balancing))
8645 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8648 src_nid = cpu_to_node(env->src_cpu);
8649 dst_nid = cpu_to_node(env->dst_cpu);
8651 if (src_nid == dst_nid)
8654 /* Migrating away from the preferred node is always bad. */
8655 if (src_nid == p->numa_preferred_nid) {
8656 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8662 /* Encourage migration to the preferred node. */
8663 if (dst_nid == p->numa_preferred_nid)
8667 src_faults = group_faults(p, src_nid);
8668 dst_faults = group_faults(p, dst_nid);
8670 src_faults = task_faults(p, src_nid);
8671 dst_faults = task_faults(p, dst_nid);
8674 return dst_faults < src_faults;
8678 static inline int migrate_degrades_locality(struct task_struct *p,
8686 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8689 int can_migrate_task(struct task_struct *p, struct lb_env *env)
8692 int twf, group_cpus;
8694 lockdep_assert_held(&env->src_rq->lock);
8697 * We do not migrate tasks that are:
8698 * 1) throttled_lb_pair, or
8699 * 2) cannot be migrated to this CPU due to cpus_allowed, or
8700 * 3) running (obviously), or
8701 * 4) are cache-hot on their current CPU.
8703 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
8706 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
8709 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
8711 env->flags |= LBF_SOME_PINNED;
8714 * Remember if this task can be migrated to any other cpu in
8715 * our sched_group. We may want to revisit it if we couldn't
8716 * meet load balance goals by pulling other tasks on src_cpu.
8718 * Also avoid computing new_dst_cpu if we have already computed
8719 * one in current iteration.
8721 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
8724 /* Prevent re-selecting dst_cpu via env's cpus */
8725 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8726 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
8727 env->flags |= LBF_DST_PINNED;
8728 env->new_dst_cpu = cpu;
8736 /* Record that we found at least one task that could run on dst_cpu */
8737 env->flags &= ~LBF_ALL_PINNED;
8739 if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
8740 if (nr_big_tasks(env->src_rq) && !is_big_task(p))
8743 if (env->boost_policy == SCHED_BOOST_ON_BIG &&
8744 !task_sched_boost(p))
8748 twf = task_will_fit(p, env->dst_cpu);
8751 * Attempt to not pull tasks that don't fit. We may get lucky and find
8752 * one that actually fits.
8754 if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
8757 if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
8758 !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
8762 * Group imbalance can sometimes cause work to be pulled across groups
8763 * even though the group could have managed the imbalance on its own.
8764 * Prevent inter-cluster migrations for big tasks when the number of
8765 * tasks is lower than the capacity of the group.
8767 group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
8768 SCHED_CAPACITY_SCALE);
8769 if (!twf && env->busiest_nr_running <= group_cpus)
8772 if (task_running(env->src_rq, p)) {
8773 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
8778 * Aggressive migration if:
8779 * 1) IDLE or NEWLY_IDLE balance.
8780 * 2) destination numa is preferred
8781 * 3) task is cache cold, or
8782 * 4) too many balance attempts have failed.
8784 tsk_cache_hot = migrate_degrades_locality(p, env);
8785 if (tsk_cache_hot == -1)
8786 tsk_cache_hot = task_hot(p, env);
8788 if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
8789 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
8790 if (tsk_cache_hot == 1) {
8791 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
8792 schedstat_inc(p, se.statistics.nr_forced_migrations);
8797 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
8802 * detach_task() -- detach the task for the migration specified in env
8804 static void detach_task(struct task_struct *p, struct lb_env *env)
8806 lockdep_assert_held(&env->src_rq->lock);
8808 p->on_rq = TASK_ON_RQ_MIGRATING;
8809 deactivate_task(env->src_rq, p, 0);
8810 double_lock_balance(env->src_rq, env->dst_rq);
8811 set_task_cpu(p, env->dst_cpu);
8812 if (task_in_related_thread_group(p))
8813 env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
8814 double_unlock_balance(env->src_rq, env->dst_rq);
8818 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
8819 * part of active balancing operations within "domain".
8821 * Returns a task if successful and NULL otherwise.
8823 static struct task_struct *detach_one_task(struct lb_env *env)
8825 struct task_struct *p, *n;
8827 lockdep_assert_held(&env->src_rq->lock);
8829 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
8830 if (!can_migrate_task(p, env))
8833 detach_task(p, env);
8836 * Right now, this is only the second place where
8837 * lb_gained[env->idle] is updated (other is detach_tasks)
8838 * so we can safely collect stats here rather than
8839 * inside detach_tasks().
8841 schedstat_inc(env->sd, lb_gained[env->idle]);
8848 static const unsigned int sched_nr_migrate_break = 32;
8851 * detach_tasks() -- tries to detach up to imbalance weighted load from
8852 * busiest_rq, as part of a balancing operation within domain "sd".
8854 * Returns number of detached tasks if successful and 0 otherwise.
8856 static int detach_tasks(struct lb_env *env)
8858 struct list_head *tasks = &env->src_rq->cfs_tasks;
8859 struct task_struct *p;
8862 int orig_loop = env->loop;
8864 lockdep_assert_held(&env->src_rq->lock);
8866 if (env->imbalance <= 0)
8869 if (!same_cluster(env->dst_cpu, env->src_cpu))
8870 env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
8872 if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
8873 env->flags |= LBF_IGNORE_BIG_TASKS;
8876 while (!list_empty(tasks)) {
8878 * We don't want to steal all, otherwise we may be treated likewise,
8879 * which could at worst lead to a livelock crash.
8881 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
8884 p = list_first_entry(tasks, struct task_struct, se.group_node);
8887 /* We've more or less seen every task there is, call it quits */
8888 if (env->loop > env->loop_max)
8891 /* take a breather every nr_migrate tasks */
8892 if (env->loop > env->loop_break) {
8893 env->loop_break += sched_nr_migrate_break;
8894 env->flags |= LBF_NEED_BREAK;
8898 if (!can_migrate_task(p, env))
8901 load = task_h_load(p);
8903 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8906 if ((load / 2) > env->imbalance)
8909 detach_task(p, env);
8910 list_add(&p->se.group_node, &env->tasks);
8913 env->imbalance -= load;
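/*
 * Worked example with made-up numbers: with env->imbalance = 1000, a
 * task whose task_h_load() is 300 is detached and the remaining
 * imbalance drops to 700, whereas a task with load 2500 is skipped
 * above because 2500 / 2 exceeds the outstanding imbalance.
 */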
8915 #ifdef CONFIG_PREEMPT
8917 * NEWIDLE balancing is a source of latency, so preemptible
8918 * kernels will stop after the first task is detached to minimize
8919 * the critical section.
8921 if (env->idle == CPU_NEWLY_IDLE)
8926 * We only want to steal up to the prescribed amount of weighted load.
8929 if (env->imbalance <= 0)
8934 list_move_tail(&p->se.group_node, tasks);
8937 if (env->flags & (LBF_IGNORE_BIG_TASKS |
8938 LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
8939 tasks = &env->src_rq->cfs_tasks;
8940 env->flags &= ~(LBF_IGNORE_BIG_TASKS |
8941 LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
8942 env->loop = orig_loop;
8947 * Right now, this is one of only two places we collect this stat
8948 * so we can safely collect detach_one_task() stats here rather
8949 * than inside detach_one_task().
8951 schedstat_add(env->sd, lb_gained[env->idle], detached);
8957 * attach_task() -- attach the task detached by detach_task() to its new rq.
8959 static void attach_task(struct rq *rq, struct task_struct *p)
8961 lockdep_assert_held(&rq->lock);
8963 BUG_ON(task_rq(p) != rq);
8964 activate_task(rq, p, 0);
8965 p->on_rq = TASK_ON_RQ_QUEUED;
8966 check_preempt_curr(rq, p, 0);
8970 * attach_one_task() -- attaches the task returned from detach_one_task() to its new rq.
8973 static void attach_one_task(struct rq *rq, struct task_struct *p)
8975 raw_spin_lock(&rq->lock);
8977 raw_spin_unlock(&rq->lock);
8981 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their new rq.
8984 static void attach_tasks(struct lb_env *env)
8986 struct list_head *tasks = &env->tasks;
8987 struct task_struct *p;
8989 raw_spin_lock(&env->dst_rq->lock);
8991 while (!list_empty(tasks)) {
8992 p = list_first_entry(tasks, struct task_struct, se.group_node);
8993 list_del_init(&p->se.group_node);
8995 attach_task(env->dst_rq, p);
8998 raw_spin_unlock(&env->dst_rq->lock);
9001 #ifdef CONFIG_FAIR_GROUP_SCHED
9002 static void update_blocked_averages(int cpu)
9004 struct rq *rq = cpu_rq(cpu);
9005 struct cfs_rq *cfs_rq;
9006 unsigned long flags;
9008 raw_spin_lock_irqsave(&rq->lock, flags);
9009 update_rq_clock(rq);
9012 * Iterates the task_group tree in a bottom up fashion, see
9013 * list_add_leaf_cfs_rq() for details.
9015 for_each_leaf_cfs_rq(rq, cfs_rq) {
9016 /* throttled entities do not contribute to load */
9017 if (throttled_hierarchy(cfs_rq))
9020 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
9022 update_tg_load_avg(cfs_rq, 0);
9024 /* Propagate pending load changes to the parent */
9025 if (cfs_rq->tg->se[cpu])
9026 update_load_avg(cfs_rq->tg->se[cpu], 0);
9028 raw_spin_unlock_irqrestore(&rq->lock, flags);
9032 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9033 * This needs to be done in a top-down fashion because the load of a child
9034 * group is a fraction of its parent's load.
9036 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9038 struct rq *rq = rq_of(cfs_rq);
9039 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9040 unsigned long now = jiffies;
9043 if (cfs_rq->last_h_load_update == now)
9046 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9047 for_each_sched_entity(se) {
9048 cfs_rq = cfs_rq_of(se);
9049 WRITE_ONCE(cfs_rq->h_load_next, se);
9050 if (cfs_rq->last_h_load_update == now)
9055 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9056 cfs_rq->last_h_load_update = now;
9059 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9060 load = cfs_rq->h_load;
9061 load = div64_ul(load * se->avg.load_avg,
9062 cfs_rq_load_avg(cfs_rq) + 1);
9063 cfs_rq = group_cfs_rq(se);
9064 cfs_rq->h_load = load;
9065 cfs_rq->last_h_load_update = now;
9069 static unsigned long task_h_load(struct task_struct *p)
9071 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9073 update_cfs_rq_h_load(cfs_rq);
9074 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9075 cfs_rq_load_avg(cfs_rq) + 1);
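/*
 * Worked example (illustrative numbers only): a task with
 * se.avg.load_avg = 300 on a group cfs_rq whose h_load is 512 and
 * whose load_avg is 1024 yields task_h_load() = 300 * 512 / (1024 + 1)
 * ~= 149, i.e. the task's load is scaled by the share of rq load its
 * group hierarchy actually receives.
 */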
9078 static inline void update_blocked_averages(int cpu)
9080 struct rq *rq = cpu_rq(cpu);
9081 struct cfs_rq *cfs_rq = &rq->cfs;
9082 unsigned long flags;
9084 raw_spin_lock_irqsave(&rq->lock, flags);
9085 update_rq_clock(rq);
9086 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
9087 raw_spin_unlock_irqrestore(&rq->lock, flags);
9090 static unsigned long task_h_load(struct task_struct *p)
9092 return p->se.avg.load_avg;
9096 /********** Helpers for find_busiest_group ************************/
9099 * sg_lb_stats - stats of a sched_group required for load_balancing
9101 struct sg_lb_stats {
9102 unsigned long avg_load; /* Avg load across the CPUs of the group */
9103 unsigned long group_load; /* Total load over the CPUs of the group */
9104 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
9105 unsigned long load_per_task;
9106 unsigned long group_capacity;
9107 unsigned long group_util; /* Total utilization of the group */
9108 unsigned int sum_nr_running; /* Nr tasks running in the group */
9109 #ifdef CONFIG_SCHED_HMP
9110 unsigned long sum_nr_big_tasks;
9111 u64 group_cpu_load; /* Scaled load of all CPUs of the group */
9113 unsigned int idle_cpus;
9114 unsigned int group_weight;
9115 enum group_type group_type;
9116 int group_no_capacity;
9117 int group_misfit_task; /* A cpu has a task too big for its capacity */
9118 #ifdef CONFIG_NUMA_BALANCING
9119 unsigned int nr_numa_running;
9120 unsigned int nr_preferred_running;
9125 * sd_lb_stats - Structure to store the statistics of a sched_domain
9126 * during load balancing.
9128 struct sd_lb_stats {
9129 struct sched_group *busiest; /* Busiest group in this sd */
9130 struct sched_group *local; /* Local group in this sd */
9131 unsigned long total_load; /* Total load of all groups in sd */
9132 unsigned long total_capacity; /* Total capacity of all groups in sd */
9133 unsigned long avg_load; /* Average load across all groups in sd */
9135 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
9136 struct sg_lb_stats local_stat; /* Statistics of the local group */
9139 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9142 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9143 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9144 * We must however clear busiest_stat::avg_load because
9145 * update_sd_pick_busiest() reads this before assignment.
9147 *sds = (struct sd_lb_stats){
9151 .total_capacity = 0UL,
9154 .sum_nr_running = 0,
9155 .group_type = group_other,
9156 #ifdef CONFIG_SCHED_HMP
9157 .sum_nr_big_tasks = 0UL,
9158 .group_cpu_load = 0ULL,
9164 #ifdef CONFIG_SCHED_HMP
9167 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9169 int local_cpu, busiest_cpu;
9170 int local_capacity, busiest_capacity;
9171 int local_pwr_cost, busiest_pwr_cost;
9173 int boost = sched_boost();
9175 if (!sysctl_sched_restrict_cluster_spill ||
9176 boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
9179 local_cpu = group_first_cpu(sds->local);
9180 busiest_cpu = group_first_cpu(sds->busiest);
9182 local_capacity = cpu_max_possible_capacity(local_cpu);
9183 busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
9185 local_pwr_cost = cpu_max_power_cost(local_cpu);
9186 busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
9188 if (local_pwr_cost <= busiest_pwr_cost)
9191 if (local_capacity > busiest_capacity &&
9192 sds->busiest_stat.sum_nr_big_tasks)
9195 nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
9196 if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
9197 (sds->busiest_stat.sum_nr_running <
9198 nr_cpus * sysctl_sched_spill_nr_run))
9204 #else /* CONFIG_SCHED_HMP */
9207 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9212 #endif /* CONFIG_SCHED_HMP */
9215 * get_sd_load_idx - Obtain the load index for a given sched domain.
9216 * @sd: The sched_domain whose load_idx is to be obtained.
9217 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
9219 * Return: The load index.
9221 static inline int get_sd_load_idx(struct sched_domain *sd,
9222 enum cpu_idle_type idle)
9228 load_idx = sd->busy_idx;
9231 case CPU_NEWLY_IDLE:
9232 load_idx = sd->newidle_idx;
9235 load_idx = sd->idle_idx;
9242 static unsigned long scale_rt_capacity(int cpu)
9244 struct rq *rq = cpu_rq(cpu);
9245 u64 total, used, age_stamp, avg;
9249 * Since we're reading these variables without serialization make sure
9250 * we read them once before doing sanity checks on them.
9252 age_stamp = READ_ONCE(rq->age_stamp);
9253 avg = READ_ONCE(rq->rt_avg);
9254 delta = __rq_clock_broken(rq) - age_stamp;
9256 if (unlikely(delta < 0))
9259 total = sched_avg_period() + delta;
9261 used = div_u64(avg, total);
9264 * deadline bandwidth is defined at system level so we must
9265 * weight this bandwidth with the max capacity of the system.
9266 * As a reminder, avg_bw is 20 bits wide and
9267 * scale_cpu_capacity is 10 bits wide.
9269 used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
9271 if (likely(used < SCHED_CAPACITY_SCALE))
9272 return SCHED_CAPACITY_SCALE - used;
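/*
 * Rough worked example (illustrative only): if RT (and, depending on
 * config, IRQ) activity consumed about a quarter of the averaging
 * window at full frequency, used comes to roughly 256 out of
 * SCHED_CAPACITY_SCALE (1024), leaving about 768 capacity units for
 * CFS tasks.
 */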
9277 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
9279 raw_spin_lock_init(&mcc->lock);
9284 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9286 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
9287 struct sched_group *sdg = sd->groups;
9288 struct max_cpu_capacity *mcc;
9289 unsigned long max_capacity;
9291 unsigned long flags;
9293 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9295 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
9297 raw_spin_lock_irqsave(&mcc->lock, flags);
9298 max_capacity = mcc->val;
9299 max_cap_cpu = mcc->cpu;
9301 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
9302 (max_capacity < capacity)) {
9303 mcc->val = capacity;
9305 #ifdef CONFIG_SCHED_DEBUG
9306 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9307 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
9312 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9314 skip_unlock: __attribute__ ((unused));
9315 capacity *= scale_rt_capacity(cpu);
9316 capacity >>= SCHED_CAPACITY_SHIFT;
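/*
 * Continuing the scale_rt_capacity() example above with assumed
 * numbers: an original capacity of 1024 scaled by a result of 768
 * gives (1024 * 768) >> SCHED_CAPACITY_SHIFT = 768, so the capacity
 * published for CFS reflects time lost to other classes.
 */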
9321 cpu_rq(cpu)->cpu_capacity = capacity;
9322 sdg->sgc->capacity = capacity;
9323 sdg->sgc->max_capacity = capacity;
9324 sdg->sgc->min_capacity = capacity;
9327 void update_group_capacity(struct sched_domain *sd, int cpu)
9329 struct sched_domain *child = sd->child;
9330 struct sched_group *group, *sdg = sd->groups;
9331 unsigned long capacity, max_capacity, min_capacity;
9332 unsigned long interval;
9334 interval = msecs_to_jiffies(sd->balance_interval);
9335 interval = clamp(interval, 1UL, max_load_balance_interval);
9336 sdg->sgc->next_update = jiffies + interval;
9339 update_cpu_capacity(sd, cpu);
9345 min_capacity = ULONG_MAX;
9347 if (child->flags & SD_OVERLAP) {
9349 * SD_OVERLAP domains cannot assume that child groups
9350 * span the current group.
9353 for_each_cpu(cpu, sched_group_cpus(sdg)) {
9354 struct sched_group_capacity *sgc;
9355 struct rq *rq = cpu_rq(cpu);
9357 if (cpumask_test_cpu(cpu, cpu_isolated_mask))
9360 * build_sched_domains() -> init_sched_groups_capacity()
9361 * gets here before we've attached the domains to the runqueues.
9364 * Use capacity_of(), which is set irrespective of domains
9365 * in update_cpu_capacity().
9367 * This avoids capacity from being 0 and
9368 * causing divide-by-zero issues on boot.
9370 if (unlikely(!rq->sd)) {
9371 capacity += capacity_of(cpu);
9373 sgc = rq->sd->groups->sgc;
9374 capacity += sgc->capacity;
9377 max_capacity = max(capacity, max_capacity);
9378 min_capacity = min(capacity, min_capacity);
9382 * !SD_OVERLAP domains can assume that child groups
9383 * span the current group.
9386 group = child->groups;
9388 struct sched_group_capacity *sgc = group->sgc;
9390 cpumask_t *cpus = sched_group_cpus(group);
9392 /* Revisit this later. This won't work for MT domain */
9393 if (!cpu_isolated(cpumask_first(cpus))) {
9394 capacity += sgc->capacity;
9395 max_capacity = max(sgc->max_capacity, max_capacity);
9396 min_capacity = min(sgc->min_capacity, min_capacity);
9398 group = group->next;
9399 } while (group != child->groups);
9402 sdg->sgc->capacity = capacity;
9403 sdg->sgc->max_capacity = max_capacity;
9404 sdg->sgc->min_capacity = min_capacity;
9408 * Check whether the capacity of the rq has been noticeably reduced by side
9409 * activity. The imbalance_pct is used for the threshold.
9410 * Return true if the capacity is reduced.
9413 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9415 return ((rq->cpu_capacity * sd->imbalance_pct) <
9416 (rq->cpu_capacity_orig * 100));
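/*
 * For instance, with imbalance_pct = 117 and cpu_capacity_orig = 1024
 * this reports reduced capacity once cpu_capacity falls below roughly
 * 1024 * 100 / 117 ~= 875, i.e. when more than about 15% of the cpu
 * is consumed by RT/IRQ pressure.
 */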
9420 * Group imbalance indicates (and tries to solve) the problem where balancing
9421 * groups is inadequate due to tsk_cpus_allowed() constraints.
9423 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
9424 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
9427 * { 0 1 2 3 } { 4 5 6 7 }
9430 * If we were to balance group-wise we'd place two tasks in the first group and
9431 * two tasks in the second group. Clearly this is undesired as it will overload
9432 * cpu 3 and leave one of the cpus in the second group unused.
9434 * The current solution to this issue is detecting the skew in the first group
9435 * by noticing the lower domain failed to reach balance and had difficulty
9436 * moving tasks due to affinity constraints.
9438 * When this is so detected; this group becomes a candidate for busiest; see
9439 * update_sd_pick_busiest(). And calculate_imbalance() and
9440 * find_busiest_group() avoid some of the usual balance conditions to allow it
9441 * to create an effective group imbalance.
9443 * This is a somewhat tricky proposition since the next run might not find the
9444 * group imbalance and decide the groups need to be balanced again. A most
9445 * subtle and fragile situation.
9448 static inline int sg_imbalanced(struct sched_group *group)
9450 return group->sgc->imbalance;
9454 * group_has_capacity returns true if the group has spare capacity that could
9455 * be used by some tasks.
9456 * We consider that a group has spare capacity if the number of tasks is
9457 * smaller than the number of CPUs or if the utilization is lower than the
9458 * available capacity for CFS tasks.
9459 * For the latter, we use a threshold to stabilize the state, to take into
9460 * account the variance of the tasks' load and to return true if the available
9461 * capacity is meaningful for the load balancer.
9462 * As an example, an available capacity of 1% can appear but it doesn't bring
9463 * any benefit to the load balancer.
9466 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
9468 if (sgs->sum_nr_running < sgs->group_weight)
9471 if ((sgs->group_capacity * 100) >
9472 (sgs->group_util * env->sd->imbalance_pct))
9479 * group_is_overloaded returns true if the group has more tasks than it can handle.
9481 * group_is_overloaded is not equal to !group_has_capacity because a group
9482 * with the exact right number of tasks has no more spare capacity but is not
9483 * overloaded, so both group_has_capacity and group_is_overloaded return false.
9487 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
9489 if (sgs->sum_nr_running <= sgs->group_weight)
9492 if ((sgs->group_capacity * 100) <
9493 (sgs->group_util * env->sd->imbalance_pct))
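/*
 * Illustrative check (assumed numbers): with imbalance_pct = 125,
 * group_capacity = 1024 and group_util = 900, 1024 * 100 < 900 * 125
 * holds, so the group is treated as overloaded even though raw
 * utilization is still below raw capacity; the margin filters noise.
 */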
9501 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
9502 * per-cpu capacity than sched_group ref.
9505 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
9507 return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
9508 ref->sgc->max_capacity;
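/*
 * Assuming the common EAS defaults of capacity_margin = 1280 and
 * SCHED_LOAD_SCALE = 1024 (neither value is shown in this excerpt),
 * this reduces to sg->max_capacity + 256 < ref->max_capacity: a 512
 * capacity little cluster is "smaller" than a 1024 capacity big
 * cluster, while two near-equal clusters are not.
 */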
9512 group_type group_classify(struct sched_group *group,
9513 struct sg_lb_stats *sgs, struct lb_env *env)
9515 if (sgs->group_no_capacity)
9516 return group_overloaded;
9518 if (sg_imbalanced(group))
9519 return group_imbalanced;
9521 if (sgs->group_misfit_task)
9522 return group_misfit_task;
9527 #ifdef CONFIG_NO_HZ_COMMON
9529 * idle load balancing data
9530 * - used by the nohz balance, but we want it available here
9531 * so that we can see which CPUs have no tick.
9534 cpumask_var_t idle_cpus_mask;
9536 unsigned long next_balance; /* in jiffy units */
9537 } nohz ____cacheline_aligned;
9539 static inline void update_cpu_stats_if_tickless(struct rq *rq)
9541 /* only called from update_sg_lb_stats when irqs are disabled */
9542 if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
9543 /* rate limit updates to once-per-jiffie at most */
9544 if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
9547 raw_spin_lock(&rq->lock);
9548 update_rq_clock(rq);
9549 update_idle_cpu_load(rq);
9550 update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
9551 raw_spin_unlock(&rq->lock);
9556 static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
9560 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9561 * @env: The load balancing environment.
9562 * @group: sched_group whose statistics are to be updated.
9563 * @load_idx: Load index of sched_domain of this_cpu for load calc.
9564 * @local_group: Does group contain this_cpu.
9565 * @sgs: variable to hold the statistics for this group.
9566 * @overload: Indicate more than one runnable task for any CPU.
9567 * @overutilized: Indicate overutilization for any CPU.
9569 static inline void update_sg_lb_stats(struct lb_env *env,
9570 struct sched_group *group, int load_idx,
9571 int local_group, struct sg_lb_stats *sgs,
9572 bool *overload, bool *overutilized)
9577 memset(sgs, 0, sizeof(*sgs));
9579 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
9580 struct rq *rq = cpu_rq(i);
9582 trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
9587 if (cpu_isolated(i))
9590 /* if we are entering idle and there are CPUs with
9591 * their tick stopped, do an update for them
9593 if (env->idle == CPU_NEWLY_IDLE)
9594 update_cpu_stats_if_tickless(rq);
9596 /* Bias balancing toward cpus of our domain */
9598 load = target_load(i, load_idx);
9600 load = source_load(i, load_idx);
9602 sgs->group_load += load;
9603 sgs->group_util += cpu_util(i);
9604 sgs->sum_nr_running += rq->cfs.h_nr_running;
9606 nr_running = rq->nr_running;
9610 #ifdef CONFIG_SCHED_HMP
9611 sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
9612 sgs->group_cpu_load += cpu_load(i);
9615 #ifdef CONFIG_NUMA_BALANCING
9616 sgs->nr_numa_running += rq->nr_numa_running;
9617 sgs->nr_preferred_running += rq->nr_preferred_running;
9619 sgs->sum_weighted_load += weighted_cpuload(i);
9621 * No need to call idle_cpu() if nr_running is not 0
9623 if (!nr_running && idle_cpu(i))
9626 if (energy_aware() && cpu_overutilized(i)) {
9627 *overutilized = true;
9628 if (!sgs->group_misfit_task && rq->misfit_task)
9629 sgs->group_misfit_task = capacity_of(i);
9633 /* Isolated CPU has no weight */
9634 if (!group->group_weight) {
9635 sgs->group_capacity = 0;
9637 sgs->group_no_capacity = 1;
9638 sgs->group_type = group_other;
9639 sgs->group_weight = group->group_weight;
9641 /* Adjust by relative CPU capacity of the group */
9642 sgs->group_capacity = group->sgc->capacity;
9643 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
9644 sgs->group_capacity;
9646 sgs->group_weight = group->group_weight;
9648 sgs->group_no_capacity = group_is_overloaded(env, sgs);
9649 sgs->group_type = group_classify(group, sgs, env);
9652 if (sgs->sum_nr_running)
9653 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
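/*
 * Worked example with invented numbers: group_load = 2000 on
 * group_capacity = 1536 gives avg_load = 2000 * 1024 / 1536 ~= 1333,
 * and four runnable tasks with sum_weighted_load = 2000 give
 * load_per_task = 500.
 */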
9656 #ifdef CONFIG_SCHED_HMP
9657 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9658 struct sd_lb_stats *sds,
9659 struct sched_group *sg,
9660 struct sg_lb_stats *sgs)
9662 if (env->idle != CPU_NOT_IDLE &&
9663 cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
9664 if (sgs->sum_nr_big_tasks >
9665 sds->busiest_stat.sum_nr_big_tasks) {
9666 env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
9674 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9675 struct sd_lb_stats *sds,
9676 struct sched_group *sg,
9677 struct sg_lb_stats *sgs)
9684 * update_sd_pick_busiest - return 1 on busiest group
9685 * @env: The load balancing environment.
9686 * @sds: sched_domain statistics
9687 * @sg: sched_group candidate to be checked for being the busiest
9688 * @sgs: sched_group statistics
9690 * Determine if @sg is a busier group than the previously selected busiest group.
9693 * Return: %true if @sg is a busier group than the previously selected
9694 * busiest group. %false otherwise.
9696 static bool update_sd_pick_busiest(struct lb_env *env,
9697 struct sd_lb_stats *sds,
9698 struct sched_group *sg,
9699 struct sg_lb_stats *sgs)
9701 struct sg_lb_stats *busiest = &sds->busiest_stat;
9703 if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
9706 if (sgs->group_type > busiest->group_type)
9709 if (sgs->group_type < busiest->group_type)
9712 if (energy_aware()) {
9714 * Candidate sg doesn't face any serious load-balance problems
9715 * so don't pick it if the local sg is already filled up.
9717 if (sgs->group_type == group_other &&
9718 !group_has_capacity(env, &sds->local_stat))
9721 if (sgs->avg_load <= busiest->avg_load)
9724 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9728 * Candidate sg has no more than one task per CPU and
9729 * has higher per-CPU capacity. Migrating tasks to less
9730 * capable CPUs may harm throughput. Maximize throughput,
9731 * power/energy consequences are not considered.
9733 if (sgs->sum_nr_running <= sgs->group_weight &&
9734 group_smaller_cpu_capacity(sds->local, sg))
9739 /* This is the busiest node in its class. */
9740 if (!(env->sd->flags & SD_ASYM_PACKING))
9744 * ASYM_PACKING needs to move all the work to the lowest
9745 * numbered CPUs in the group, therefore mark all groups
9746 * higher than ourself as busy.
9748 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
9752 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
9759 #ifdef CONFIG_NUMA_BALANCING
9760 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9762 if (sgs->sum_nr_running > sgs->nr_numa_running)
9764 if (sgs->sum_nr_running > sgs->nr_preferred_running)
9769 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9771 if (rq->nr_running > rq->nr_numa_running)
9773 if (rq->nr_running > rq->nr_preferred_running)
9778 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9783 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9787 #endif /* CONFIG_NUMA_BALANCING */
9789 #define lb_sd_parent(sd) \
9790 (sd->parent && sd->parent->groups != sd->parent->groups->next)
9793 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
9794 * @env: The load balancing environment.
9795 * @sds: variable to hold the statistics for this sched_domain.
9797 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9799 struct sched_domain *child = env->sd->child;
9800 struct sched_group *sg = env->sd->groups;
9801 struct sg_lb_stats tmp_sgs;
9802 int load_idx, prefer_sibling = 0;
9803 bool overload = false, overutilized = false;
9805 if (child && child->flags & SD_PREFER_SIBLING)
9808 load_idx = get_sd_load_idx(env->sd, env->idle);
9811 struct sg_lb_stats *sgs = &tmp_sgs;
9814 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
9817 sgs = &sds->local_stat;
9819 if (env->idle != CPU_NEWLY_IDLE ||
9820 time_after_eq(jiffies, sg->sgc->next_update))
9821 update_group_capacity(env->sd, env->dst_cpu);
9824 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
9825 &overload, &overutilized);
9831 * In case the child domain prefers tasks go to siblings
9832 * first, lower the sg capacity so that we'll try
9833 * and move all the excess tasks away. We lower the capacity
9834 * of a group only if the local group has the capacity to fit
9835 * these excess tasks. The extra check prevents the case where
9836 * you always pull from the heaviest group when it is already
9837 * under-utilized (possible with a large weight task outweighing
9838 * the tasks on the system).
9840 if (prefer_sibling && sds->local &&
9841 group_has_capacity(env, &sds->local_stat) &&
9842 (sgs->sum_nr_running > 1)) {
9843 sgs->group_no_capacity = 1;
9844 sgs->group_type = group_classify(sg, sgs, env);
9848 * Ignore task groups with misfit tasks if local group has no
9849 * capacity or if per-cpu capacity isn't higher.
9851 if (energy_aware() &&
9852 sgs->group_type == group_misfit_task &&
9853 (!group_has_capacity(env, &sds->local_stat) ||
9854 !group_smaller_cpu_capacity(sg, sds->local)))
9855 sgs->group_type = group_other;
9857 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9859 sds->busiest_stat = *sgs;
9860 env->busiest_nr_running = sgs->sum_nr_running;
9861 env->busiest_grp_capacity = sgs->group_capacity;
9865 /* Now, start updating sd_lb_stats */
9866 sds->total_load += sgs->group_load;
9867 sds->total_capacity += sgs->group_capacity;
9870 } while (sg != env->sd->groups);
9872 if (env->sd->flags & SD_NUMA)
9873 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9875 env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9877 if (!lb_sd_parent(env->sd)) {
9878 /* update overload indicator if we are at root domain */
9879 if (env->dst_rq->rd->overload != overload)
9880 env->dst_rq->rd->overload = overload;
9882 /* Update over-utilization (tipping point, U >= 0) indicator */
9883 if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
9884 env->dst_rq->rd->overutilized = overutilized;
9885 trace_sched_overutilized(overutilized);
9888 if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
9889 env->dst_rq->rd->overutilized = true;
9890 trace_sched_overutilized(true);
9897 * check_asym_packing - Check to see if the group is packed into the
9900 * This is primarily intended to be used at the sibling level. Some
9901 * cores like POWER7 prefer to use lower numbered SMT threads. In the
9902 * case of POWER7, it can move to lower SMT modes only when higher
9903 * threads are idle. When in lower SMT modes, the threads will
9904 * perform better since they share fewer core resources. Hence when we
9905 * have idle threads, we want them to be the higher ones.
9907 * This packing function is run on idle threads. It checks to see if
9908 * the busiest CPU in this domain (core in the P7 case) has a higher
9909 * CPU number than the packing function is being run on. Here we are
9910 * assuming a lower CPU number will be equivalent to a lower SMT thread number.
9913 * Return: 1 when packing is required and a task should be moved to
9914 * this CPU. The amount of the imbalance is returned in *imbalance.
9916 * @env: The load balancing environment.
9917 * @sds: Statistics of the sched_domain which is to be packed
9919 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9923 if (!(env->sd->flags & SD_ASYM_PACKING))
9929 busiest_cpu = group_first_cpu(sds->busiest);
9930 if (env->dst_cpu > busiest_cpu)
9933 env->imbalance = DIV_ROUND_CLOSEST(
9934 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9935 SCHED_CAPACITY_SCALE);
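/*
 * Illustrative numbers: a busiest group with avg_load = 800 and
 * group_capacity = 2048 yields an asym-packing imbalance of
 * DIV_ROUND_CLOSEST(800 * 2048, 1024) = 1600, i.e. its entire
 * capacity-weighted load is asked to move toward the lower-numbered
 * dst_cpu.
 */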
9941 * fix_small_imbalance - Calculate the minor imbalance that exists
9942 * amongst the groups of a sched_domain, during load balancing.
9944 * @env: The load balancing environment.
9945 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9948 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9950 unsigned long tmp, capa_now = 0, capa_move = 0;
9951 unsigned int imbn = 2;
9952 unsigned long scaled_busy_load_per_task;
9953 struct sg_lb_stats *local, *busiest;
9955 local = &sds->local_stat;
9956 busiest = &sds->busiest_stat;
9958 if (!local->sum_nr_running)
9959 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9960 else if (busiest->load_per_task > local->load_per_task)
9963 scaled_busy_load_per_task =
9964 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9965 busiest->group_capacity;
9967 if (busiest->avg_load + scaled_busy_load_per_task >=
9968 local->avg_load + (scaled_busy_load_per_task * imbn)) {
9969 env->imbalance = busiest->load_per_task;
9974 * OK, we don't have enough imbalance to justify moving tasks,
9975 * however we may be able to increase total CPU capacity used by moving them.
9979 capa_now += busiest->group_capacity *
9980 min(busiest->load_per_task, busiest->avg_load);
9981 capa_now += local->group_capacity *
9982 min(local->load_per_task, local->avg_load);
9983 capa_now /= SCHED_CAPACITY_SCALE;
9985 /* Amount of load we'd subtract */
9986 if (busiest->avg_load > scaled_busy_load_per_task) {
9987 capa_move += busiest->group_capacity *
9988 min(busiest->load_per_task,
9989 busiest->avg_load - scaled_busy_load_per_task);
9992 /* Amount of load we'd add */
9993 if (busiest->avg_load * busiest->group_capacity <
9994 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9995 tmp = (busiest->avg_load * busiest->group_capacity) /
9996 local->group_capacity;
9998 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9999 local->group_capacity;
10001 capa_move += local->group_capacity *
10002 min(local->load_per_task, local->avg_load + tmp);
10003 capa_move /= SCHED_CAPACITY_SCALE;
10005 /* Move if we gain throughput */
10006 if (capa_move > capa_now)
10007 env->imbalance = busiest->load_per_task;
10011 * calculate_imbalance - Calculate the amount of imbalance present within the
10012 * groups of a given sched_domain during load balance.
10013 * @env: load balance environment
10014 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
10016 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
10018 unsigned long max_pull, load_above_capacity = ~0UL;
10019 struct sg_lb_stats *local, *busiest;
10021 local = &sds->local_stat;
10022 busiest = &sds->busiest_stat;
10024 if (busiest->group_type == group_imbalanced) {
10026 * In the group_imb case we cannot rely on group-wide averages
10027 * to ensure cpu-load equilibrium, look at wider averages. XXX
10029 busiest->load_per_task =
10030 min(busiest->load_per_task, sds->avg_load);
10034 * In the presence of smp nice balancing, certain scenarios can have
10035 * max load less than avg load (as we skip the groups at or below
10036 * its cpu_capacity while calculating max_load).
10038 if (busiest->avg_load <= sds->avg_load ||
10039 local->avg_load >= sds->avg_load) {
10040 if (energy_aware()) {
10041 /* Misfitting tasks should be migrated in any case */
10042 if (busiest->group_type == group_misfit_task) {
10043 env->imbalance = busiest->group_misfit_task;
10048 * Busiest group is overloaded, local is not, use the spare
10049 * cycles to maximize throughput
10051 if (busiest->group_type == group_overloaded &&
10052 local->group_type <= group_misfit_task) {
10053 env->imbalance = busiest->load_per_task;
10058 env->imbalance = 0;
10059 return fix_small_imbalance(env, sds);
10063 * If there aren't any idle cpus, avoid creating some.
10065 if (busiest->group_type == group_overloaded &&
10066 local->group_type == group_overloaded) {
10067 load_above_capacity = busiest->sum_nr_running *
10069 if (load_above_capacity > busiest->group_capacity)
10070 load_above_capacity -= busiest->group_capacity;
10072 load_above_capacity = ~0UL;
10076 * We're trying to get all the cpus to the average_load, so we don't
10077 * want to push ourselves above the average load, nor do we wish to
10078 * reduce the max loaded cpu below the average load. At the same time,
10079 * we also don't want to reduce the group load below the group capacity
10080 * (so that we can implement power-savings policies etc). Thus we look
10081 * for the minimum possible imbalance.
10083 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
10085 /* How much load to actually move to equalise the imbalance */
10086 env->imbalance = min(
10087 max_pull * busiest->group_capacity,
10088 (sds->avg_load - local->avg_load) * local->group_capacity
10089 ) / SCHED_CAPACITY_SCALE;
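/*
 * Worked example (made-up figures): busiest avg_load = 1200, local
 * avg_load = 800, domain avg_load = 1000 and both group capacities
 * 1024 with no idle-cpu constraint give max_pull = 1200 - 1000 = 200
 * and env->imbalance = min(200 * 1024, 200 * 1024) / 1024 = 200.
 */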
10091 /* Boost imbalance to allow misfit task to be balanced. */
10092 if (energy_aware() && busiest->group_type == group_misfit_task)
10093 env->imbalance = max_t(long, env->imbalance,
10094 busiest->group_misfit_task);
10097 * if *imbalance is less than the average load per runnable task
10098 * there is no guarantee that any tasks will be moved so we'll have
10099 * a think about bumping its value to force at least one task to be moved.
10102 if (env->imbalance < busiest->load_per_task)
10103 return fix_small_imbalance(env, sds);
10106 /******* find_busiest_group() helpers end here *********************/
10109 * find_busiest_group - Returns the busiest group within the sched_domain
10110 * if there is an imbalance. If there isn't an imbalance, and
10111 * the user has opted for power-savings, it returns a group whose
10112 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
10113 * such a group exists.
10115 * Also calculates the amount of weighted load which should be moved
10116 * to restore balance.
10118 * @env: The load balancing environment.
10120 * Return: - The busiest group if imbalance exists.
10121 * - If no imbalance and user has opted for power-savings balance,
10122 * return the least loaded group whose CPUs can be
10123 * put to idle by rebalancing its tasks onto our group.
10125 static struct sched_group *find_busiest_group(struct lb_env *env)
10127 struct sg_lb_stats *local, *busiest;
10128 struct sd_lb_stats sds;
10130 init_sd_lb_stats(&sds);
10133 * Compute the various statistics relevant for load balancing at
10136 update_sd_lb_stats(env, &sds);
10138 if (energy_aware() && !env->dst_rq->rd->overutilized)
10141 local = &sds.local_stat;
10142 busiest = &sds.busiest_stat;
10144 /* ASYM feature bypasses nice load balance check */
10145 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
10146 check_asym_packing(env, &sds))
10147 return sds.busiest;
10149 /* There is no busy sibling group to pull tasks from */
10150 if (!sds.busiest || busiest->sum_nr_running == 0)
10153 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10154 goto force_balance;
10156 if (bail_inter_cluster_balance(env, &sds))
10159 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
10160 / sds.total_capacity;
10163 * If the busiest group is imbalanced the below checks don't
10164 * work because they assume all things are equal, which typically
10165 * isn't true due to cpus_allowed constraints and the like.
10167 if (busiest->group_type == group_imbalanced)
10168 goto force_balance;
10171 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
10172 * capacities from resulting in underutilization due to avg_load.
10174 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
10175 busiest->group_no_capacity)
10176 goto force_balance;
10178 /* Misfitting tasks should be dealt with regardless of the avg load */
10179 if (energy_aware() && busiest->group_type == group_misfit_task) {
10180 goto force_balance;
10184 * If the local group is busier than the selected busiest group
10185 * don't try and pull any tasks.
10187 if (local->avg_load >= busiest->avg_load)
10191 * Don't pull any tasks if this group is already above the domain average load.
10194 if (local->avg_load >= sds.avg_load)
10197 if (env->idle == CPU_IDLE) {
10199 * This cpu is idle. If the busiest group is not overloaded
10200 * and there is no imbalance between this and busiest group
10201 * wrt idle cpus, it is balanced. The imbalance becomes
10202 * significant if the diff is greater than 1; otherwise we
10203 * might end up just moving the imbalance to another group.
10205 if ((busiest->group_type != group_overloaded) &&
10206 (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
10207 !group_smaller_cpu_capacity(sds.busiest, sds.local))
10211 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
10212 * imbalance_pct to be conservative.
10214 if (100 * busiest->avg_load <=
10215 env->sd->imbalance_pct * local->avg_load)
10220 env->busiest_group_type = busiest->group_type;
10221 /* Looks like there is an imbalance. Compute it */
10222 calculate_imbalance(env, &sds);
10223 return sds.busiest;
10226 env->imbalance = 0;
10230 #ifdef CONFIG_SCHED_HMP
10231 static struct rq *find_busiest_queue_hmp(struct lb_env *env,
10232 struct sched_group *group)
10234 struct rq *busiest = NULL, *busiest_big = NULL;
10235 u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
10236 int max_nr_big = 0, nr_big;
10237 bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
10241 cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
10243 for_each_cpu(i, &cpus) {
10244 struct rq *rq = cpu_rq(i);
10245 u64 cumulative_runnable_avg =
10246 rq->hmp_stats.cumulative_runnable_avg;
10248 if (!cpumask_test_cpu(i, env->cpus))
10253 nr_big = nr_big_tasks(rq);
10254 if (nr_big > max_nr_big ||
10255 (nr_big > 0 && nr_big == max_nr_big &&
10256 cumulative_runnable_avg > max_runnable_avg_big)) {
10257 max_runnable_avg_big = cumulative_runnable_avg;
10259 max_nr_big = nr_big;
10264 if (cumulative_runnable_avg > max_runnable_avg) {
10265 max_runnable_avg = cumulative_runnable_avg;
10271 return busiest_big;
10273 env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
10277 static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
10278 struct sched_group *group)
10285 * find_busiest_queue - find the busiest runqueue among the cpus in group.
10287 static struct rq *find_busiest_queue(struct lb_env *env,
10288 struct sched_group *group)
10290 struct rq *busiest = NULL, *rq;
10291 unsigned long busiest_load = 0, busiest_capacity = 1;
10294 #ifdef CONFIG_SCHED_HMP
10295 return find_busiest_queue_hmp(env, group);
10298 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
10299 unsigned long capacity, wl;
10303 rt = fbq_classify_rq(rq);
10306 * We classify groups/runqueues into three groups:
10307 * - regular: there are !numa tasks
10308 * - remote: there are numa tasks that run on the 'wrong' node
10309 * - all: there is no distinction
10311 * In order to avoid migrating ideally placed numa tasks,
10312 * ignore those when there are better options.
10314 * If we ignore the actual busiest queue to migrate another
10315 * task, the next balance pass can still reduce the busiest
10316 * queue by moving tasks around inside the node.
10318 * If we cannot move enough load due to this classification
10319 * the next pass will adjust the group classification and
10320 * allow migration of more tasks.
10322 * Both cases only affect the total convergence complexity.
10324 if (rt > env->fbq_type)
10327 capacity = capacity_of(i);
10329 wl = weighted_cpuload(i);
10332 * When comparing with imbalance, use weighted_cpuload()
10333 * which is not scaled with the cpu capacity.
10336 if (rq->nr_running == 1 && wl > env->imbalance &&
10337 !check_cpu_capacity(rq, env->sd) &&
10338 env->busiest_group_type != group_misfit_task)
10342 * For the load comparisons with the other cpu's, consider
10343 * the weighted_cpuload() scaled with the cpu capacity, so
10344 * that the load can be moved away from the cpu that is
10345 * potentially running at a lower capacity.
10347 * Thus we're looking for max(wl_i / capacity_i), crosswise
10348 * multiplication to rid ourselves of the division works out
10349 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
10350 * our previous maximum.
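 * Illustrative numbers: cpu i with wl = 600 on capacity 512 is picked
 * over cpu j with wl = 900 on capacity 1024, since 600 * 1024 >
 * 900 * 512; the smaller cpu is relatively busier despite its lower
 * absolute load.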
10352 if (wl * busiest_capacity > busiest_load * capacity) {
10354 busiest_capacity = capacity;
10363 * Max backoff if we encounter pinned tasks. The exact value is fairly
10364 * arbitrary, so long as it is large enough.
10366 #define MAX_PINNED_INTERVAL 16
10368 /* Working cpumask for load_balance and load_balance_newidle. */
10369 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
10371 #define NEED_ACTIVE_BALANCE_THRESHOLD 10
10373 static int need_active_balance(struct lb_env *env)
10375 struct sched_domain *sd = env->sd;
10377 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10380 if (env->idle == CPU_NEWLY_IDLE) {
10383 * ASYM_PACKING needs to force migrate tasks from busy but
10384 * higher numbered CPUs in order to pack all tasks in the
10385 * lowest numbered CPUs.
10387 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
10392 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
10393 * It's worth migrating the task if the src_cpu's capacity is reduced
10394 * because of other sched_class or IRQs if more capacity stays
10395 * available on dst_cpu.
10397 if ((env->idle != CPU_NOT_IDLE) &&
10398 (env->src_rq->cfs.h_nr_running == 1)) {
10399 if ((check_cpu_capacity(env->src_rq, sd)) &&
10400 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
10404 if (energy_aware() &&
10405 (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
10406 ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
10407 env->src_rq->cfs.h_nr_running == 1 &&
10408 cpu_overutilized(env->src_cpu) &&
10409 !cpu_overutilized(env->dst_cpu)) {
10413 return unlikely(sd->nr_balance_failed >
10414 sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
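/*
 * For example, with cache_nice_tries = 1 the fallback above resorts to
 * active balancing only after more than 1 + NEED_ACTIVE_BALANCE_THRESHOLD
 * = 11 consecutive failed balance attempts on this domain.
 */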
10417 static int group_balance_cpu_not_isolated(struct sched_group *sg)
10421 cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
10422 cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
10423 return cpumask_first(&cpus);
10426 static int should_we_balance(struct lb_env *env)
10428 struct sched_group *sg = env->sd->groups;
10429 struct cpumask *sg_cpus, *sg_mask;
10430 int cpu, balance_cpu = -1;
10433 * In the newly idle case, we will allow all the cpu's
10434 * to do the newly idle load balance.
10436 if (env->idle == CPU_NEWLY_IDLE)
10439 sg_cpus = sched_group_cpus(sg);
10440 sg_mask = sched_group_mask(sg);
10441 /* Try to find first idle cpu */
10442 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
10443 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
10451 if (balance_cpu == -1)
10452 balance_cpu = group_balance_cpu_not_isolated(sg);
10455 * First idle cpu or the first cpu (busiest) in this sched group
10456 * is eligible for doing load balancing at this and above domains.
10458 return balance_cpu == env->dst_cpu;
10462 * Check this_cpu to ensure it is balanced within domain. Attempt to move
10463 * tasks if there is an imbalance.
10465 static int load_balance(int this_cpu, struct rq *this_rq,
10466 struct sched_domain *sd, enum cpu_idle_type idle,
10467 int *continue_balancing)
10469 int ld_moved = 0, cur_ld_moved, active_balance = 0;
10470 struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
10471 struct sched_group *group = NULL;
10472 struct rq *busiest = NULL;
10473 unsigned long flags;
10474 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
10476 struct lb_env env = {
10478 .dst_cpu = this_cpu,
10480 .dst_grpmask = sched_group_cpus(sd->groups),
10482 .loop_break = sched_nr_migrate_break,
10485 .tasks = LIST_HEAD_INIT(env.tasks),
10489 .busiest_nr_running = 0,
10490 .busiest_grp_capacity = 0,
10491 .boost_policy = sched_boost_policy(),
10495 * For NEWLY_IDLE load_balancing, we don't need to consider
10496 * other cpus in our group
10498 if (idle == CPU_NEWLY_IDLE)
10499 env.dst_grpmask = NULL;
10501 cpumask_copy(cpus, cpu_active_mask);
10503 schedstat_inc(sd, lb_count[idle]);
10506 if (!should_we_balance(&env)) {
10507 *continue_balancing = 0;
10511 group = find_busiest_group(&env);
10513 schedstat_inc(sd, lb_nobusyg[idle]);
10517 busiest = find_busiest_queue(&env, group);
10519 schedstat_inc(sd, lb_nobusyq[idle]);
10523 BUG_ON(busiest == env.dst_rq);
10525 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
10527 env.src_cpu = busiest->cpu;
10528 env.src_rq = busiest;
10531 if (busiest->nr_running > 1) {
10533 * Attempt to move tasks. If find_busiest_group has found
10534 * an imbalance but busiest->nr_running <= 1, the group is
10535 * still unbalanced. ld_moved simply stays zero, so it is
10536 * correctly treated as an imbalance.
10538 env.flags |= LBF_ALL_PINNED;
10539 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
10542 raw_spin_lock_irqsave(&busiest->lock, flags);
10543 update_rq_clock(busiest);
10545 /* The world might have changed. Validate assumptions */
10546 if (busiest->nr_running <= 1) {
10547 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10548 env.flags &= ~LBF_ALL_PINNED;
10553 * cur_ld_moved - load moved in current iteration
10554 * ld_moved - cumulative load moved across iterations
10556 cur_ld_moved = detach_tasks(&env);
10559 * We've detached some tasks from busiest_rq. Every
10560 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
10561 * unlock busiest->lock, and we are able to be sure
10562 * that nobody can manipulate the tasks in parallel.
10563 * See task_rq_lock() family for the details.
10566 raw_spin_unlock(&busiest->lock);
10568 if (cur_ld_moved) {
10569 attach_tasks(&env);
10570 ld_moved += cur_ld_moved;
10573 local_irq_restore(flags);
10575 if (env.flags & LBF_NEED_BREAK) {
10576 env.flags &= ~LBF_NEED_BREAK;
10581 * Revisit (affine) tasks on src_cpu that couldn't be moved to
10582 * us and move them to an alternate dst_cpu in our sched_group
10583 * where they can run. The upper limit on how many times we
10584 * iterate on the same src_cpu depends on the number of cpus in our sched_group.
10587 * This changes load balance semantics a bit on who can move
10588 * load to a given_cpu. In addition to the given_cpu itself
10589 * (or an ilb_cpu acting on its behalf where given_cpu is
10590 * nohz-idle), we now have balance_cpu in a position to move
10591 * load to given_cpu. In rare situations, this may cause
10592 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
10593 * _independently_ and at _same_ time to move some load to
10594 * given_cpu) causing excess load to be moved to given_cpu.
10595 * This however should not happen so much in practice and
10596 * moreover subsequent load balance cycles should correct the
10597 * excess load moved.
10599 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
10601 /* Prevent re-selecting dst_cpu via env's cpus */
10602 cpumask_clear_cpu(env.dst_cpu, env.cpus);
10604 env.dst_rq = cpu_rq(env.new_dst_cpu);
10605 env.dst_cpu = env.new_dst_cpu;
10606 env.flags &= ~LBF_DST_PINNED;
10608 env.loop_break = sched_nr_migrate_break;
10611 * Go back to "more_balance" rather than "redo" since we
10612 * need to continue with same src_cpu.
10618 * We failed to reach balance because of affinity.
10621 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10623 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
10624 *group_imbalance = 1;
10627 /* All tasks on this runqueue were pinned by CPU affinity */
10628 if (unlikely(env.flags & LBF_ALL_PINNED)) {
10629 cpumask_clear_cpu(cpu_of(busiest), cpus);
10630 if (!cpumask_empty(cpus)) {
10632 env.loop_break = sched_nr_migrate_break;
10635 goto out_all_pinned;
10641 if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
10642 schedstat_inc(sd, lb_failed[idle]);
10645 * Increment the failure counter only on periodic balance.
10646 * We do not want newidle balance, which can be very
10647 * frequent, pollute the failure counter causing
10648 * excessive cache_hot migrations and active balances.
10650 if (idle != CPU_NEWLY_IDLE &&
10651 !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
10652 if (env.src_grp_nr_running > 1)
10653 sd->nr_balance_failed++;
10656 if (need_active_balance(&env)) {
10657 raw_spin_lock_irqsave(&busiest->lock, flags);
10659 /* don't kick the active_load_balance_cpu_stop,
10660 * if the curr task on busiest cpu can't be
10661 * moved to this_cpu
10663 if (!cpumask_test_cpu(this_cpu,
10664 tsk_cpus_allowed(busiest->curr))) {
10665 raw_spin_unlock_irqrestore(&busiest->lock,
10667 env.flags |= LBF_ALL_PINNED;
10668 goto out_one_pinned;
10672 * ->active_balance synchronizes accesses to
10673 * ->active_balance_work. Once set, it's cleared
10674 * only after active load balance is finished.
10676 if (!busiest->active_balance &&
10677 !cpu_isolated(cpu_of(busiest))) {
10678 busiest->active_balance = 1;
10679 busiest->push_cpu = this_cpu;
10680 active_balance = 1;
10682 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10684 if (active_balance) {
10685 stop_one_cpu_nowait(cpu_of(busiest),
10686 active_load_balance_cpu_stop, busiest,
10687 &busiest->active_balance_work);
10688 *continue_balancing = 0;
10692 * We've kicked active balancing, reset the failure counter.
10695 sd->nr_balance_failed =
10696 sd->cache_nice_tries +
10697 NEED_ACTIVE_BALANCE_THRESHOLD - 1;
10700 sd->nr_balance_failed = 0;
10702 /* Assumes one 'busiest' cpu that we pulled tasks from */
10703 if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
10704 int check_groups = !!(env.flags &
10705 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
10707 check_for_freq_change(this_rq, false, check_groups);
10708 check_for_freq_change(busiest, false, check_groups);
10710 check_for_freq_change(this_rq, true, false);
10713 if (likely(!active_balance)) {
10714 /* We were unbalanced, so reset the balancing interval */
10715 sd->balance_interval = sd->min_interval;
10718 * If we've begun active balancing, start to back off. This
10719 * case may not be covered by the all_pinned logic if there
10720 * is only 1 task on the busy runqueue (because we don't call detach_tasks).
10723 if (sd->balance_interval < sd->max_interval)
10724 sd->balance_interval *= 2;
10731 * We reach balance although we may have faced some affinity
10732 * constraints. Clear the imbalance flag if it was set.
10735 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10737 if (*group_imbalance)
10738 *group_imbalance = 0;
10743 * We reach balance because all tasks are pinned at this level so
10744 * we can't migrate them. Leave the imbalance flag set so the parent level
10745 * can try to migrate them.
10747 schedstat_inc(sd, lb_balanced[idle]);
10749 sd->nr_balance_failed = 0;
10752 /* tune up the balancing interval */
10753 if (((env.flags & LBF_ALL_PINNED) &&
10754 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10755 (sd->balance_interval < sd->max_interval))
10756 sd->balance_interval *= 2;
10760 trace_sched_load_balance(this_cpu, idle, *continue_balancing,
10761 group ? group->cpumask[0] : 0,
10762 busiest ? busiest->nr_running : 0,
10763 env.imbalance, env.flags, ld_moved,
10764 sd->balance_interval);
10768 static inline unsigned long
10769 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10771 unsigned long interval = sd->balance_interval;
10774 interval *= sd->busy_factor;
10776 /* scale ms to jiffies */
10777 interval = msecs_to_jiffies(interval);
10778 interval = clamp(interval, 1UL, max_load_balance_interval);
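/*
 * Example with assumed tunables: balance_interval = 8 (ms) and
 * busy_factor = 32 give a busy-cpu interval of 256 ms, which is then
 * converted to jiffies and clamped to max_load_balance_interval.
 */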
10784 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
10786 unsigned long interval, next;
10788 interval = get_sd_balance_interval(sd, cpu_busy);
10789 next = sd->last_balance + interval;
10791 if (time_after(*next_balance, next))
10792 *next_balance = next;
10796 * idle_balance is called by schedule() if this_cpu is about to become
10797 * idle. Attempts to pull tasks from other CPUs.
10799 static int idle_balance(struct rq *this_rq)
10801 unsigned long next_balance = jiffies + HZ;
10802 int this_cpu = this_rq->cpu;
10803 struct sched_domain *sd;
10804 int pulled_task = 0;
10807 if (cpu_isolated(this_cpu))
10810 idle_enter_fair(this_rq);
10813 * We must set idle_stamp _before_ calling idle_balance(), such that we
10814 * measure the duration of idle_balance() as idle time.
10816 this_rq->idle_stamp = rq_clock(this_rq);
10818 if (!energy_aware() &&
10819 (this_rq->avg_idle < sysctl_sched_migration_cost ||
10820 !this_rq->rd->overload)) {
10822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10824 update_next_balance(sd, 0, &next_balance);
10830 raw_spin_unlock(&this_rq->lock);
10832 update_blocked_averages(this_cpu);
10834 for_each_domain(this_cpu, sd) {
10835 int continue_balancing = 1;
10836 u64 t0, domain_cost;
10838 if (!(sd->flags & SD_LOAD_BALANCE))
10841 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10842 update_next_balance(sd, 0, &next_balance);
10846 if (sd->flags & SD_BALANCE_NEWIDLE) {
10847 t0 = sched_clock_cpu(this_cpu);
10849 pulled_task = load_balance(this_cpu, this_rq,
10850 sd, CPU_NEWLY_IDLE,
10851 &continue_balancing);
10853 domain_cost = sched_clock_cpu(this_cpu) - t0;
10854 if (domain_cost > sd->max_newidle_lb_cost)
10855 sd->max_newidle_lb_cost = domain_cost;
10857 curr_cost += domain_cost;
10860 update_next_balance(sd, 0, &next_balance);
10863 * Stop searching for tasks to pull if there are
10864 * now runnable tasks on the balance rq or if
10865 * continue_balancing has been unset (only possible
10866 * due to active migration).
10868 if (pulled_task || this_rq->nr_running > 0 ||
10869 !continue_balancing)
10874 raw_spin_lock(&this_rq->lock);
10876 if (curr_cost > this_rq->max_idle_balance_cost)
10877 this_rq->max_idle_balance_cost = curr_cost;
10880 * While browsing the domains, we released the rq lock, a task could
10881 * have been enqueued in the meantime. Since we're not going idle,
10882 * pretend we pulled a task.
10884 if (this_rq->cfs.h_nr_running && !pulled_task)
10888 /* Move the next balance forward */
10889 if (time_after(this_rq->next_balance, next_balance))
10890 this_rq->next_balance = next_balance;
10892 /* Is there a task of a high priority class? */
10893 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10897 idle_exit_fair(this_rq);
10898 this_rq->idle_stamp = 0;
10901 return pulled_task;
10905 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
10906 * running tasks off the busiest CPU onto idle CPUs. It requires at
10907 * least 1 task to be running on each physical CPU where possible, and
10908 * avoids physical / logical imbalances.
10910 static int active_load_balance_cpu_stop(void *data)
10912 struct rq *busiest_rq = data;
10913 int busiest_cpu = cpu_of(busiest_rq);
10914 int target_cpu = busiest_rq->push_cpu;
10915 struct rq *target_rq = cpu_rq(target_cpu);
10916 struct sched_domain *sd = NULL;
10917 struct task_struct *p = NULL;
10918 struct task_struct *push_task = NULL;
10919 int push_task_detached = 0;
10920 struct lb_env env = {
10922 .dst_cpu = target_cpu,
10923 .dst_rq = target_rq,
10924 .src_cpu = busiest_rq->cpu,
10925 .src_rq = busiest_rq,
10927 .busiest_nr_running = 0,
10928 .busiest_grp_capacity = 0,
10931 .boost_policy = sched_boost_policy(),
10933 bool moved = false;
10935 raw_spin_lock_irq(&busiest_rq->lock);
10937 /* make sure the requested cpu hasn't gone down in the meantime */
10938 if (unlikely(busiest_cpu != smp_processor_id() ||
10939 !busiest_rq->active_balance))
10942 /* Is there any task to move? */
10943 if (busiest_rq->nr_running <= 1)
10947 * This condition is "impossible", if it occurs
10948 * we need to fix it. Originally reported by
10949 * Bjorn Helgaas on a 128-cpu setup.
10951 BUG_ON(busiest_rq == target_rq);
10953 push_task = busiest_rq->push_task;
10954 target_cpu = busiest_rq->push_cpu;
10956 if (task_on_rq_queued(push_task) &&
10957 push_task->state == TASK_RUNNING &&
10958 task_cpu(push_task) == busiest_cpu &&
10959 cpu_online(target_cpu)) {
10960 detach_task(push_task, &env);
10961 push_task_detached = 1;
10967 /* Search for an sd spanning us and the target CPU. */
10969 for_each_domain(target_cpu, sd) {
10970 if ((sd->flags & SD_LOAD_BALANCE) &&
10971 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10977 schedstat_inc(sd, alb_count);
10978 update_rq_clock(busiest_rq);
10980 p = detach_one_task(&env);
10982 schedstat_inc(sd, alb_pushed);
10985 schedstat_inc(sd, alb_failed);
10990 busiest_rq->active_balance = 0;
10991 push_task = busiest_rq->push_task;
10992 target_cpu = busiest_rq->push_cpu;
10995 busiest_rq->push_task = NULL;
10997 raw_spin_unlock(&busiest_rq->lock);
11000 if (push_task_detached)
11001 attach_one_task(target_rq, push_task);
11002 put_task_struct(push_task);
11003 clear_reserved(target_cpu);
11007 attach_one_task(target_rq, p);
11009 local_irq_enable();
11011 if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
11012 int check_groups = !!(env.flags &
11013 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
11014 check_for_freq_change(busiest_rq, false, check_groups);
11015 check_for_freq_change(target_rq, false, check_groups);
11016 } else if (moved) {
11017 check_for_freq_change(target_rq, true, false);
11023 static inline int on_null_domain(struct rq *rq)
11025 return unlikely(!rcu_dereference_sched(rq->sd));
11028 #ifdef CONFIG_NO_HZ_COMMON
11030 * idle load balancing details
11031 * - When one of the busy CPUs notices that idle rebalancing may be
11032 * needed, it kicks the idle load balancer, which then does idle
11033 * load balancing for all the idle CPUs.
11036 #ifdef CONFIG_SCHED_HMP
11037 static inline int find_new_hmp_ilb(int type)
11039 int call_cpu = raw_smp_processor_id();
11040 struct sched_domain *sd;
11045 /* Pick an idle cpu "closest" to call_cpu */
11046 for_each_domain(call_cpu, sd) {
11047 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
11048 sched_domain_span(sd)) {
11049 if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
11050 cpu_max_power_cost(ilb) <=
11051 cpu_max_power_cost(call_cpu))) {
11053 reset_balance_interval(ilb);
11062 #else /* CONFIG_SCHED_HMP */
11063 static inline int find_new_hmp_ilb(int type)
11067 #endif /* CONFIG_SCHED_HMP */
11069 static inline int find_new_ilb(int type)
11073 #ifdef CONFIG_SCHED_HMP
11074 return find_new_hmp_ilb(type);
11077 ilb = cpumask_first(nohz.idle_cpus_mask);
11079 if (ilb < nr_cpu_ids && idle_cpu(ilb))
11086 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
11087 * nohz_load_balancer CPU (if there is one); otherwise fall back to any idle
11088 * CPU (if there is one).
11090 static void nohz_balancer_kick(int type)
11094 nohz.next_balance++;
11096 ilb_cpu = find_new_ilb(type);
11098 if (ilb_cpu >= nr_cpu_ids)
11101 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
11104 * Use smp_send_reschedule() instead of resched_cpu().
11105 * This way we generate a sched IPI on the target cpu which
11106 * is idle. And the softirq performing nohz idle load balance
11107 * will be run before returning from the IPI.
11109 smp_send_reschedule(ilb_cpu);
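/*
 * Illustrative sketch (not built): the kick above is made idempotent by
 * test_and_set_bit() -- only the caller that flips NOHZ_BALANCE_KICK from
 * 0 to 1 sends the IPI; concurrent callers see the bit already set and
 * back off. The helper name below is hypothetical.
 */
#if 0
static bool try_nohz_kick(unsigned long *flags_word)
{
	if (test_and_set_bit(NOHZ_BALANCE_KICK, flags_word))
		return false;	/* someone already kicked this CPU */
	return true;		/* we own the kick and must send the IPI */
}
#endif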
11113 void nohz_balance_clear_nohz_mask(int cpu)
11115 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
11116 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
11117 atomic_dec(&nohz.nr_cpus);
11121 static inline void nohz_balance_exit_idle(int cpu)
11123 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
11125 * Completely isolated CPUs never set NOHZ_TICK_STOPPED, so we must test.
11127 nohz_balance_clear_nohz_mask(cpu);
11128 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11132 static inline void set_cpu_sd_state_busy(void)
11134 struct sched_domain *sd;
11135 int cpu = smp_processor_id();
11138 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11140 if (!sd || !sd->nohz_idle)
11144 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
11149 void set_cpu_sd_state_idle(void)
11151 struct sched_domain *sd;
11152 int cpu = smp_processor_id();
11155 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11157 if (!sd || sd->nohz_idle)
11161 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
11167 * This routine will record that the cpu is going idle with tick stopped.
11168 * This info will be used in performing idle load balancing in the future.
11170 void nohz_balance_enter_idle(int cpu)
11173 * If this cpu is going down, then nothing needs to be done.
11175 if (!cpu_active(cpu))
11178 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
11182 * If we're a completely isolated CPU, we don't play.
11184 if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
11187 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
11188 atomic_inc(&nohz.nr_cpus);
11189 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11192 static int sched_ilb_notifier(struct notifier_block *nfb,
11193 unsigned long action, void *hcpu)
11195 switch (action & ~CPU_TASKS_FROZEN) {
11197 nohz_balance_exit_idle(smp_processor_id());
11200 return NOTIFY_DONE;
11205 static DEFINE_SPINLOCK(balancing);
11208 * Scale the max load_balance interval with the number of CPUs in the system.
11209 * This trades load-balance latency on larger machines for less cross talk.
11211 void update_max_interval(void)
11213 cpumask_t avail_mask;
11214 unsigned int available_cpus;
11216 cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
11217 available_cpus = cpumask_weight(&avail_mask);
11219 max_load_balance_interval = HZ*available_cpus/10;
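/*
 * Worked example for the formula above (assuming HZ=250 purely for
 * illustration): with 8 online, non-isolated CPUs the maximum interval is
 * 250 * 8 / 10 = 200 jiffies, i.e. about 0.8 seconds; with 64 such CPUs it
 * grows to 1600 jiffies (~6.4s), trading balance latency on large machines
 * for less cross-CPU traffic.
 */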
11223 * It checks each scheduling domain to see if it is due to be balanced,
11224 * and initiates a balancing operation if so.
11226 * Balancing parameters are set up in init_sched_domains.
11228 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
11230 int continue_balancing = 1;
11232 unsigned long interval;
11233 struct sched_domain *sd;
11234 /* Earliest time when we have to do rebalance again */
11235 unsigned long next_balance = jiffies + 60*HZ;
11236 int update_next_balance = 0;
11237 int need_serialize, need_decay = 0;
11240 update_blocked_averages(cpu);
11243 for_each_domain(cpu, sd) {
11245 * Decay the newidle max times here because this is a regular
11246 * visit to all the domains. Decay ~1% per second.
11248 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
11249 sd->max_newidle_lb_cost =
11250 (sd->max_newidle_lb_cost * 253) / 256;
11251 sd->next_decay_max_lb_cost = jiffies + HZ;
11254 max_cost += sd->max_newidle_lb_cost;
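/*
 * The decay factor above is 253/256 ~= 0.9883, so each HZ-spaced visit
 * removes roughly 1.2% of the recorded maximum; after about a minute an
 * otherwise idle estimate has decayed to ~0.9883^60 ~= 49% of its peak.
 */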
11256 if (!(sd->flags & SD_LOAD_BALANCE))
11260 * Stop the load balance at this level. There is another
11261 * CPU in our sched group which is doing load balancing more actively.
11264 if (!continue_balancing) {
11270 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11272 need_serialize = sd->flags & SD_SERIALIZE;
11273 if (need_serialize) {
11274 if (!spin_trylock(&balancing))
11278 if (time_after_eq(jiffies, sd->last_balance + interval)) {
11279 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
11281 * The LBF_DST_PINNED logic could have changed
11282 * env->dst_cpu, so we can't know our idle
11283 * state even if we migrated tasks. Update it.
11285 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
11287 sd->last_balance = jiffies;
11288 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11290 if (need_serialize)
11291 spin_unlock(&balancing);
11293 if (time_after(next_balance, sd->last_balance + interval)) {
11294 next_balance = sd->last_balance + interval;
11295 update_next_balance = 1;
11300 * Ensure the rq-wide value also decays but keep it at a
11301 * reasonable floor to avoid funnies with rq->avg_idle.
11303 rq->max_idle_balance_cost =
11304 max((u64)sysctl_sched_migration_cost, max_cost);
11309 * next_balance will be updated only when there is a need.
11310 * When the cpu is attached to a null domain, for example, it will not be updated.
11313 if (likely(update_next_balance)) {
11314 rq->next_balance = next_balance;
11316 #ifdef CONFIG_NO_HZ_COMMON
11318 * If this CPU has been elected to perform the nohz idle
11319 * balance, the other idle CPUs have already rebalanced with
11320 * nohz_idle_balance() and nohz.next_balance has been
11321 * updated accordingly. This CPU is now running the idle load
11322 * balance for itself, so we need to update
11323 * nohz.next_balance accordingly.
11325 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
11326 nohz.next_balance = rq->next_balance;
11331 #ifdef CONFIG_NO_HZ_COMMON
11333 * In the CONFIG_NO_HZ_COMMON case, the kicked idle CPU does the
11334 * rebalancing for all the cpus whose scheduler ticks are stopped.
11336 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
11338 int this_cpu = this_rq->cpu;
11341 /* Earliest time when we have to do rebalance again */
11342 unsigned long next_balance = jiffies + 60*HZ;
11343 int update_next_balance = 0;
11346 if (idle != CPU_IDLE ||
11347 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
11350 cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
11352 for_each_cpu(balance_cpu, &cpus) {
11353 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
11357 * If this cpu gets work to do, stop the load balancing
11358 * work being done for other cpus. Next load
11359 * balancing owner will pick it up.
11361 if (need_resched())
11364 rq = cpu_rq(balance_cpu);
11367 * If the time for the next balance is due, do the balance.
11370 if (time_after_eq(jiffies, rq->next_balance)) {
11371 raw_spin_lock_irq(&rq->lock);
11372 update_rq_clock(rq);
11373 update_idle_cpu_load(rq);
11374 raw_spin_unlock_irq(&rq->lock);
11375 rebalance_domains(rq, CPU_IDLE);
11378 if (time_after(next_balance, rq->next_balance)) {
11379 next_balance = rq->next_balance;
11380 update_next_balance = 1;
11385 * next_balance will be updated only when there is a need.
11386 * When the CPU is attached to a null domain, for example, it will not be updated.
11389 if (likely(update_next_balance))
11390 nohz.next_balance = next_balance;
11392 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
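/*
 * Illustrative sketch (not built) of the loop above: the kicked CPU
 * balances on behalf of every tick-stopped CPU but aborts as soon as it
 * has pending work of its own, leaving the rest to the next ilb owner.
 * The helper name and array inputs are hypothetical.
 */
#if 0
static void ilb_on_behalf_of(const int *idle_cpus, int nr_idle, int this_cpu)
{
	int i;

	for (i = 0; i < nr_idle; i++) {
		if (idle_cpus[i] == this_cpu)
			continue;	/* this CPU balances itself afterwards */
		if (need_resched())
			break;		/* got work: stop helping others */
		/* rebalance_domains(cpu_rq(idle_cpus[i]), CPU_IDLE); */
	}
}
#endif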
11395 #ifdef CONFIG_SCHED_HMP
11396 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11398 struct sched_domain *sd;
11401 if (rq->nr_running < 2)
11404 if (!sysctl_sched_restrict_cluster_spill ||
11405 sched_boost_policy() == SCHED_BOOST_ON_ALL)
11408 if (cpu_max_power_cost(cpu) == max_power_cost)
11412 sd = rcu_dereference_check_sched_domain(rq->sd);
11418 for_each_cpu(i, sched_domain_span(sd)) {
11419 if (cpu_load(i) < sched_spill_load &&
11420 cpu_rq(i)->nr_running <
11421 sysctl_sched_spill_nr_run) {
11422 /* Change the kick type to limit it to CPUs that
11423 * are of equal or lower capacity.
11425 *type = NOHZ_KICK_RESTRICT;
11433 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11439 static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
11441 unsigned long now = jiffies;
11444 * None are in tickless mode and hence there is no need for NOHZ idle load balancing.
11447 if (likely(!atomic_read(&nohz.nr_cpus)))
11450 #ifdef CONFIG_SCHED_HMP
11451 return _nohz_kick_needed_hmp(rq, cpu, type);
11454 if (time_before(now, nohz.next_balance))
11457 if (rq->nr_running >= 2 &&
11458 (!energy_aware() || cpu_overutilized(cpu)))
11462 /* Do idle load balance if there is a misfit task */
11462 if (energy_aware())
11463 return rq->misfit_task;
11465 return (rq->nr_running >= 2);
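/*
 * Illustrative sketch (not built): stripped of the HMP path, the decision
 * above reduces to "two or more runnable tasks", with energy-aware
 * scheduling additionally demanding an overutilized CPU, or falling back
 * to the misfit-task flag. Plain-C model with hypothetical inputs:
 */
#if 0
static int kick_decision(int nr_running, int ea, int overutilized, int misfit)
{
	if (nr_running >= 2 && (!ea || overutilized))
		return 1;
	if (ea)
		return misfit;
	return nr_running >= 2;
}
#endif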
11469 * Current heuristic for kicking the idle load balancer in the presence
11470 * of an idle cpu in the system.
11471 * - This rq has more than one task.
11472 * - This rq has at least one CFS task and the capacity of the CPU is
11473 * significantly reduced because of RT tasks or IRQs.
11474 * - At the parent of the LLC scheduler domain level, this cpu's scheduler group has
11475 * multiple busy CPUs.
11476 * - For SD_ASYM_PACKING, if the lower-numbered CPUs in the scheduler
11477 * domain span are idle.
11479 static inline bool nohz_kick_needed(struct rq *rq, int *type)
11481 #ifndef CONFIG_SCHED_HMP
11482 struct sched_domain *sd;
11483 struct sched_group_capacity *sgc;
11489 if (unlikely(rq->idle_balance))
11493 * We may be recently in ticked or tickless idle mode. At the first
11494 * busy tick after returning from idle, we will update the busy stats.
11496 set_cpu_sd_state_busy();
11497 nohz_balance_exit_idle(cpu);
11499 if (_nohz_kick_needed(rq, cpu, type))
11502 #ifndef CONFIG_SCHED_HMP
11504 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11506 sgc = sd->groups->sgc;
11507 nr_busy = atomic_read(&sgc->nr_busy_cpus);
11516 sd = rcu_dereference(rq->sd);
11518 if ((rq->cfs.h_nr_running >= 1) &&
11519 check_cpu_capacity(rq, sd)) {
11525 sd = rcu_dereference(per_cpu(sd_asym, cpu));
11526 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
11527 sched_domain_span(sd)) < cpu)) {
11538 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
11542 * run_rebalance_domains is triggered when needed from the scheduler tick.
11543 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
11545 static void run_rebalance_domains(struct softirq_action *h)
11547 struct rq *this_rq = this_rq();
11548 enum cpu_idle_type idle = this_rq->idle_balance ?
11549 CPU_IDLE : CPU_NOT_IDLE;
11552 * If this cpu has a pending nohz_balance_kick, then do the
11553 * balancing on behalf of the other idle cpus whose ticks are
11554 * stopped. Do nohz_idle_balance *before* rebalance_domains to
11555 * give the idle cpus a chance to load balance. Else we may
11556 * load balance only within the local sched_domain hierarchy
11557 * and abort nohz_idle_balance altogether if we pull some load.
11559 nohz_idle_balance(this_rq, idle);
11560 rebalance_domains(this_rq, idle);
11564 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
11566 void trigger_load_balance(struct rq *rq)
11568 int type = NOHZ_KICK_ANY;
11570 /* Don't need to rebalance while attached to NULL domain or the cpu is isolated. */
11573 if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
11576 if (time_after_eq(jiffies, rq->next_balance))
11577 raise_softirq(SCHED_SOFTIRQ);
11578 #ifdef CONFIG_NO_HZ_COMMON
11579 if (nohz_kick_needed(rq, &type))
11580 nohz_balancer_kick(type);
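/*
 * Summary of the trigger above: two independent checks per tick -- raise
 * SCHED_SOFTIRQ locally once rq->next_balance is due, and, under
 * CONFIG_NO_HZ_COMMON, kick a remote idle CPU via nohz_balancer_kick()
 * when nohz_kick_needed() decides tick-stopped CPUs need balancing.
 */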
11584 static void rq_online_fair(struct rq *rq)
11588 update_runtime_enabled(rq);
11591 static void rq_offline_fair(struct rq *rq)
11595 /* Ensure any throttled groups are reachable by pick_next_task */
11596 unthrottle_offline_cfs_rqs(rq);
11599 #endif /* CONFIG_SMP */
11602 * scheduler tick hitting a task of our scheduling class:
11604 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
11606 struct cfs_rq *cfs_rq;
11607 struct sched_entity *se = &curr->se;
11609 for_each_sched_entity(se) {
11610 cfs_rq = cfs_rq_of(se);
11611 entity_tick(cfs_rq, se, queued);
11614 if (static_branch_unlikely(&sched_numa_balancing))
11615 task_tick_numa(rq, curr);
11618 if (energy_aware() &&
11619 !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
11620 rq->rd->overutilized = true;
11621 trace_sched_overutilized(true);
11624 rq->misfit_task = !task_fits_max(curr, rq->cpu);
11630 * called on fork with the child task as argument from the parent's context
11631 * - child not yet on the tasklist
11632 * - preemption disabled
11634 static void task_fork_fair(struct task_struct *p)
11636 struct cfs_rq *cfs_rq;
11637 struct sched_entity *se = &p->se, *curr;
11638 struct rq *rq = this_rq();
11640 raw_spin_lock(&rq->lock);
11641 update_rq_clock(rq);
11643 cfs_rq = task_cfs_rq(current);
11644 curr = cfs_rq->curr;
11646 update_curr(cfs_rq);
11647 se->vruntime = curr->vruntime;
11649 place_entity(cfs_rq, se, 1);
11651 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11653 * Upon rescheduling, sched_class::put_prev_task() will place
11654 * 'current' within the tree based on its new key value.
11656 swap(curr->vruntime, se->vruntime);
11660 se->vruntime -= cfs_rq->min_vruntime;
11661 raw_spin_unlock(&rq->lock);
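/*
 * Illustrative sketch (not built) of the fork-time vruntime handling
 * above: place_entity() pushes the child behind the parent, the optional
 * swap makes the child run first, and subtracting min_vruntime makes the
 * value relative so it can be re-based on whichever cfs_rq the child is
 * eventually enqueued on. Helper name and the place_offset parameter are
 * hypothetical.
 */
#if 0
static void fork_vruntime(u64 *parent_vr, u64 *child_vr, u64 place_offset,
			  u64 min_vruntime, int child_runs_first)
{
	*child_vr = *parent_vr + place_offset;	/* initial placement penalty */
	if (child_runs_first && *parent_vr < *child_vr)
		swap(*parent_vr, *child_vr);	/* give the child the earlier vruntime */
	*child_vr -= min_vruntime;		/* made relative; re-based at enqueue */
}
#endif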
11665 * Priority of the task has changed. Check to see if we preempt
11666 * the current task.
11669 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11671 if (!task_on_rq_queued(p))
11675 * Reschedule if we are currently running on this runqueue and
11676 * our priority decreased, or if we are not currently running on
11677 * this runqueue and our priority is higher than the current's
11679 if (rq->curr == p) {
11680 if (p->prio > oldprio)
11683 check_preempt_curr(rq, p, 0);
11686 static inline bool vruntime_normalized(struct task_struct *p)
11688 struct sched_entity *se = &p->se;
11691 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11692 * the dequeue_entity(.flags=0) will already have normalized the vruntime.
11699 * When !on_rq, vruntime of the task has usually NOT been normalized.
11700 * But there are some cases where it has already been normalized:
11702 * - A forked child that is waiting to be woken up by
11703 * wake_up_new_task().
11704 * - A task that has been woken up by try_to_wake_up() and is
11705 * waiting to actually be woken up by sched_ttwu_pending().
11707 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
11713 #ifdef CONFIG_FAIR_GROUP_SCHED
11715 * Propagate the changes of the sched_entity across the tg tree to make
11716 * them visible to the root.
11718 static void propagate_entity_cfs_rq(struct sched_entity *se)
11720 struct cfs_rq *cfs_rq;
11722 /* Start to propagate at parent */
11725 for_each_sched_entity(se) {
11726 cfs_rq = cfs_rq_of(se);
11728 if (cfs_rq_throttled(cfs_rq))
11731 update_load_avg(se, UPDATE_TG);
11735 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11738 static void detach_entity_cfs_rq(struct sched_entity *se)
11740 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11742 /* Catch up with the cfs_rq and remove our load when we leave */
11743 update_load_avg(se, 0);
11744 detach_entity_load_avg(cfs_rq, se);
11745 update_tg_load_avg(cfs_rq, false);
11746 propagate_entity_cfs_rq(se);
11749 static void attach_entity_cfs_rq(struct sched_entity *se)
11751 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11753 #ifdef CONFIG_FAIR_GROUP_SCHED
11755 * Since the real depth could have been changed (only the FAIR
11756 * class maintains the depth value), reset the depth properly.
11758 se->depth = se->parent ? se->parent->depth + 1 : 0;
11761 /* Synchronize entity with its cfs_rq */
11762 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11763 attach_entity_load_avg(cfs_rq, se);
11764 update_tg_load_avg(cfs_rq, false);
11765 propagate_entity_cfs_rq(se);
11768 static void detach_task_cfs_rq(struct task_struct *p)
11770 struct sched_entity *se = &p->se;
11771 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11773 if (!vruntime_normalized(p)) {
11775 * Fix up our vruntime so that the current sleep doesn't
11776 * cause an 'unlimited' sleep bonus.
11778 place_entity(cfs_rq, se, 0);
11779 se->vruntime -= cfs_rq->min_vruntime;
11782 detach_entity_cfs_rq(se);
11785 static void attach_task_cfs_rq(struct task_struct *p)
11787 struct sched_entity *se = &p->se;
11788 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11790 attach_entity_cfs_rq(se);
11792 if (!vruntime_normalized(p))
11793 se->vruntime += cfs_rq->min_vruntime;
11796 static void switched_from_fair(struct rq *rq, struct task_struct *p)
11798 detach_task_cfs_rq(p);
11801 static void switched_to_fair(struct rq *rq, struct task_struct *p)
11803 attach_task_cfs_rq(p);
11805 if (task_on_rq_queued(p)) {
11807 * We were most likely switched from sched_rt, so
11808 * kick off the schedule if running, otherwise just see
11809 * if we can still preempt the current task.
11814 check_preempt_curr(rq, p, 0);
11818 /* Account for a task changing its policy or group.
11820 * This routine is mostly called to set cfs_rq->curr field when a task
11821 * migrates between groups/classes.
11823 static void set_curr_task_fair(struct rq *rq)
11825 struct sched_entity *se = &rq->curr->se;
11827 for_each_sched_entity(se) {
11828 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11830 set_next_entity(cfs_rq, se);
11831 /* ensure bandwidth has been allocated on our new cfs_rq */
11832 account_cfs_rq_runtime(cfs_rq, 0);
11836 void init_cfs_rq(struct cfs_rq *cfs_rq)
11838 cfs_rq->tasks_timeline = RB_ROOT;
11839 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11840 #ifndef CONFIG_64BIT
11841 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11844 #ifdef CONFIG_FAIR_GROUP_SCHED
11845 cfs_rq->propagate_avg = 0;
11847 atomic_long_set(&cfs_rq->removed_load_avg, 0);
11848 atomic_long_set(&cfs_rq->removed_util_avg, 0);
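/*
 * Note on the initialisation above: min_vruntime starts at
 * (u64)(-(1LL << 20)), about a million nanoseconds below the u64 wrap
 * point, so the signed-difference comparisons used on vruntime exercise
 * the wraparound path shortly after boot instead of only after roughly
 * 584 years of uptime.
 */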
11852 #ifdef CONFIG_FAIR_GROUP_SCHED
11853 static void task_set_group_fair(struct task_struct *p)
11855 struct sched_entity *se = &p->se;
11857 set_task_rq(p, task_cpu(p));
11858 se->depth = se->parent ? se->parent->depth + 1 : 0;
11861 static void task_move_group_fair(struct task_struct *p)
11863 detach_task_cfs_rq(p);
11864 set_task_rq(p, task_cpu(p));
11867 /* Tell the load tracking that se's cfs_rq has changed -- the task has migrated */
11868 p->se.avg.last_update_time = 0;
11870 attach_task_cfs_rq(p);
11873 static void task_change_group_fair(struct task_struct *p, int type)
11876 case TASK_SET_GROUP:
11877 task_set_group_fair(p);
11880 case TASK_MOVE_GROUP:
11881 task_move_group_fair(p);
11886 void free_fair_sched_group(struct task_group *tg)
11890 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11892 for_each_possible_cpu(i) {
11894 kfree(tg->cfs_rq[i]);
11903 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11905 struct sched_entity *se;
11906 struct cfs_rq *cfs_rq;
11910 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
11913 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
11917 tg->shares = NICE_0_LOAD;
11919 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11921 for_each_possible_cpu(i) {
11924 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11925 GFP_KERNEL, cpu_to_node(i));
11929 se = kzalloc_node(sizeof(struct sched_entity),
11930 GFP_KERNEL, cpu_to_node(i));
11934 init_cfs_rq(cfs_rq);
11935 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11936 init_entity_runnable_average(se);
11938 raw_spin_lock_irq(&rq->lock);
11939 post_init_entity_util_avg(se);
11940 raw_spin_unlock_irq(&rq->lock);
11951 void unregister_fair_sched_group(struct task_group *tg)
11953 unsigned long flags;
11957 for_each_possible_cpu(cpu) {
11959 remove_entity_load_avg(tg->se[cpu]);
11962 * Only empty task groups can be destroyed, so we can speculatively
11963 * check on_list without danger of it being re-added.
11965 if (!tg->cfs_rq[cpu]->on_list)
11970 raw_spin_lock_irqsave(&rq->lock, flags);
11971 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11972 raw_spin_unlock_irqrestore(&rq->lock, flags);
11976 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11977 struct sched_entity *se, int cpu,
11978 struct sched_entity *parent)
11980 struct rq *rq = cpu_rq(cpu);
11984 init_cfs_rq_runtime(cfs_rq);
11986 tg->cfs_rq[cpu] = cfs_rq;
11989 /* se could be NULL for root_task_group */
11994 se->cfs_rq = &rq->cfs;
11997 se->cfs_rq = parent->my_q;
11998 se->depth = parent->depth + 1;
12002 /* guarantee group entities always have weight */
12003 update_load_set(&se->load, NICE_0_LOAD);
12004 se->parent = parent;
12007 static DEFINE_MUTEX(shares_mutex);
12009 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
12012 unsigned long flags;
12015 * We can't change the weight of the root cgroup.
12020 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
12022 mutex_lock(&shares_mutex);
12023 if (tg->shares == shares)
12026 tg->shares = shares;
12027 for_each_possible_cpu(i) {
12028 struct rq *rq = cpu_rq(i);
12029 struct sched_entity *se;
12032 /* Propagate contribution to hierarchy */
12033 raw_spin_lock_irqsave(&rq->lock, flags);
12035 /* Possible calls to update_curr() need rq clock */
12036 update_rq_clock(rq);
12037 for_each_sched_entity(se) {
12038 update_load_avg(se, UPDATE_TG);
12039 update_cfs_shares(se);
12041 raw_spin_unlock_irqrestore(&rq->lock, flags);
12045 mutex_unlock(&shares_mutex);
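/*
 * Usage note for the setter above: the requested value is clamped to
 * [MIN_SHARES, MAX_SHARES] after scale_load(), a write that leaves
 * tg->shares unchanged returns without touching any runqueue, and an
 * actual change walks every possible CPU to re-propagate the group
 * entity's load up the hierarchy under that CPU's rq lock.
 */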
12048 #else /* CONFIG_FAIR_GROUP_SCHED */
12050 void free_fair_sched_group(struct task_group *tg) { }
12052 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12057 void unregister_fair_sched_group(struct task_group *tg) { }
12059 #endif /* CONFIG_FAIR_GROUP_SCHED */
12062 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
12064 struct sched_entity *se = &task->se;
12065 unsigned int rr_interval = 0;
12068 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise idle runqueue.
12071 if (rq->cfs.load.weight)
12072 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
12074 return rr_interval;
12078 * All the scheduling class methods:
12080 const struct sched_class fair_sched_class = {
12081 .next = &idle_sched_class,
12082 .enqueue_task = enqueue_task_fair,
12083 .dequeue_task = dequeue_task_fair,
12084 .yield_task = yield_task_fair,
12085 .yield_to_task = yield_to_task_fair,
12087 .check_preempt_curr = check_preempt_wakeup,
12089 .pick_next_task = pick_next_task_fair,
12090 .put_prev_task = put_prev_task_fair,
12093 .select_task_rq = select_task_rq_fair,
12094 .migrate_task_rq = migrate_task_rq_fair,
12096 .rq_online = rq_online_fair,
12097 .rq_offline = rq_offline_fair,
12099 .task_waking = task_waking_fair,
12100 .task_dead = task_dead_fair,
12101 .set_cpus_allowed = set_cpus_allowed_common,
12104 .set_curr_task = set_curr_task_fair,
12105 .task_tick = task_tick_fair,
12106 .task_fork = task_fork_fair,
12108 .prio_changed = prio_changed_fair,
12109 .switched_from = switched_from_fair,
12110 .switched_to = switched_to_fair,
12112 .get_rr_interval = get_rr_interval_fair,
12114 .update_curr = update_curr_fair,
12116 #ifdef CONFIG_FAIR_GROUP_SCHED
12117 .task_change_group = task_change_group_fair,
12119 #ifdef CONFIG_SCHED_HMP
12120 .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
12121 .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
12122 .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
12126 #ifdef CONFIG_SCHED_DEBUG
12127 void print_cfs_stats(struct seq_file *m, int cpu)
12129 struct cfs_rq *cfs_rq;
12132 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
12133 print_cfs_rq(m, cpu, cfs_rq);
12137 #ifdef CONFIG_NUMA_BALANCING
12138 void show_numa_stats(struct task_struct *p, struct seq_file *m)
12141 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
12143 for_each_online_node(node) {
12144 if (p->numa_faults) {
12145 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
12146 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
12148 if (p->numa_group) {
12149 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
12150 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
12152 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
12155 #endif /* CONFIG_NUMA_BALANCING */
12156 #endif /* CONFIG_SCHED_DEBUG */
12158 __init void init_sched_fair_class(void)
12161 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
12163 #ifdef CONFIG_NO_HZ_COMMON
12164 nohz.next_balance = jiffies;
12165 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
12166 cpu_notifier(sched_ilb_notifier, 0);