2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 #include <linux/module.h>
36 #include <trace/events/sched.h>
41 * Targeted preemption latency for CPU-bound tasks:
42 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
44 * NOTE: this latency value is not the same as the concept of
45 * 'timeslice length' - timeslices in CFS are of variable length
46 * and have no persistent notion like in traditional, time-slice
47 * based scheduling concepts.
49 * (to see the precise effective timeslice length of your workload,
50 * run vmstat and monitor the context-switches (cs) field)
52 unsigned int sysctl_sched_latency = 6000000ULL;
53 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
55 unsigned int sysctl_sched_sync_hint_enable = 1;
56 unsigned int sysctl_sched_cstate_aware = 1;
59 * The initial- and re-scaling of tunables is configurable
60 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
64 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
65 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
67 enum sched_tunable_scaling sysctl_sched_tunable_scaling
68 = SCHED_TUNABLESCALING_LOG;
71 * Minimal preemption granularity for CPU-bound tasks:
72 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
74 unsigned int sysctl_sched_min_granularity = 750000ULL;
75 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
78 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
80 static unsigned int sched_nr_latency = 8;
83 * After fork, child runs first. If set to 0 (default) then
84 * parent will (try to) run first.
86 unsigned int sysctl_sched_child_runs_first __read_mostly;
89 * SCHED_OTHER wake-up granularity.
90 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
92 * This option delays the preemption effects of decoupled workloads
93 * and reduces their over-scheduling. Synchronous workloads will still
94 * have immediate wakeup/sleep latencies.
96 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
97 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
99 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
102 * The exponential sliding window over which load is averaged for shares
103 * distribution. (default: 10 msec, units: nanoseconds)
106 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
108 #ifdef CONFIG_CFS_BANDWIDTH
110 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
111 * each time a cfs_rq requests quota.
113 * Note: in the case that the slice exceeds the runtime remaining (either due
114 * to consumption or the quota being specified to be smaller than the slice)
115 * we will always only issue the remaining available time.
117 * default: 5 msec, units: microseconds
119 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
123 * The margin used when comparing utilization with CPU capacity:
124 * util * margin < capacity * 1024
126 unsigned int capacity_margin = 1280; /* ~20% */
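/*
 * Worked example (illustrative only, not part of the original source): with
 * capacity_margin = 1280 the test "util * margin < capacity * 1024" requires
 * util to stay below roughly 80% of the CPU's capacity. E.g. util = 410 on a
 * CPU of capacity 512 fails, since 410 * 1280 = 524800 is not less than
 * 512 * 1024 = 524288, so a bigger CPU would be preferred for that task.
 */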
128 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
134 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
140 static inline void update_load_set(struct load_weight *lw, unsigned long w)
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the number of CPUs.
153 * This idea comes from the SD scheduler of Con Kolivas:
155 static unsigned int get_update_sysctl_factor(void)
157 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
160 switch (sysctl_sched_tunable_scaling) {
161 case SCHED_TUNABLESCALING_NONE:
164 case SCHED_TUNABLESCALING_LINEAR:
167 case SCHED_TUNABLESCALING_LOG:
169 factor = 1 + ilog2(cpus);
176 static void update_sysctl(void)
178 unsigned int factor = get_update_sysctl_factor();
180 #define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity);
183 SET_SYSCTL(sched_latency);
184 SET_SYSCTL(sched_wakeup_granularity);
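/*
 * Illustrative example of the scaling above (assumes 8 or more online CPUs
 * and the default SCHED_TUNABLESCALING_LOG): get_update_sysctl_factor()
 * clamps cpus to 8, so factor = 1 + ilog2(8) = 4 and the effective values
 * become:
 *   sysctl_sched_latency            = 6ms    * 4 = 24ms
 *   sysctl_sched_min_granularity    = 0.75ms * 4 = 3ms
 *   sysctl_sched_wakeup_granularity = 1ms    * 4 = 4ms
 */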
188 void sched_init_granularity(void)
193 #define WMULT_CONST (~0U)
194 #define WMULT_SHIFT 32
196 static void __update_inv_weight(struct load_weight *lw)
200 if (likely(lw->inv_weight))
203 w = scale_load_down(lw->weight);
205 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
207 else if (unlikely(!w))
208 lw->inv_weight = WMULT_CONST;
210 lw->inv_weight = WMULT_CONST / w;
214 * delta_exec * weight / lw.weight
216 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
218 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
219 * we're guaranteed shift stays positive because inv_weight is guaranteed to
220 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
222 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
223 * weight/lw.weight <= 1, and therefore our shift will also be positive.
225 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
227 u64 fact = scale_load_down(weight);
228 int shift = WMULT_SHIFT;
230 __update_inv_weight(lw);
232 if (unlikely(fact >> 32)) {
239 /* hint to use a 32x32->64 mul */
240 fact = (u64)(u32)fact * lw->inv_weight;
247 return mul_u64_u32_shr(delta_exec, fact, shift);
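/*
 * Worked example (illustrative only): a nice-0 entity (weight 1024, ignoring
 * scale_load) on a queue with lw.weight = 2048 gets
 *   lw->inv_weight = WMULT_CONST / 2048   (~2^21)
 *   fact           = 1024 * inv_weight    (~2^31, fits in 64 bits, shift = 32)
 * so for delta_exec = 1000000ns the result is
 *   (1000000 * fact) >> 32  ~=  500000ns,
 * i.e. delta_exec * weight / lw.weight up to fixed-point rounding.
 */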
251 static int active_load_balance_cpu_stop(void *data);
254 const struct sched_class fair_sched_class;
256 /**************************************************************
257 * CFS operations on generic schedulable entities:
260 #ifdef CONFIG_FAIR_GROUP_SCHED
262 /* cpu runqueue to which this cfs_rq is attached */
263 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
268 /* An entity is a task if it doesn't "own" a runqueue */
269 #define entity_is_task(se) (!se->my_q)
271 static inline struct task_struct *task_of(struct sched_entity *se)
273 #ifdef CONFIG_SCHED_DEBUG
274 WARN_ON_ONCE(!entity_is_task(se));
276 return container_of(se, struct task_struct, se);
279 /* Walk up scheduling entities hierarchy */
280 #define for_each_sched_entity(se) \
281 for (; se; se = se->parent)
283 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
288 /* runqueue on which this entity is (to be) queued */
289 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
294 /* runqueue "owned" by this group */
295 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
300 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
302 if (!cfs_rq->on_list) {
303 struct rq *rq = rq_of(cfs_rq);
304 int cpu = cpu_of(rq);
306 * Ensure we either appear before our parent (if already
307 * enqueued) or force our parent to appear after us when it is
308 * enqueued. The fact that we always enqueue bottom-up
309 * reduces this to two cases and a special case for the root
310 * cfs_rq. Furthermore, it also means that we will always reset
311 * tmp_alone_branch either when the branch is connected
312 * to a tree or when we reach the top of the tree.
314 if (cfs_rq->tg->parent &&
315 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
317 * If parent is already on the list, we add the child
318 * just before it. Thanks to the circular nature of
319 * the list, this means putting the child at the tail
320 * of the list that starts at the parent.
322 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
323 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
325 * The branch is now connected to its tree so we can
326 * reset tmp_alone_branch to the beginning of the list.
329 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
330 } else if (!cfs_rq->tg->parent) {
332 * A cfs_rq without a parent should be put
333 * at the tail of the list.
335 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
336 &rq->leaf_cfs_rq_list);
338 * We have reached the top of a tree so we can reset
339 * tmp_alone_branch to the beginning of the list.
341 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
344 * The parent has not already been added so we want to
345 * make sure that it will be put after us.
346 * tmp_alone_branch points to the beginning of the branch
347 * where we will add the parent.
349 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
350 rq->tmp_alone_branch);
352 * Update tmp_alone_branch to point to the new beginning of the branch.
355 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
362 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
364 if (cfs_rq->on_list) {
365 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
370 /* Iterate through all leaf cfs_rq's on a runqueue */
371 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
372 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
374 /* Do the two (enqueued) entities belong to the same group ? */
375 static inline struct cfs_rq *
376 is_same_group(struct sched_entity *se, struct sched_entity *pse)
378 if (se->cfs_rq == pse->cfs_rq)
384 static inline struct sched_entity *parent_entity(struct sched_entity *se)
390 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
392 int se_depth, pse_depth;
395 * preemption test can be made between sibling entities who are in the
396 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
397 * both tasks until we find their ancestors who are siblings of a common parent.
401 /* First walk up until both entities are at same depth */
402 se_depth = (*se)->depth;
403 pse_depth = (*pse)->depth;
405 while (se_depth > pse_depth) {
407 *se = parent_entity(*se);
410 while (pse_depth > se_depth) {
412 *pse = parent_entity(*pse);
415 while (!is_same_group(*se, *pse)) {
416 *se = parent_entity(*se);
417 *pse = parent_entity(*pse);
421 #else /* !CONFIG_FAIR_GROUP_SCHED */
423 static inline struct task_struct *task_of(struct sched_entity *se)
425 return container_of(se, struct task_struct, se);
428 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
430 return container_of(cfs_rq, struct rq, cfs);
433 #define entity_is_task(se) 1
435 #define for_each_sched_entity(se) \
436 for (; se; se = NULL)
438 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
440 return &task_rq(p)->cfs;
443 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
445 struct task_struct *p = task_of(se);
446 struct rq *rq = task_rq(p);
451 /* runqueue "owned" by this group */
452 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
461 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
465 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
466 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
468 static inline struct sched_entity *parent_entity(struct sched_entity *se)
474 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
478 #endif /* CONFIG_FAIR_GROUP_SCHED */
480 static __always_inline
481 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
483 /**************************************************************
484 * Scheduling class tree data structure manipulation methods:
487 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
489 s64 delta = (s64)(vruntime - max_vruntime);
491 max_vruntime = vruntime;
496 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
498 s64 delta = (s64)(vruntime - min_vruntime);
500 min_vruntime = vruntime;
505 static inline int entity_before(struct sched_entity *a,
506 struct sched_entity *b)
508 return (s64)(a->vruntime - b->vruntime) < 0;
511 static void update_min_vruntime(struct cfs_rq *cfs_rq)
513 u64 vruntime = cfs_rq->min_vruntime;
516 vruntime = cfs_rq->curr->vruntime;
518 if (cfs_rq->rb_leftmost) {
519 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
524 vruntime = se->vruntime;
526 vruntime = min_vruntime(vruntime, se->vruntime);
529 /* ensure we never gain time by being placed backwards. */
530 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
533 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
538 * Enqueue an entity into the rb-tree:
540 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
543 struct rb_node *parent = NULL;
544 struct sched_entity *entry;
548 * Find the right place in the rbtree:
552 entry = rb_entry(parent, struct sched_entity, run_node);
554 * We don't care about collisions. Nodes with
555 * the same key stay together.
557 if (entity_before(se, entry)) {
558 link = &parent->rb_left;
560 link = &parent->rb_right;
566 * Maintain a cache of leftmost tree entries (it is frequently used):
570 cfs_rq->rb_leftmost = &se->run_node;
572 rb_link_node(&se->run_node, parent, link);
573 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
576 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
578 if (cfs_rq->rb_leftmost == &se->run_node) {
579 struct rb_node *next_node;
581 next_node = rb_next(&se->run_node);
582 cfs_rq->rb_leftmost = next_node;
585 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
588 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
590 struct rb_node *left = cfs_rq->rb_leftmost;
595 return rb_entry(left, struct sched_entity, run_node);
598 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
600 struct rb_node *next = rb_next(&se->run_node);
605 return rb_entry(next, struct sched_entity, run_node);
608 #ifdef CONFIG_SCHED_DEBUG
609 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
611 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
616 return rb_entry(last, struct sched_entity, run_node);
619 /**************************************************************
620 * Scheduling class statistics methods:
623 int sched_proc_update_handler(struct ctl_table *table, int write,
624 void __user *buffer, size_t *lenp,
627 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
628 unsigned int factor = get_update_sysctl_factor();
633 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
634 sysctl_sched_min_granularity);
636 #define WRT_SYSCTL(name) \
637 (normalized_sysctl_##name = sysctl_##name / (factor))
638 WRT_SYSCTL(sched_min_granularity);
639 WRT_SYSCTL(sched_latency);
640 WRT_SYSCTL(sched_wakeup_granularity);
650 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
652 if (unlikely(se->load.weight != NICE_0_LOAD))
653 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
659 * The idea is to set a period in which each task runs once.
661 * When there are too many tasks (sched_nr_latency) we have to stretch
662 * this period because otherwise the slices get too small.
664 * p = (nr <= nl) ? l : l*nr/nl
666 static u64 __sched_period(unsigned long nr_running)
668 if (unlikely(nr_running > sched_nr_latency))
669 return nr_running * sysctl_sched_min_granularity;
671 return sysctl_sched_latency;
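/*
 * Worked example (illustrative only), using the defaults above
 * (sysctl_sched_latency = 6ms, sysctl_sched_min_granularity = 0.75ms,
 * sched_nr_latency = 8):
 *   5 runnable tasks  -> period = 6ms (every task still runs within the latency)
 *   12 runnable tasks -> period = 12 * 0.75ms = 9ms (stretched so no slice
 *                        drops below the minimum granularity)
 */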
675 * We calculate the wall-time slice from the period by taking a part
676 * proportional to the weight.
680 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
682 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
684 for_each_sched_entity(se) {
685 struct load_weight *load;
686 struct load_weight lw;
688 cfs_rq = cfs_rq_of(se);
689 load = &cfs_rq->load;
691 if (unlikely(!se->on_rq)) {
694 update_load_add(&lw, se->load.weight);
697 slice = __calc_delta(slice, se->load.weight, load);
703 * We calculate the vruntime slice of a to-be-inserted task.
707 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
709 return calc_delta_fair(sched_slice(cfs_rq, se), se);
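/*
 * Worked example (illustrative only, ignoring scale_load): with a 6ms period
 * and three runnable entities of weights 2048, 1024 and 1024 (total 4096),
 * sched_slice() gives 6ms * 2048/4096 = 3ms to the heavy entity and 1.5ms to
 * each of the others. sched_vslice() then converts the heavy entity's 3ms
 * wall-time slice back into virtual time: 3ms * NICE_0_LOAD/2048 = 1.5ms, so
 * every entity advances its vruntime by the same amount per period.
 */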
713 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
714 static unsigned long task_h_load(struct task_struct *p);
717 * We choose a half-life close to 1 scheduling period.
718 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
719 * dependent on this value.
721 #define LOAD_AVG_PERIOD 32
722 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
723 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
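/*
 * Illustrative note on the constants above: the per-period decay factor y is
 * chosen so that y^32 = 1/2, i.e. a contribution loses half its weight after
 * LOAD_AVG_PERIOD (32) periods of 1024us each. Summing the geometric series
 * 1024 * (1 + y + y^2 + ...) with that y saturates near LOAD_AVG_MAX (47742),
 * and it takes about LOAD_AVG_MAX_N (345) fully busy periods to get there.
 */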
725 /* Give a new sched_entity starting runnable values so it is seen as a heavy task until its load stabilizes */
726 void init_entity_runnable_average(struct sched_entity *se)
728 struct sched_avg *sa = &se->avg;
730 sa->last_update_time = 0;
732 * sched_avg's period_contrib should be strictly less than 1024, so
733 * we give it 1023 to make sure it is almost a full period (1024us), and
734 * will definitely be updated (after enqueue).
736 sa->period_contrib = 1023;
738 * Tasks are initialized with full load to be seen as heavy tasks until
739 * they get a chance to stabilize to their real load level.
740 * Group entities are initialized with zero load to reflect the fact that
741 * nothing has been attached to the task group yet.
743 if (entity_is_task(se))
744 sa->load_avg = scale_load_down(se->load.weight);
745 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
747 * In previous Android versions, we used to have:
748 * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
749 * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
750 * However, that functionality has been moved to enqueue.
751 * It is unclear if we should restore this in enqueue.
754 * At this point, util_avg won't be used in select_task_rq_fair anyway
758 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
761 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
762 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
763 static void attach_entity_cfs_rq(struct sched_entity *se);
764 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
767 * With new tasks being created, their initial util_avgs are extrapolated
768 * based on the cfs_rq's current util_avg:
770 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
772 * However, in many cases, the above util_avg does not give a desired
773 * value. Moreover, the sum of the util_avgs may be divergent, such
774 * as when the series is a harmonic series.
776 * To solve this problem, we also cap the util_avg of successive tasks to
777 * only 1/2 of the remaining utilization budget:
779 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
781 * where n denotes the nth task.
783 * For example, a simplest series from the beginning would be like:
785 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
786 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
788 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
789 * if util_avg > util_avg_cap.
791 void post_init_entity_util_avg(struct sched_entity *se)
793 struct cfs_rq *cfs_rq = cfs_rq_of(se);
794 struct sched_avg *sa = &se->avg;
795 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
798 if (cfs_rq->avg.util_avg != 0) {
799 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
800 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
802 if (sa->util_avg > cap)
808 * If we wish to restore tuning via setting initial util,
809 * this is where we should do it.
811 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
814 if (entity_is_task(se)) {
815 struct task_struct *p = task_of(se);
816 if (p->sched_class != &fair_sched_class) {
818 * For !fair tasks do:
820 update_cfs_rq_load_avg(now, cfs_rq, false);
821 attach_entity_load_avg(cfs_rq, se);
822 switched_from_fair(rq, p);
824 * such that the next switched_to_fair() has the expected state.
827 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
832 attach_entity_cfs_rq(se);
835 #else /* !CONFIG_SMP */
836 void init_entity_runnable_average(struct sched_entity *se)
839 void post_init_entity_util_avg(struct sched_entity *se)
842 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
845 #endif /* CONFIG_SMP */
848 * Update the current task's runtime statistics.
850 static void update_curr(struct cfs_rq *cfs_rq)
852 struct sched_entity *curr = cfs_rq->curr;
853 u64 now = rq_clock_task(rq_of(cfs_rq));
859 delta_exec = now - curr->exec_start;
860 if (unlikely((s64)delta_exec <= 0))
863 curr->exec_start = now;
865 schedstat_set(curr->statistics.exec_max,
866 max(delta_exec, curr->statistics.exec_max));
868 curr->sum_exec_runtime += delta_exec;
869 schedstat_add(cfs_rq, exec_clock, delta_exec);
871 curr->vruntime += calc_delta_fair(delta_exec, curr);
872 update_min_vruntime(cfs_rq);
874 if (entity_is_task(curr)) {
875 struct task_struct *curtask = task_of(curr);
877 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
878 cpuacct_charge(curtask, delta_exec);
879 account_group_exec_runtime(curtask, delta_exec);
882 account_cfs_rq_runtime(cfs_rq, delta_exec);
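/*
 * Worked example (illustrative only): if a nice-0 task (weight NICE_0_LOAD)
 * runs for delta_exec = 2ms, calc_delta_fair() leaves the delta untouched and
 * its vruntime advances by 2ms. A task with twice that weight would only
 * advance by 1ms for the same 2ms of CPU time, so heavier tasks fall behind
 * in virtual time more slowly and therefore get picked more often.
 */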
885 static void update_curr_fair(struct rq *rq)
887 update_curr(cfs_rq_of(&rq->curr->se));
890 #ifdef CONFIG_SCHEDSTATS
892 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
894 u64 wait_start = rq_clock(rq_of(cfs_rq));
896 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
897 likely(wait_start > se->statistics.wait_start))
898 wait_start -= se->statistics.wait_start;
900 se->statistics.wait_start = wait_start;
904 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
906 struct task_struct *p;
907 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
909 if (entity_is_task(se)) {
911 if (task_on_rq_migrating(p)) {
913 * Preserve migrating task's wait time so wait_start
914 * time stamp can be adjusted to accumulate wait time
915 * prior to migration.
917 se->statistics.wait_start = delta;
920 trace_sched_stat_wait(p, delta);
923 se->statistics.wait_max = max(se->statistics.wait_max, delta);
924 se->statistics.wait_count++;
925 se->statistics.wait_sum += delta;
926 se->statistics.wait_start = 0;
930 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
935 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
941 * Task is being enqueued - update stats:
943 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
946 * Are we enqueueing a waiting task? (for current tasks
947 * a dequeue/enqueue event is a NOP)
949 if (se != cfs_rq->curr)
950 update_stats_wait_start(cfs_rq, se);
954 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
957 * Mark the end of the wait period if dequeueing a waiting task:
960 if (se != cfs_rq->curr)
961 update_stats_wait_end(cfs_rq, se);
965 * We are picking a new current task - update its stats:
968 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
971 * We are starting a new run period:
973 se->exec_start = rq_clock_task(rq_of(cfs_rq));
976 /**************************************************
977 * Scheduling class queueing methods:
980 #ifdef CONFIG_NUMA_BALANCING
982 * Approximate time to scan a full NUMA task in ms. The task scan period is
983 * calculated based on the task's virtual memory size and
984 * numa_balancing_scan_size.
986 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
987 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
989 /* Portion of address space to scan in MB */
990 unsigned int sysctl_numa_balancing_scan_size = 256;
992 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
993 unsigned int sysctl_numa_balancing_scan_delay = 1000;
995 static unsigned int task_nr_scan_windows(struct task_struct *p)
997 unsigned long rss = 0;
998 unsigned long nr_scan_pages;
1001 * Calculations based on RSS as non-present and empty pages are skipped
1002 * by the PTE scanner, and NUMA hinting faults should be trapped based on resident pages.
1005 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1006 rss = get_mm_rss(p->mm);
1008 rss = nr_scan_pages;
1010 rss = round_up(rss, nr_scan_pages);
1011 return rss / nr_scan_pages;
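/*
 * Worked example (illustrative only, assuming 4KB pages): with the default
 * sysctl_numa_balancing_scan_size of 256MB, nr_scan_pages is 65536. A task
 * with 1GB of RSS (262144 pages) therefore gets 262144 / 65536 = 4 scan
 * windows; a tiny task is rounded up to a single window.
 */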
1014 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1015 #define MAX_SCAN_WINDOW 2560
1017 static unsigned int task_scan_min(struct task_struct *p)
1019 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1020 unsigned int scan, floor;
1021 unsigned int windows = 1;
1023 if (scan_size < MAX_SCAN_WINDOW)
1024 windows = MAX_SCAN_WINDOW / scan_size;
1025 floor = 1000 / windows;
1027 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1028 return max_t(unsigned int, floor, scan);
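/*
 * Worked example (illustrative only, continuing the 1GB-RSS case above with
 * the default 256MB scan size): scan_size (256) < MAX_SCAN_WINDOW (2560), so
 * windows = 10 and floor = 1000/10 = 100ms. The task-size based value is
 * scan_period_min / nr_scan_windows = 1000ms / 4 = 250ms, so the effective
 * minimum scan period is max(100ms, 250ms) = 250ms.
 */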
1031 static unsigned int task_scan_max(struct task_struct *p)
1033 unsigned int smin = task_scan_min(p);
1036 /* Watch for min being lower than max due to floor calculations */
1037 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1038 return max(smin, smax);
1041 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1043 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1044 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1047 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1049 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1050 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1056 spinlock_t lock; /* nr_tasks, tasks */
1060 struct rcu_head rcu;
1061 nodemask_t active_nodes;
1062 unsigned long total_faults;
1064 * Faults_cpu is used to decide whether memory should move
1065 * towards the CPU. As a consequence, these stats are weighted
1066 * more by CPU use than by memory faults.
1068 unsigned long *faults_cpu;
1069 unsigned long faults[0];
1072 /* Shared or private faults. */
1073 #define NR_NUMA_HINT_FAULT_TYPES 2
1075 /* Memory and CPU locality */
1076 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1078 /* Averaged statistics, and temporary buffers. */
1079 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1081 pid_t task_numa_group_id(struct task_struct *p)
1083 return p->numa_group ? p->numa_group->gid : 0;
1087 * The averaged statistics, shared & private, memory & cpu,
1088 * occupy the first half of the array. The second half of the
1089 * array is for current counters, which are averaged into the
1090 * first set by task_numa_placement.
1092 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1094 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1097 static inline unsigned long task_faults(struct task_struct *p, int nid)
1099 if (!p->numa_faults)
1102 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1103 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1106 static inline unsigned long group_faults(struct task_struct *p, int nid)
1111 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1112 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1115 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1117 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1118 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1121 /* Handle placement on systems where not all nodes are directly connected. */
1122 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1123 int maxdist, bool task)
1125 unsigned long score = 0;
1129 * All nodes are directly connected, and the same distance
1130 * from each other. No need for fancy placement algorithms.
1132 if (sched_numa_topology_type == NUMA_DIRECT)
1136 * This code is called for each node, introducing N^2 complexity,
1137 * which should be ok given the number of nodes rarely exceeds 8.
1139 for_each_online_node(node) {
1140 unsigned long faults;
1141 int dist = node_distance(nid, node);
1144 * The furthest away nodes in the system are not interesting
1145 * for placement; nid was already counted.
1147 if (dist == sched_max_numa_distance || node == nid)
1151 * On systems with a backplane NUMA topology, compare groups
1152 * of nodes, and move tasks towards the group with the most
1153 * memory accesses. When comparing two nodes at distance
1154 * "hoplimit", only nodes closer by than "hoplimit" are part
1155 * of each group. Skip other nodes.
1157 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1161 /* Add up the faults from nearby nodes. */
1163 faults = task_faults(p, node);
1165 faults = group_faults(p, node);
1168 * On systems with a glueless mesh NUMA topology, there are
1169 * no fixed "groups of nodes". Instead, nodes that are not
1170 * directly connected bounce traffic through intermediate
1171 * nodes; a numa_group can occupy any set of nodes.
1172 * The further away a node is, the less the faults count.
1173 * This seems to result in good task placement.
1175 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1176 faults *= (sched_max_numa_distance - dist);
1177 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1187 * These return the fraction of accesses done by a particular task, or
1188 * task group, on a particular numa node. The group weight is given a
1189 * larger multiplier, in order to group tasks together that are almost
1190 * evenly spread out between numa nodes.
1192 static inline unsigned long task_weight(struct task_struct *p, int nid,
1195 unsigned long faults, total_faults;
1197 if (!p->numa_faults)
1200 total_faults = p->total_numa_faults;
1205 faults = task_faults(p, nid);
1206 faults += score_nearby_nodes(p, nid, dist, true);
1208 return 1000 * faults / total_faults;
1211 static inline unsigned long group_weight(struct task_struct *p, int nid,
1214 unsigned long faults, total_faults;
1219 total_faults = p->numa_group->total_faults;
1224 faults = group_faults(p, nid);
1225 faults += score_nearby_nodes(p, nid, dist, false);
1227 return 1000 * faults / total_faults;
1230 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1231 int src_nid, int dst_cpu)
1233 struct numa_group *ng = p->numa_group;
1234 int dst_nid = cpu_to_node(dst_cpu);
1235 int last_cpupid, this_cpupid;
1237 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1240 * Multi-stage node selection is used in conjunction with a periodic
1241 * migration fault to build a temporal task<->page relation. By using
1242 * a two-stage filter we remove short/unlikely relations.
1244 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1245 * a task's usage of a particular page (n_p) per total usage of this
1246 * page (n_t) (in a given time-span) to a probability.
1248 * Our periodic faults will sample this probability and getting the
1249 * same result twice in a row, given these samples are fully
1250 * independent, is then given by P(n)^2, provided our sample period
1251 * is sufficiently short compared to the usage pattern.
1253 * This quadratic squishes small probabilities, making it less likely we
1254 * act on an unlikely task<->page relation.
1256 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1257 if (!cpupid_pid_unset(last_cpupid) &&
1258 cpupid_to_nid(last_cpupid) != dst_nid)
1261 /* Always allow migrate on private faults */
1262 if (cpupid_match_pid(p, last_cpupid))
1265 /* A shared fault, but p->numa_group has not been set up yet. */
1270 * Do not migrate if the destination is not a node that
1271 * is actively used by this numa group.
1273 if (!node_isset(dst_nid, ng->active_nodes))
1277 * Source is a node that is not actively used by this
1278 * numa group, while the destination is. Migrate.
1280 if (!node_isset(src_nid, ng->active_nodes))
1284 * Both source and destination are nodes in active
1285 * use by this numa group. Maximize memory bandwidth
1286 * by migrating from more heavily used groups, to less
1287 * heavily used ones, spreading the load around.
1288 * Use a 1/4 hysteresis to avoid spurious page movement.
1290 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
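/*
 * Worked example of the 1/4 hysteresis above (illustrative only): if the
 * numa_group has 400 faults on the source node and 280 on the destination,
 * 280 < 400 * 3/4 = 300 holds and the page may migrate towards the less
 * heavily used destination. With 320 faults on the destination the test
 * fails, so near-equal nodes do not ping-pong pages back and forth.
 */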
1293 static unsigned long weighted_cpuload(const int cpu);
1294 static unsigned long source_load(int cpu, int type);
1295 static unsigned long target_load(int cpu, int type);
1296 static unsigned long capacity_of(int cpu);
1297 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1299 /* Cached statistics for all CPUs within a node */
1301 unsigned long nr_running;
1304 /* Total compute capacity of CPUs on a node */
1305 unsigned long compute_capacity;
1307 /* Approximate capacity in terms of runnable tasks on a node */
1308 unsigned long task_capacity;
1309 int has_free_capacity;
1313 * XXX borrowed from update_sg_lb_stats
1315 static void update_numa_stats(struct numa_stats *ns, int nid)
1317 int smt, cpu, cpus = 0;
1318 unsigned long capacity;
1320 memset(ns, 0, sizeof(*ns));
1321 for_each_cpu(cpu, cpumask_of_node(nid)) {
1322 struct rq *rq = cpu_rq(cpu);
1324 ns->nr_running += rq->nr_running;
1325 ns->load += weighted_cpuload(cpu);
1326 ns->compute_capacity += capacity_of(cpu);
1332 * If we raced with hotplug and there are no CPUs left in our mask
1333 * the @ns structure is NULL'ed and task_numa_compare() will
1334 * not find this node attractive.
1336 * We'll either bail at !has_free_capacity, or we'll detect a huge
1337 * imbalance and bail there.
1342 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1343 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1344 capacity = cpus / smt; /* cores */
1346 ns->task_capacity = min_t(unsigned, capacity,
1347 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1348 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
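/*
 * Worked example (illustrative only, capacities assumed): a node with 8
 * logical CPUs whose SMT-scaled capacities sum to compute_capacity = 4712
 * gives smt = DIV_ROUND_UP(1024 * 8, 4712) = 2, hence capacity = 8 / 2 = 4
 * cores. task_capacity = min(4, DIV_ROUND_CLOSEST(4712, 1024) = 5) = 4, so
 * the node reports free capacity while fewer than 4 tasks are running.
 */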
1351 struct task_numa_env {
1352 struct task_struct *p;
1354 int src_cpu, src_nid;
1355 int dst_cpu, dst_nid;
1357 struct numa_stats src_stats, dst_stats;
1362 struct task_struct *best_task;
1367 static void task_numa_assign(struct task_numa_env *env,
1368 struct task_struct *p, long imp)
1371 put_task_struct(env->best_task);
1374 env->best_imp = imp;
1375 env->best_cpu = env->dst_cpu;
1378 static bool load_too_imbalanced(long src_load, long dst_load,
1379 struct task_numa_env *env)
1382 long orig_src_load, orig_dst_load;
1383 long src_capacity, dst_capacity;
1386 * The load is corrected for the CPU capacity available on each node.
1389 * src_load / src_capacity    vs    dst_load / dst_capacity
1392 src_capacity = env->src_stats.compute_capacity;
1393 dst_capacity = env->dst_stats.compute_capacity;
1395 /* We care about the slope of the imbalance, not the direction. */
1396 if (dst_load < src_load)
1397 swap(dst_load, src_load);
1399 /* Is the difference below the threshold? */
1400 imb = dst_load * src_capacity * 100 -
1401 src_load * dst_capacity * env->imbalance_pct;
1406 * The imbalance is above the allowed threshold.
1407 * Compare it with the old imbalance.
1409 orig_src_load = env->src_stats.load;
1410 orig_dst_load = env->dst_stats.load;
1412 if (orig_dst_load < orig_src_load)
1413 swap(orig_dst_load, orig_src_load);
1415 old_imb = orig_dst_load * src_capacity * 100 -
1416 orig_src_load * dst_capacity * env->imbalance_pct;
1418 /* Would this change make things worse? */
1419 return (imb > old_imb);
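/*
 * Worked example (illustrative only, equal node capacities assumed): with
 * imbalance_pct = 112, src_load = 1000 and dst_load = 1100 we get
 * imb = 1100 * 100 - 1000 * 112 = -2000 <= 0, so the move is allowed. With
 * dst_load = 1200, imb = 8000 > 0 and the move only proceeds if the
 * pre-existing imbalance (old_imb) was already at least as bad.
 */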
1423 * This checks if the overall compute and NUMA accesses of the system would
1424 * be improved if the source task was migrated to the target dst_cpu, taking
1425 * into account that it might be best if the task running on dst_cpu is
1426 * exchanged with the source task.
1428 static void task_numa_compare(struct task_numa_env *env,
1429 long taskimp, long groupimp)
1431 struct rq *src_rq = cpu_rq(env->src_cpu);
1432 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1433 struct task_struct *cur;
1434 long src_load, dst_load;
1436 long imp = env->p->numa_group ? groupimp : taskimp;
1438 int dist = env->dist;
1439 bool assigned = false;
1443 raw_spin_lock_irq(&dst_rq->lock);
1446 * No need to move the exiting task or idle task.
1448 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1452 * The task_struct must be protected here to protect the
1453 * p->numa_faults access in the task_weight since the
1454 * numa_faults could already be freed in the following path:
1455 * finish_task_switch()
1456 * --> put_task_struct()
1457 * --> __put_task_struct()
1458 * --> task_numa_free()
1460 get_task_struct(cur);
1463 raw_spin_unlock_irq(&dst_rq->lock);
1466 * Because we have preemption enabled we can get migrated around and
1467 * end up trying to select ourselves (current == env->p) as a swap candidate.
1473 * "imp" is the fault differential for the source task between the
1474 * source and destination node. Calculate the total differential for
1475 * the source task and potential destination task. The more negative
1476 * the value is, the more remote accesses would be expected to
1477 * be incurred if the tasks were swapped.
1480 /* Skip this swap candidate if it cannot be moved to the source cpu */
1481 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1485 * If dst and source tasks are in the same NUMA group, or not
1486 * in any group then look only at task weights.
1488 if (cur->numa_group == env->p->numa_group) {
1489 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1490 task_weight(cur, env->dst_nid, dist);
1492 * Add some hysteresis to prevent swapping the
1493 * tasks within a group over tiny differences.
1495 if (cur->numa_group)
1499 * Compare the group weights. If a task is all by
1500 * itself (not part of a group), use the task weight instead.
1503 if (cur->numa_group)
1504 imp += group_weight(cur, env->src_nid, dist) -
1505 group_weight(cur, env->dst_nid, dist);
1507 imp += task_weight(cur, env->src_nid, dist) -
1508 task_weight(cur, env->dst_nid, dist);
1512 if (imp <= env->best_imp && moveimp <= env->best_imp)
1516 /* Is there capacity at our destination? */
1517 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1518 !env->dst_stats.has_free_capacity)
1524 /* Balance doesn't matter much if we're running a task per cpu */
1525 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1526 dst_rq->nr_running == 1)
1530 * In the overloaded case, try and keep the load balanced.
1533 load = task_h_load(env->p);
1534 dst_load = env->dst_stats.load + load;
1535 src_load = env->src_stats.load - load;
1537 if (moveimp > imp && moveimp > env->best_imp) {
1539 * If the improvement from just moving env->p (without swapping) is
1540 * better than swapping tasks around, check if a move is
1541 * possible. Store a slightly smaller score than moveimp,
1542 * so an actually idle CPU will win.
1544 if (!load_too_imbalanced(src_load, dst_load, env)) {
1546 put_task_struct(cur);
1552 if (imp <= env->best_imp)
1556 load = task_h_load(cur);
1561 if (load_too_imbalanced(src_load, dst_load, env))
1565 * One idle CPU per node is evaluated for a task numa move.
1566 * Call select_idle_sibling to maybe find a better one.
1569 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1574 task_numa_assign(env, cur, imp);
1578 * The dst_rq->curr isn't assigned. The protection for task_struct is
1581 if (cur && !assigned)
1582 put_task_struct(cur);
1585 static void task_numa_find_cpu(struct task_numa_env *env,
1586 long taskimp, long groupimp)
1590 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1591 /* Skip this CPU if the source task cannot migrate */
1592 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1596 task_numa_compare(env, taskimp, groupimp);
1600 /* Only move tasks to a NUMA node less busy than the current node. */
1601 static bool numa_has_capacity(struct task_numa_env *env)
1603 struct numa_stats *src = &env->src_stats;
1604 struct numa_stats *dst = &env->dst_stats;
1606 if (src->has_free_capacity && !dst->has_free_capacity)
1610 * Only consider a task move if the source has a higher load
1611 * than the destination, corrected for CPU capacity on each node.
1613 * src->load dst->load
1614 * --------------------- vs ---------------------
1615 * src->compute_capacity dst->compute_capacity
1617 if (src->load * dst->compute_capacity * env->imbalance_pct >
1619 dst->load * src->compute_capacity * 100)
1625 static int task_numa_migrate(struct task_struct *p)
1627 struct task_numa_env env = {
1630 .src_cpu = task_cpu(p),
1631 .src_nid = task_node(p),
1633 .imbalance_pct = 112,
1639 struct sched_domain *sd;
1640 unsigned long taskweight, groupweight;
1642 long taskimp, groupimp;
1645 * Pick the lowest SD_NUMA domain, as that would have the smallest
1646 * imbalance and would be the first to start moving tasks about.
1648 * And we want to avoid any moving of tasks about, as that would create
1649 * random movement of tasks -- countering the numa conditions we're trying to satisfy.
1653 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1655 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1659 * Cpusets can break the scheduler domain tree into smaller
1660 * balance domains, some of which do not cross NUMA boundaries.
1661 * Tasks that are "trapped" in such domains cannot be migrated
1662 * elsewhere, so there is no point in (re)trying.
1664 if (unlikely(!sd)) {
1665 p->numa_preferred_nid = task_node(p);
1669 env.dst_nid = p->numa_preferred_nid;
1670 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1671 taskweight = task_weight(p, env.src_nid, dist);
1672 groupweight = group_weight(p, env.src_nid, dist);
1673 update_numa_stats(&env.src_stats, env.src_nid);
1674 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1675 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1676 update_numa_stats(&env.dst_stats, env.dst_nid);
1678 /* Try to find a spot on the preferred nid. */
1679 if (numa_has_capacity(&env))
1680 task_numa_find_cpu(&env, taskimp, groupimp);
1683 * Look at other nodes in these cases:
1684 * - there is no space available on the preferred_nid
1685 * - the task is part of a numa_group that is interleaved across
1686 * multiple NUMA nodes; in order to better consolidate the group,
1687 * we need to check other locations.
1689 if (env.best_cpu == -1 || (p->numa_group &&
1690 nodes_weight(p->numa_group->active_nodes) > 1)) {
1691 for_each_online_node(nid) {
1692 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1695 dist = node_distance(env.src_nid, env.dst_nid);
1696 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1698 taskweight = task_weight(p, env.src_nid, dist);
1699 groupweight = group_weight(p, env.src_nid, dist);
1702 /* Only consider nodes where both task and groups benefit */
1703 taskimp = task_weight(p, nid, dist) - taskweight;
1704 groupimp = group_weight(p, nid, dist) - groupweight;
1705 if (taskimp < 0 && groupimp < 0)
1710 update_numa_stats(&env.dst_stats, env.dst_nid);
1711 if (numa_has_capacity(&env))
1712 task_numa_find_cpu(&env, taskimp, groupimp);
1717 * If the task is part of a workload that spans multiple NUMA nodes,
1718 * and is migrating into one of the workload's active nodes, remember
1719 * this node as the task's preferred numa node, so the workload can settle down.
1721 * A task that migrated to a second choice node will be better off
1722 * trying for a better one later. Do not set the preferred node here.
1724 if (p->numa_group) {
1725 if (env.best_cpu == -1)
1730 if (node_isset(nid, p->numa_group->active_nodes))
1731 sched_setnuma(p, env.dst_nid);
1734 /* No better CPU than the current one was found. */
1735 if (env.best_cpu == -1)
1739 * Reset the scan period if the task is being rescheduled on an
1740 * alternative node to recheck if the task is now properly placed.
1742 p->numa_scan_period = task_scan_min(p);
1744 if (env.best_task == NULL) {
1745 ret = migrate_task_to(p, env.best_cpu);
1747 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1751 ret = migrate_swap(p, env.best_task);
1753 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1754 put_task_struct(env.best_task);
1758 /* Attempt to migrate a task to a CPU on the preferred node. */
1759 static void numa_migrate_preferred(struct task_struct *p)
1761 unsigned long interval = HZ;
1763 /* This task has no NUMA fault statistics yet */
1764 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1767 /* Periodically retry migrating the task to the preferred node */
1768 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1769 p->numa_migrate_retry = jiffies + interval;
1771 /* Success if task is already running on preferred CPU */
1772 if (task_node(p) == p->numa_preferred_nid)
1775 /* Otherwise, try migrate to a CPU on the preferred node */
1776 task_numa_migrate(p);
1780 * Find the nodes on which the workload is actively running. We do this by
1781 * tracking the nodes from which NUMA hinting faults are triggered. This can
1782 * be different from the set of nodes where the workload's memory is currently located.
1785 * The bitmask is used to make smarter decisions on when to do NUMA page
1786 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1787 * are added when they cause over 6/16 of the maximum number of faults, but
1788 * only removed when they drop below 3/16.
1790 static void update_numa_active_node_mask(struct numa_group *numa_group)
1792 unsigned long faults, max_faults = 0;
1795 for_each_online_node(nid) {
1796 faults = group_faults_cpu(numa_group, nid);
1797 if (faults > max_faults)
1798 max_faults = faults;
1801 for_each_online_node(nid) {
1802 faults = group_faults_cpu(numa_group, nid);
1803 if (!node_isset(nid, numa_group->active_nodes)) {
1804 if (faults > max_faults * 6 / 16)
1805 node_set(nid, numa_group->active_nodes);
1806 } else if (faults < max_faults * 3 / 16)
1807 node_clear(nid, numa_group->active_nodes);
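/*
 * Worked example of the 6/16 and 3/16 thresholds above (illustrative only):
 * if the busiest node shows max_faults = 1600, a node joins active_nodes
 * once its own faults exceed 600 (6/16 of the maximum) and is only dropped
 * again when they fall below 300 (3/16), so nodes hovering around the
 * threshold do not flap in and out of the mask.
 */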
1812 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1813 * increments. The more local the fault statistics are, the higher the scan
1814 * period will be for the next scan window. If local/(local+remote) ratio is
1815 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1816 * the scan period will decrease. Aim for 70% local accesses.
1818 #define NUMA_PERIOD_SLOTS 10
1819 #define NUMA_PERIOD_THRESHOLD 7
1822 * Increase the scan period (slow down scanning) if the majority of
1823 * our memory is already on our local node, or if the majority of
1824 * the page accesses are shared with other processes.
1825 * Otherwise, decrease the scan period.
1827 static void update_task_scan_period(struct task_struct *p,
1828 unsigned long shared, unsigned long private)
1830 unsigned int period_slot;
1834 unsigned long remote = p->numa_faults_locality[0];
1835 unsigned long local = p->numa_faults_locality[1];
1838 * If there were no recorded hinting faults then either the task is
1839 * completely idle or all activity is in areas that are not of interest
1840 * to automatic numa balancing. Related to that, if there were failed
1841 * migrations then it implies we are migrating too quickly or the local
1842 * node is overloaded. In either case, scan slower.
1844 if (local + shared == 0 || p->numa_faults_locality[2]) {
1845 p->numa_scan_period = min(p->numa_scan_period_max,
1846 p->numa_scan_period << 1);
1848 p->mm->numa_next_scan = jiffies +
1849 msecs_to_jiffies(p->numa_scan_period);
1855 * Prepare to scale scan period relative to the current period.
1856 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1857 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1858 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1860 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1861 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1862 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1863 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1866 diff = slot * period_slot;
1868 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1871 * Scale scan rate increases based on sharing. There is an
1872 * inverse relationship between the degree of sharing and
1873 * the adjustment made to the scanning period. Broadly
1874 * speaking the intent is that there is little point
1875 * scanning faster if shared accesses dominate as it may
1876 * simply bounce migrations uselessly
1878 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1879 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1882 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1883 task_scan_min(p), task_scan_max(p));
1884 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
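/*
 * Worked example (illustrative only): assume numa_scan_period = 1000ms, so
 * period_slot = 100ms. With local = 900 and remote = 100 the ratio is 9,
 * which is >= NUMA_PERIOD_THRESHOLD (7), giving slot = 2 and diff = +200ms;
 * if the faults are also overwhelmingly private, the sharing scale leaves
 * diff unchanged and the scan period grows to 1200ms (scan slower), subject
 * to the min/max clamp. A ratio of 5 would instead give
 * diff = -(7 - 5) * 100ms = -200ms (scan faster).
 */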
1888 * Get the fraction of time the task has been running since the last
1889 * NUMA placement cycle. The scheduler keeps similar statistics, but
1890 * decays those on a 32ms period, which is orders of magnitude off
1891 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1892 * stats only if the task is so new there are no NUMA statistics yet.
1894 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1896 u64 runtime, delta, now;
1897 /* Use the start of this time slice to avoid calculations. */
1898 now = p->se.exec_start;
1899 runtime = p->se.sum_exec_runtime;
1901 if (p->last_task_numa_placement) {
1902 delta = runtime - p->last_sum_exec_runtime;
1903 *period = now - p->last_task_numa_placement;
1905 delta = p->se.avg.load_sum / p->se.load.weight;
1906 *period = LOAD_AVG_MAX;
1909 p->last_sum_exec_runtime = runtime;
1910 p->last_task_numa_placement = now;
1916 * Determine the preferred nid for a task in a numa_group. This needs to
1917 * be done in a way that produces consistent results with group_weight,
1918 * otherwise workloads might not converge.
1920 static int preferred_group_nid(struct task_struct *p, int nid)
1925 /* Direct connections between all NUMA nodes. */
1926 if (sched_numa_topology_type == NUMA_DIRECT)
1930 * On a system with glueless mesh NUMA topology, group_weight
1931 * scores nodes according to the number of NUMA hinting faults on
1932 * both the node itself, and on nearby nodes.
1934 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1935 unsigned long score, max_score = 0;
1936 int node, max_node = nid;
1938 dist = sched_max_numa_distance;
1940 for_each_online_node(node) {
1941 score = group_weight(p, node, dist);
1942 if (score > max_score) {
1951 * Finding the preferred nid in a system with NUMA backplane
1952 * interconnect topology is more involved. The goal is to locate
1953 * tasks from numa_groups near each other in the system, and
1954 * untangle workloads from different sides of the system. This requires
1955 * searching down the hierarchy of node groups, recursively searching
1956 * inside the highest scoring group of nodes. The nodemask tricks
1957 * keep the complexity of the search down.
1959 nodes = node_online_map;
1960 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1961 unsigned long max_faults = 0;
1962 nodemask_t max_group = NODE_MASK_NONE;
1965 /* Are there nodes at this distance from each other? */
1966 if (!find_numa_distance(dist))
1969 for_each_node_mask(a, nodes) {
1970 unsigned long faults = 0;
1971 nodemask_t this_group;
1972 nodes_clear(this_group);
1974 /* Sum group's NUMA faults; includes a==b case. */
1975 for_each_node_mask(b, nodes) {
1976 if (node_distance(a, b) < dist) {
1977 faults += group_faults(p, b);
1978 node_set(b, this_group);
1979 node_clear(b, nodes);
1983 /* Remember the top group. */
1984 if (faults > max_faults) {
1985 max_faults = faults;
1986 max_group = this_group;
1988 * subtle: at the smallest distance there is
1989 * just one node left in each "group", the
1990 * winner is the preferred nid.
1995 /* Next round, evaluate the nodes within max_group. */
2003 static void task_numa_placement(struct task_struct *p)
2005 int seq, nid, max_nid = -1, max_group_nid = -1;
2006 unsigned long max_faults = 0, max_group_faults = 0;
2007 unsigned long fault_types[2] = { 0, 0 };
2008 unsigned long total_faults;
2009 u64 runtime, period;
2010 spinlock_t *group_lock = NULL;
2013 * The p->mm->numa_scan_seq field gets updated without
2014 * exclusive access. Use READ_ONCE() here to ensure
2015 * that the field is read in a single access:
2017 seq = READ_ONCE(p->mm->numa_scan_seq);
2018 if (p->numa_scan_seq == seq)
2020 p->numa_scan_seq = seq;
2021 p->numa_scan_period_max = task_scan_max(p);
2023 total_faults = p->numa_faults_locality[0] +
2024 p->numa_faults_locality[1];
2025 runtime = numa_get_avg_runtime(p, &period);
2027 /* If the task is part of a group prevent parallel updates to group stats */
2028 if (p->numa_group) {
2029 group_lock = &p->numa_group->lock;
2030 spin_lock_irq(group_lock);
2033 /* Find the node with the highest number of faults */
2034 for_each_online_node(nid) {
2035 /* Keep track of the offsets in numa_faults array */
2036 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2037 unsigned long faults = 0, group_faults = 0;
2040 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2041 long diff, f_diff, f_weight;
2043 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2044 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2045 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2046 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2048 /* Decay existing window, copy faults since last scan */
2049 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2050 fault_types[priv] += p->numa_faults[membuf_idx];
2051 p->numa_faults[membuf_idx] = 0;
2054 * Normalize the faults_from, so all tasks in a group
2055 * count according to CPU use, instead of by the raw
2056 * number of faults. Tasks with little runtime have
2057 * little overall impact on throughput, and thus their
2058 * faults are less important.
2060 f_weight = div64_u64(runtime << 16, period + 1);
2061 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2063 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2064 p->numa_faults[cpubuf_idx] = 0;
2066 p->numa_faults[mem_idx] += diff;
2067 p->numa_faults[cpu_idx] += f_diff;
2068 faults += p->numa_faults[mem_idx];
2069 p->total_numa_faults += diff;
2070 if (p->numa_group) {
2072 * safe because we can only change our own group
2074 * mem_idx represents the offset for a given
2075 * nid and priv in a specific region because it
2076 * is at the beginning of the numa_faults array.
2078 p->numa_group->faults[mem_idx] += diff;
2079 p->numa_group->faults_cpu[mem_idx] += f_diff;
2080 p->numa_group->total_faults += diff;
2081 group_faults += p->numa_group->faults[mem_idx];
2085 if (faults > max_faults) {
2086 max_faults = faults;
2090 if (group_faults > max_group_faults) {
2091 max_group_faults = group_faults;
2092 max_group_nid = nid;
2096 update_task_scan_period(p, fault_types[0], fault_types[1]);
2098 if (p->numa_group) {
2099 update_numa_active_node_mask(p->numa_group);
2100 spin_unlock_irq(group_lock);
2101 max_nid = preferred_group_nid(p, max_group_nid);
2105 /* Set the new preferred node */
2106 if (max_nid != p->numa_preferred_nid)
2107 sched_setnuma(p, max_nid);
2109 if (task_node(p) != p->numa_preferred_nid)
2110 numa_migrate_preferred(p);
2114 static inline int get_numa_group(struct numa_group *grp)
2116 return atomic_inc_not_zero(&grp->refcount);
2119 static inline void put_numa_group(struct numa_group *grp)
2121 if (atomic_dec_and_test(&grp->refcount))
2122 kfree_rcu(grp, rcu);
2125 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2128 struct numa_group *grp, *my_grp;
2129 struct task_struct *tsk;
2131 int cpu = cpupid_to_cpu(cpupid);
2134 if (unlikely(!p->numa_group)) {
2135 unsigned int size = sizeof(struct numa_group) +
2136 4*nr_node_ids*sizeof(unsigned long);
2138 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2142 atomic_set(&grp->refcount, 1);
2143 spin_lock_init(&grp->lock);
2145 /* Second half of the array tracks nids where faults happen */
2146 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2149 node_set(task_node(current), grp->active_nodes);
2151 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2152 grp->faults[i] = p->numa_faults[i];
2154 grp->total_faults = p->total_numa_faults;
2157 rcu_assign_pointer(p->numa_group, grp);
2161 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2163 if (!cpupid_match_pid(tsk, cpupid))
2166 grp = rcu_dereference(tsk->numa_group);
2170 my_grp = p->numa_group;
2175 * Only join the other group if it's bigger; if we're the bigger group,
2176 * the other task will join us.
2178 if (my_grp->nr_tasks > grp->nr_tasks)
2182 * Tie-break on the grp address.
2184 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2187 /* Always join threads in the same process. */
2188 if (tsk->mm == current->mm)
2191 /* Simple filter to avoid false positives due to PID collisions */
2192 if (flags & TNF_SHARED)
2195 /* Update priv based on whether false sharing was detected */
2198 if (join && !get_numa_group(grp))
2206 BUG_ON(irqs_disabled());
2207 double_lock_irq(&my_grp->lock, &grp->lock);
2209 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2210 my_grp->faults[i] -= p->numa_faults[i];
2211 grp->faults[i] += p->numa_faults[i];
2213 my_grp->total_faults -= p->total_numa_faults;
2214 grp->total_faults += p->total_numa_faults;
2219 spin_unlock(&my_grp->lock);
2220 spin_unlock_irq(&grp->lock);
2222 rcu_assign_pointer(p->numa_group, grp);
2224 put_numa_group(my_grp);
2232 void task_numa_free(struct task_struct *p)
2234 struct numa_group *grp = p->numa_group;
2235 void *numa_faults = p->numa_faults;
2236 unsigned long flags;
2240 spin_lock_irqsave(&grp->lock, flags);
2241 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2242 grp->faults[i] -= p->numa_faults[i];
2243 grp->total_faults -= p->total_numa_faults;
2246 spin_unlock_irqrestore(&grp->lock, flags);
2247 RCU_INIT_POINTER(p->numa_group, NULL);
2248 put_numa_group(grp);
2251 p->numa_faults = NULL;
2256 * Got a PROT_NONE fault for a page on @node.
2258 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2260 struct task_struct *p = current;
2261 bool migrated = flags & TNF_MIGRATED;
2262 int cpu_node = task_node(current);
2263 int local = !!(flags & TNF_FAULT_LOCAL);
2266 if (!static_branch_likely(&sched_numa_balancing))
2269 /* for example, ksmd faulting in a user's mm */
2273 /* Allocate buffer to track faults on a per-node basis */
2274 if (unlikely(!p->numa_faults)) {
2275 int size = sizeof(*p->numa_faults) *
2276 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2278 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2279 if (!p->numa_faults)
2282 p->total_numa_faults = 0;
2283 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2287 * First accesses are treated as private, otherwise consider accesses
2288 * to be private if the accessing pid has not changed
2290 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2293 priv = cpupid_match_pid(p, last_cpupid);
2294 if (!priv && !(flags & TNF_NO_GROUP))
2295 task_numa_group(p, last_cpupid, flags, &priv);
2299 * If a workload spans multiple NUMA nodes, a shared fault that
2300 * occurs wholly within the set of nodes that the workload is
2301 * actively using should be counted as local. This allows the
2302 * scan rate to slow down when a workload has settled down.
2304 if (!priv && !local && p->numa_group &&
2305 node_isset(cpu_node, p->numa_group->active_nodes) &&
2306 node_isset(mem_node, p->numa_group->active_nodes))
2309 task_numa_placement(p);
2312 * Retry task to preferred node migration periodically, in case it
2313 * previously failed, or the scheduler moved us.
2315 if (time_after(jiffies, p->numa_migrate_retry))
2316 numa_migrate_preferred(p);
2319 p->numa_pages_migrated += pages;
2320 if (flags & TNF_MIGRATE_FAIL)
2321 p->numa_faults_locality[2] += pages;
2323 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2324 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2325 p->numa_faults_locality[local] += pages;
2328 static void reset_ptenuma_scan(struct task_struct *p)
2331 * We only did a read acquisition of the mmap sem, so
2332 * p->mm->numa_scan_seq is written to without exclusive access
2333 * and the update is not guaranteed to be atomic. That's not
2334 * much of an issue though, since this is just used for
2335 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2336 * expensive, to avoid any form of compiler optimizations:
2338 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2339 p->mm->numa_scan_offset = 0;
2343 * The expensive part of numa migration is done from task_work context.
2344 * Triggered from task_tick_numa().
2346 void task_numa_work(struct callback_head *work)
2348 unsigned long migrate, next_scan, now = jiffies;
2349 struct task_struct *p = current;
2350 struct mm_struct *mm = p->mm;
2351 struct vm_area_struct *vma;
2352 unsigned long start, end;
2353 unsigned long nr_pte_updates = 0;
2354 long pages, virtpages;
2356 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2358 work->next = work; /* protect against double add */
2360 * Who cares about NUMA placement when they're dying.
2362 * NOTE: make sure not to dereference p->mm before this check,
2363 * exit_task_work() happens _after_ exit_mm() so we could be called
2364 * without p->mm even though we still had it when we enqueued this work.
2367 if (p->flags & PF_EXITING)
2370 if (!mm->numa_next_scan) {
2371 mm->numa_next_scan = now +
2372 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2376 * Enforce maximal scan/migration frequency..
2378 migrate = mm->numa_next_scan;
2379 if (time_before(now, migrate))
2382 if (p->numa_scan_period == 0) {
2383 p->numa_scan_period_max = task_scan_max(p);
2384 p->numa_scan_period = task_scan_min(p);
2387 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2388 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2392 * Delay this task enough that another task of this mm will likely win
2393 * the next time around.
2395 p->node_stamp += 2 * TICK_NSEC;
2397 start = mm->numa_scan_offset;
2398 pages = sysctl_numa_balancing_scan_size;
2399 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2400 virtpages = pages * 8; /* Scan up to this much virtual space */
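/*
 * E.g., assuming the default 256MB scan size and 4K pages:
 * pages = 256 << 8 = 65536 and virtpages = 524288, i.e. up to 2GB of
 * virtual address space may be walked in one scan pass.
 */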
2405 if (!down_read_trylock(&mm->mmap_sem))
2407 vma = find_vma(mm, start);
2409 reset_ptenuma_scan(p);
2413 for (; vma; vma = vma->vm_next) {
2414 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2415 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2420 * Shared library pages mapped by multiple processes are not
2421 * migrated as it is expected they are cache replicated. Avoid
2422 * hinting faults in read-only file-backed mappings or the vdso
2423 * as migrating the pages will be of marginal benefit.
2426 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2430 * Skip inaccessible VMAs to avoid any confusion between
2431 * PROT_NONE and NUMA hinting ptes
2433 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2437 start = max(start, vma->vm_start);
2438 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2439 end = min(end, vma->vm_end);
2440 nr_pte_updates = change_prot_numa(vma, start, end);
2443 * Try to scan sysctl_numa_balancing_scan_size worth of
2444 * hpages that have at least one present PTE that
2445 * is not already pte-numa. If the VMA contains
2446 * areas that are unused or already full of prot_numa
2447 * PTEs, scan up to virtpages, to skip through those areas faster.
2451 pages -= (end - start) >> PAGE_SHIFT;
2452 virtpages -= (end - start) >> PAGE_SHIFT;
2455 if (pages <= 0 || virtpages <= 0)
2459 } while (end != vma->vm_end);
2464 * It is possible to reach the end of the VMA list but the last few
2465 * VMAs are not guaranteed to be migratable. If they are not, we
2466 * would find the !migratable VMA on the next scan but not reset the
2467 * scanner to the start so check it now.
2470 mm->numa_scan_offset = start;
2472 reset_ptenuma_scan(p);
2473 up_read(&mm->mmap_sem);
2477 * Drive the periodic memory faults..
2479 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2481 struct callback_head *work = &curr->numa_work;
2485 * We don't care about NUMA placement if we don't have memory.
2487 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2491 * Using runtime rather than walltime has the dual advantage that
2492 * we (mostly) drive the selection from busy threads and that the
2493 * task needs to have done some actual work before we bother with NUMA placement.
2496 now = curr->se.sum_exec_runtime;
2497 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2499 if (now > curr->node_stamp + period) {
2500 if (!curr->node_stamp)
2501 curr->numa_scan_period = task_scan_min(curr);
2502 curr->node_stamp += period;
2504 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2505 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2506 task_work_add(curr, work, true);
2511 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2515 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2519 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2522 #endif /* CONFIG_NUMA_BALANCING */
2525 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2527 update_load_add(&cfs_rq->load, se->load.weight);
2528 if (!parent_entity(se))
2529 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2531 if (entity_is_task(se)) {
2532 struct rq *rq = rq_of(cfs_rq);
2534 account_numa_enqueue(rq, task_of(se));
2535 list_add(&se->group_node, &rq->cfs_tasks);
2538 cfs_rq->nr_running++;
2542 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2544 update_load_sub(&cfs_rq->load, se->load.weight);
2545 if (!parent_entity(se))
2546 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2547 if (entity_is_task(se)) {
2548 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2549 list_del_init(&se->group_node);
2551 cfs_rq->nr_running--;
2554 #ifdef CONFIG_FAIR_GROUP_SCHED
2556 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2558 long tg_weight, load, shares;
2561 * This really should be: cfs_rq->avg.load_avg, but instead we use
2562 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2563 * the shares for small weight interactive tasks.
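/*
 * Rough example of the computation below: if this cfs_rq carries half of
 * the group's total weight, shares ends up at about tg->shares / 2,
 * clamped to the [MIN_SHARES, tg->shares] range.
 */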
2565 load = scale_load_down(cfs_rq->load.weight);
2567 tg_weight = atomic_long_read(&tg->load_avg);
2569 /* Ensure tg_weight >= load */
2570 tg_weight -= cfs_rq->tg_load_avg_contrib;
2573 shares = (tg->shares * load);
2575 shares /= tg_weight;
2577 if (shares < MIN_SHARES)
2578 shares = MIN_SHARES;
2579 if (shares > tg->shares)
2580 shares = tg->shares;
2584 # else /* CONFIG_SMP */
2585 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2589 # endif /* CONFIG_SMP */
2591 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2592 unsigned long weight)
2595 /* commit outstanding execution time */
2596 if (cfs_rq->curr == se)
2597 update_curr(cfs_rq);
2598 account_entity_dequeue(cfs_rq, se);
2601 update_load_set(&se->load, weight);
2604 account_entity_enqueue(cfs_rq, se);
2607 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2609 static void update_cfs_shares(struct sched_entity *se)
2611 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2612 struct task_group *tg;
2618 if (throttled_hierarchy(cfs_rq))
2624 if (likely(se->load.weight == tg->shares))
2627 shares = calc_cfs_shares(cfs_rq, tg);
2629 reweight_entity(cfs_rq_of(se), se, shares);
2632 #else /* CONFIG_FAIR_GROUP_SCHED */
2633 static inline void update_cfs_shares(struct sched_entity *se)
2636 #endif /* CONFIG_FAIR_GROUP_SCHED */
2639 u32 sched_get_wake_up_idle(struct task_struct *p)
2641 u32 enabled = p->flags & PF_WAKE_UP_IDLE;
2645 EXPORT_SYMBOL(sched_get_wake_up_idle);
2647 int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
2649 int enable = !!wake_up_idle;
2652 p->flags |= PF_WAKE_UP_IDLE;
2654 p->flags &= ~PF_WAKE_UP_IDLE;
2658 EXPORT_SYMBOL(sched_set_wake_up_idle);
2660 static const u32 runnable_avg_yN_inv[] = {
2661 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2662 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2663 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2664 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2665 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2666 0x85aac367, 0x82cd8698,
2670 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2671 * over-estimates when re-combining.
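/*
 * For reference: y = 0.5^(1/32) ~= 0.97857, so runnable_avg_yN_inv[1]
 * (0xfa83b2da) is approximately y * 2^32 and runnable_avg_yN_sum[1]
 * (1002) is approximately 1024 * y.
 */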
2673 static const u32 runnable_avg_yN_sum[] = {
2674 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2675 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2676 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2681 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2683 static __always_inline u64 decay_load(u64 val, u64 n)
2685 unsigned int local_n;
2689 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2692 /* after bounds checking we can collapse to 32-bit */
2696 * As y^PERIOD = 1/2, we can combine
2697 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2698 * With a look-up table which covers y^n (n<PERIOD)
2700 * To achieve constant time decay_load.
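/*
 * Worked example with LOAD_AVG_PERIOD = 32: for n = 100, val is shifted
 * right by 100/32 = 3 (halved three times) and then multiplied by
 * runnable_avg_yN_inv[100 % 32 = 4] via mul_u64_u32_shr().
 */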
2702 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2703 val >>= local_n / LOAD_AVG_PERIOD;
2704 local_n %= LOAD_AVG_PERIOD;
2707 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2712 * For updates fully spanning n periods, the contribution to runnable
2713 * average will be: \Sum 1024*y^n
2715 * We can compute this reasonably efficiently by combining:
2716 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
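/*
 * Sketch of the flow below for n = 40 (with LOAD_AVG_PERIOD = 32): one
 * loop iteration leaves contrib = runnable_avg_yN_sum[32] and n = 8; the
 * result is then decay_load(contrib, 8) + runnable_avg_yN_sum[8].
 */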
2718 static u32 __compute_runnable_contrib(u64 n)
2722 if (likely(n <= LOAD_AVG_PERIOD))
2723 return runnable_avg_yN_sum[n];
2724 else if (unlikely(n >= LOAD_AVG_MAX_N))
2725 return LOAD_AVG_MAX;
2727 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2729 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2730 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2732 n -= LOAD_AVG_PERIOD;
2733 } while (n > LOAD_AVG_PERIOD);
2735 contrib = decay_load(contrib, n);
2736 return contrib + runnable_avg_yN_sum[n];
2739 #ifdef CONFIG_SCHED_HMP
2741 /* CPU selection flag */
2742 #define SBC_FLAG_PREV_CPU 0x1
2743 #define SBC_FLAG_BEST_CAP_CPU 0x2
2744 #define SBC_FLAG_CPU_COST 0x4
2745 #define SBC_FLAG_MIN_COST 0x8
2746 #define SBC_FLAG_IDLE_LEAST_LOADED 0x10
2747 #define SBC_FLAG_IDLE_CSTATE 0x20
2748 #define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
2749 #define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
2750 #define SBC_FLAG_CSTATE_LOAD 0x100
2751 #define SBC_FLAG_BEST_SIBLING 0x200
2752 #define SBC_FLAG_WAKER_CPU 0x400
2753 #define SBC_FLAG_PACK_TASK 0x800
2755 /* Cluster selection flag */
2756 #define SBC_FLAG_COLOC_CLUSTER 0x10000
2757 #define SBC_FLAG_WAKER_CLUSTER 0x20000
2758 #define SBC_FLAG_BACKUP_CLUSTER 0x40000
2759 #define SBC_FLAG_BOOST_CLUSTER 0x80000
2761 struct cpu_select_env {
2762 struct task_struct *p;
2763 struct related_thread_group *rtg;
2766 u8 need_waker_cluster:1;
2768 enum sched_boost_policy boost_policy;
2771 DECLARE_BITMAP(candidate_list, NR_CPUS);
2772 DECLARE_BITMAP(backup_list, NR_CPUS);
2776 u32 sbc_best_cluster_flag;
2777 struct cpumask search_cpus;
2780 struct cluster_cpu_stats {
2781 int best_idle_cpu, least_loaded_cpu;
2782 int best_capacity_cpu, best_cpu, best_sibling_cpu;
2783 int min_cost, best_sibling_cpu_cost;
2784 int best_cpu_wakeup_latency;
2785 u64 min_load, best_load, best_sibling_cpu_load;
2786 s64 highest_spare_capacity;
2790 * Should the task be woken to any available idle cpu?
2792 * Waking tasks to an idle cpu has mixed implications for both performance and
2793 * power. In many cases, the scheduler can't correctly estimate the impact of using
2794 * idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an external kernel
2795 * module to pass a strong hint to the scheduler that the task in question should be
2796 * woken to an idle cpu, generally to improve performance.
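/*
 * Such hints are typically set with sched_set_wake_up_idle() above, which
 * toggles PF_WAKE_UP_IDLE on the task.
 */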
2798 static inline int wake_to_idle(struct task_struct *p)
2800 return (current->flags & PF_WAKE_UP_IDLE) ||
2801 (p->flags & PF_WAKE_UP_IDLE);
2804 static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
2808 total_load = env->task_load + env->cpu_load;
2810 if (total_load > sched_spill_load ||
2811 (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
2817 static int skip_cpu(int cpu, struct cpu_select_env *env)
2819 int tcpu = task_cpu(env->p);
2825 if (is_reserved(cpu))
2828 switch (env->reason) {
2830 skip = !idle_cpu(cpu);
2832 case IRQLOAD_MIGRATION:
2833 /* Purposely fall through */
2835 skip = (cpu == tcpu);
2843 acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
2850 tcpu = task_cpu(env->p);
2851 switch (env->reason) {
2853 return cluster->capacity > cpu_capacity(tcpu);
2855 case DOWN_MIGRATION:
2856 return cluster->capacity < cpu_capacity(tcpu);
2866 skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
2868 if (!test_bit(cluster->id, env->candidate_list))
2871 if (!acceptable_capacity(cluster, env)) {
2872 __clear_bit(cluster->id, env->candidate_list);
2879 static struct sched_cluster *
2880 select_least_power_cluster(struct cpu_select_env *env)
2882 struct sched_cluster *cluster;
2885 int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
2887 env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
2889 if (task_load_will_fit(env->p, env->task_load,
2890 cpu, env->boost_policy)) {
2891 env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
2893 if (env->boost_policy == SCHED_BOOST_NONE)
2894 return env->rtg->preferred_cluster;
2896 for_each_sched_cluster(cluster) {
2897 if (cluster != env->rtg->preferred_cluster) {
2898 __set_bit(cluster->id,
2900 __clear_bit(cluster->id,
2901 env->candidate_list);
2905 return env->rtg->preferred_cluster;
2909 * Since the task load does not fit on the preferred
2910 * cluster anymore, pretend that the task does not
2911 * have any preferred cluster. This allows the waking
2912 * task to get the appropriate CPU it needs as per the
2913 * non co-location placement policy without having to
2914 * wait until the preferred cluster is updated.
2919 for_each_sched_cluster(cluster) {
2920 if (!skip_cluster(cluster, env)) {
2921 int cpu = cluster_first_cpu(cluster);
2923 env->task_load = scale_load_to_cpu(task_load(env->p),
2925 if (task_load_will_fit(env->p, env->task_load, cpu,
2929 __set_bit(cluster->id, env->backup_list);
2930 __clear_bit(cluster->id, env->candidate_list);
2937 static struct sched_cluster *
2938 next_candidate(const unsigned long *list, int start, int end)
2942 cluster_id = find_next_bit(list, end, start - 1 + 1);
2943 if (cluster_id >= end)
2946 return sched_cluster[cluster_id];
2950 update_spare_capacity(struct cluster_cpu_stats *stats,
2951 struct cpu_select_env *env, int cpu, int capacity,
2954 s64 spare_capacity = sched_ravg_window - cpu_load;
2956 if (spare_capacity > 0 &&
2957 (spare_capacity > stats->highest_spare_capacity ||
2958 (spare_capacity == stats->highest_spare_capacity &&
2959 ((!env->need_waker_cluster &&
2960 capacity > cpu_capacity(stats->best_capacity_cpu)) ||
2961 (env->need_waker_cluster &&
2962 cpu_rq(cpu)->nr_running <
2963 cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
2965 * If the sync waker is the only runnable task on the CPU, the CPU's
2966 * cr_avg is 0, so there is a high chance of placing the wakee on the
2967 * waker's CPU, which would likely cause preemption of the waker.
2968 * That can lead to migration of the preempted waker. Place the
2969 * wakee on a genuinely idle CPU when possible by checking
2970 * nr_running to avoid such preemption.
2972 stats->highest_spare_capacity = spare_capacity;
2973 stats->best_capacity_cpu = cpu;
2977 static inline void find_backup_cluster(
2978 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
2980 struct sched_cluster *next = NULL;
2982 struct cpumask search_cpus;
2984 while (!bitmap_empty(env->backup_list, num_clusters)) {
2985 next = next_candidate(env->backup_list, 0, num_clusters);
2986 __clear_bit(next->id, env->backup_list);
2988 cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
2989 for_each_cpu(i, &search_cpus) {
2990 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
2991 sched_irqload(i), power_cost(i, task_load(env->p) +
2992 cpu_cravg_sync(i, env->sync)), 0);
2994 update_spare_capacity(stats, env, i, next->capacity,
2995 cpu_load_sync(i, env->sync));
2997 env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
3001 struct sched_cluster *
3002 next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
3003 struct cluster_cpu_stats *stats)
3005 struct sched_cluster *next = NULL;
3007 __clear_bit(cluster->id, env->candidate_list);
3009 if (env->rtg && preferred_cluster(cluster, env->p))
3013 if (bitmap_empty(env->candidate_list, num_clusters))
3016 next = next_candidate(env->candidate_list, 0, num_clusters);
3018 if (next->min_power_cost > stats->min_cost) {
3019 clear_bit(next->id, env->candidate_list);
3024 if (skip_cluster(next, env))
3029 env->task_load = scale_load_to_cpu(task_load(env->p),
3030 cluster_first_cpu(next));
3034 #ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
3035 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3036 struct cpu_select_env *env, int cpu_cost)
3039 int prev_cpu = env->prev_cpu;
3041 wakeup_latency = cpu_rq(cpu)->wakeup_latency;
3043 if (env->need_idle) {
3044 stats->min_cost = cpu_cost;
3045 if (idle_cpu(cpu)) {
3046 if (wakeup_latency < stats->best_cpu_wakeup_latency ||
3047 (wakeup_latency == stats->best_cpu_wakeup_latency &&
3049 stats->best_idle_cpu = cpu;
3050 stats->best_cpu_wakeup_latency = wakeup_latency;
3053 if (env->cpu_load < stats->min_load ||
3054 (env->cpu_load == stats->min_load &&
3056 stats->least_loaded_cpu = cpu;
3057 stats->min_load = env->cpu_load;
3064 if (cpu_cost < stats->min_cost) {
3065 stats->min_cost = cpu_cost;
3066 stats->best_cpu_wakeup_latency = wakeup_latency;
3067 stats->best_load = env->cpu_load;
3068 stats->best_cpu = cpu;
3069 env->sbc_best_flag = SBC_FLAG_CPU_COST;
3073 /* CPU cost is the same. Start breaking the tie by C-state */
3075 if (wakeup_latency > stats->best_cpu_wakeup_latency)
3078 if (wakeup_latency < stats->best_cpu_wakeup_latency) {
3079 stats->best_cpu_wakeup_latency = wakeup_latency;
3080 stats->best_load = env->cpu_load;
3081 stats->best_cpu = cpu;
3082 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
3086 /* C-state is the same. Use prev CPU to break the tie */
3087 if (cpu == prev_cpu) {
3088 stats->best_cpu = cpu;
3089 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
3093 if (stats->best_cpu != prev_cpu &&
3094 ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
3095 (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
3096 stats->best_load = env->cpu_load;
3097 stats->best_cpu = cpu;
3098 env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
3101 #else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3102 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3103 struct cpu_select_env *env, int cpu_cost)
3105 int prev_cpu = env->prev_cpu;
3107 if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
3108 if (stats->best_sibling_cpu_cost > cpu_cost ||
3109 (stats->best_sibling_cpu_cost == cpu_cost &&
3110 stats->best_sibling_cpu_load > env->cpu_load)) {
3111 stats->best_sibling_cpu_cost = cpu_cost;
3112 stats->best_sibling_cpu_load = env->cpu_load;
3113 stats->best_sibling_cpu = cpu;
3117 if ((cpu_cost < stats->min_cost) ||
3118 ((stats->best_cpu != prev_cpu &&
3119 stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
3120 if (env->need_idle) {
3121 if (idle_cpu(cpu)) {
3122 stats->min_cost = cpu_cost;
3123 stats->best_idle_cpu = cpu;
3126 stats->min_cost = cpu_cost;
3127 stats->min_load = env->cpu_load;
3128 stats->best_cpu = cpu;
3129 env->sbc_best_flag = SBC_FLAG_MIN_COST;
3133 #endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3135 static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3136 struct cpu_select_env *env)
3141 * We try to find the least loaded *busy* CPU irrespective
3142 * of the power cost.
3145 cpu_cost = cpu_min_power_cost(cpu);
3148 cpu_cost = power_cost(cpu, task_load(env->p) +
3149 cpu_cravg_sync(cpu, env->sync));
3151 if (cpu_cost <= stats->min_cost)
3152 __update_cluster_stats(cpu, stats, env, cpu_cost);
3155 static void find_best_cpu_in_cluster(struct sched_cluster *c,
3156 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3159 struct cpumask search_cpus;
3161 cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
3163 env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
3165 for_each_cpu(i, &search_cpus) {
3166 env->cpu_load = cpu_load_sync(i, env->sync);
3168 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3170 power_cost(i, task_load(env->p) +
3171 cpu_cravg_sync(i, env->sync)), 0);
3173 if (skip_cpu(i, env))
3176 update_spare_capacity(stats, env, i, c->capacity,
3180 * need_idle takes precedence over sched boost, but when both
3181 * are set, the idlest CPU within all the clusters is selected
3182 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the
3183 * big cluster is selected when boost_policy = BOOST_ON_BIG.
3185 if ((!env->need_idle &&
3186 env->boost_policy != SCHED_BOOST_NONE) ||
3187 env->need_waker_cluster ||
3188 sched_cpu_high_irqload(i) ||
3189 spill_threshold_crossed(env, cpu_rq(i)))
3192 update_cluster_stats(i, stats, env);
3196 static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
3198 stats->best_cpu = stats->best_idle_cpu = -1;
3199 stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
3200 stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
3201 stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
3202 stats->highest_spare_capacity = 0;
3203 stats->least_loaded_cpu = -1;
3204 stats->best_cpu_wakeup_latency = INT_MAX;
3205 /* No need to initialize stats->best_load */
3208 static inline bool env_has_special_flags(struct cpu_select_env *env)
3210 if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
3218 bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3221 struct task_struct *task = env->p;
3222 struct sched_cluster *cluster;
3224 if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
3227 prev_cpu = env->prev_cpu;
3228 if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
3231 if (task->ravg.mark_start - task->last_cpu_selected_ts >=
3232 sched_long_cpu_selection_threshold)
3236 * This function should be used by the task wakeup path only, as it
3237 * assumes p->last_switch_out_ts to be the last sleep time.
3238 * p->last_switch_out_ts can denote the last preemption time as well as the last sleep time.
3241 if (task->ravg.mark_start - task->last_switch_out_ts >=
3242 sched_short_sleep_task_threshold)
3245 env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
3246 cluster = cpu_rq(prev_cpu)->cluster;
3248 if (!task_load_will_fit(task, env->task_load, prev_cpu,
3249 sched_boost_policy())) {
3251 __set_bit(cluster->id, env->backup_list);
3252 __clear_bit(cluster->id, env->candidate_list);
3256 env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
3257 if (sched_cpu_high_irqload(prev_cpu) ||
3258 spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
3259 update_spare_capacity(stats, env, prev_cpu,
3260 cluster->capacity, env->cpu_load);
3261 cpumask_clear_cpu(prev_cpu, &env->search_cpus);
3269 wake_to_waker_cluster(struct cpu_select_env *env)
3272 task_load(current) > sched_big_waker_task_load &&
3273 task_load(env->p) < sched_small_wakee_task_load;
3277 bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
3279 return sysctl_sched_prefer_sync_wakee_to_waker &&
3280 cpu_rq(cpu)->nr_running == 1 &&
3281 cpumask_test_cpu(cpu, &env->search_cpus);
3285 cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
3287 return cpumask_intersects(&env->search_cpus, &cluster->cpus);
3290 /* return cheapest cpu that can fit this task */
3291 static int select_best_cpu(struct task_struct *p, int target, int reason,
3294 struct sched_cluster *cluster, *pref_cluster = NULL;
3295 struct cluster_cpu_stats stats;
3296 struct related_thread_group *grp;
3297 unsigned int sbc_flag = 0;
3298 int cpu = raw_smp_processor_id();
3301 struct cpu_select_env env = {
3304 .need_idle = wake_to_idle(p),
3305 .need_waker_cluster = 0,
3310 .sbc_best_cluster_flag = 0,
3314 env.boost_policy = task_sched_boost(p) ?
3315 sched_boost_policy() : SCHED_BOOST_NONE;
3317 bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
3318 bitmap_zero(env.backup_list, NR_CPUS);
3320 cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
3321 cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
3323 init_cluster_cpu_stats(&stats);
3324 special = env_has_special_flags(&env);
3328 grp = task_related_thread_group(p);
3330 if (grp && grp->preferred_cluster) {
3331 pref_cluster = grp->preferred_cluster;
3332 if (!cluster_allowed(&env, pref_cluster))
3333 clear_bit(pref_cluster->id, env.candidate_list);
3336 } else if (!special) {
3337 cluster = cpu_rq(cpu)->cluster;
3338 if (wake_to_waker_cluster(&env)) {
3339 if (bias_to_waker_cpu(&env, cpu)) {
3341 sbc_flag = SBC_FLAG_WAKER_CLUSTER |
3344 } else if (cluster_allowed(&env, cluster)) {
3345 env.need_waker_cluster = 1;
3346 bitmap_zero(env.candidate_list, NR_CPUS);
3347 __set_bit(cluster->id, env.candidate_list);
3348 env.sbc_best_cluster_flag =
3349 SBC_FLAG_WAKER_CLUSTER;
3351 } else if (bias_to_prev_cpu(&env, &stats)) {
3352 sbc_flag = SBC_FLAG_PREV_CPU;
3357 if (!special && is_short_burst_task(p)) {
3358 env.pack_task = true;
3359 sbc_flag = SBC_FLAG_PACK_TASK;
3362 cluster = select_least_power_cluster(&env);
3368 * 'cluster' now points to the minimum power cluster which can satisfy
3369 * task's perf goals. Walk down the cluster list starting with that
3370 * cluster. For non-small tasks, skip clusters that don't have
3371 * mostly_idle/idle cpus
3375 find_best_cpu_in_cluster(cluster, &env, &stats);
3377 } while ((cluster = next_best_cluster(cluster, &env, &stats)));
3379 if (env.need_idle) {
3380 if (stats.best_idle_cpu >= 0) {
3381 target = stats.best_idle_cpu;
3382 sbc_flag |= SBC_FLAG_IDLE_CSTATE;
3383 } else if (stats.least_loaded_cpu >= 0) {
3384 target = stats.least_loaded_cpu;
3385 sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
3387 } else if (stats.best_cpu >= 0) {
3388 if (stats.best_sibling_cpu >= 0 &&
3389 stats.best_cpu != task_cpu(p) &&
3390 stats.min_cost == stats.best_sibling_cpu_cost) {
3391 stats.best_cpu = stats.best_sibling_cpu;
3392 sbc_flag |= SBC_FLAG_BEST_SIBLING;
3394 sbc_flag |= env.sbc_best_flag;
3395 target = stats.best_cpu;
3397 if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
3403 * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
3404 * backup_list = little cluster, candidate_list = none and
3405 * stats->best_capacity_cpu points the best spare capacity
3406 * CPU among the CPUs in the big cluster.
3408 if (env.boost_policy == SCHED_BOOST_ON_BIG &&
3409 stats.best_capacity_cpu >= 0)
3410 sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
3412 find_backup_cluster(&env, &stats);
3414 if (stats.best_capacity_cpu >= 0) {
3415 target = stats.best_capacity_cpu;
3416 sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
3419 p->last_cpu_selected_ts = sched_ktime_clock();
3421 sbc_flag |= env.sbc_best_cluster_flag;
3423 trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
3424 env.reason, env.sync, env.need_idle, sbc_flag, target);
3428 #ifdef CONFIG_CFS_BANDWIDTH
3430 static inline struct task_group *next_task_group(struct task_group *tg)
3432 tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
3434 return (&tg->list == &task_groups) ? NULL : tg;
3437 /* Iterate over all cfs_rq in a cpu */
3438 #define for_each_cfs_rq(cfs_rq, tg, cpu) \
3439 for (tg = container_of(&task_groups, struct task_group, list); \
3440 ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
3442 void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
3444 struct task_group *tg;
3445 struct cfs_rq *cfs_rq;
3449 for_each_cfs_rq(cfs_rq, tg, cpu)
3450 reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
3455 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
3457 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3458 struct task_struct *p, int change_cra);
3459 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3460 struct task_struct *p, int change_cra);
3462 /* Add task's contribution to a cpu' HMP statistics */
3463 void _inc_hmp_sched_stats_fair(struct rq *rq,
3464 struct task_struct *p, int change_cra)
3466 struct cfs_rq *cfs_rq;
3467 struct sched_entity *se = &p->se;
3470 * Although the check below is not strictly required (as
3471 * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg, called
3472 * from inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
3473 * efficiency by short-circuiting the for_each_sched_entity() loop when
3474 * sched_disable_window_stats is set.
3476 if (sched_disable_window_stats)
3479 for_each_sched_entity(se) {
3480 cfs_rq = cfs_rq_of(se);
3481 inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3482 if (cfs_rq_throttled(cfs_rq))
3486 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3488 inc_rq_hmp_stats(rq, p, change_cra);
3491 /* Remove task's contribution from a cpu' HMP statistics */
3493 _dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
3495 struct cfs_rq *cfs_rq;
3496 struct sched_entity *se = &p->se;
3498 /* See comment on efficiency in _inc_hmp_sched_stats_fair */
3499 if (sched_disable_window_stats)
3502 for_each_sched_entity(se) {
3503 cfs_rq = cfs_rq_of(se);
3504 dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3505 if (cfs_rq_throttled(cfs_rq))
3509 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3511 dec_rq_hmp_stats(rq, p, change_cra);
3514 static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3516 _inc_hmp_sched_stats_fair(rq, p, 1);
3519 static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3521 _dec_hmp_sched_stats_fair(rq, p, 1);
3524 static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3525 u32 new_task_load, u32 new_pred_demand)
3527 struct cfs_rq *cfs_rq;
3528 struct sched_entity *se = &p->se;
3529 s64 task_load_delta = (s64)new_task_load - task_load(p);
3530 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3532 for_each_sched_entity(se) {
3533 cfs_rq = cfs_rq_of(se);
3535 fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
3538 fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
3539 if (cfs_rq_throttled(cfs_rq))
3543 /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
3545 fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
3548 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3552 static int task_will_be_throttled(struct task_struct *p);
3554 #else /* CONFIG_CFS_BANDWIDTH */
3556 inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
3559 inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3561 inc_nr_big_task(&rq->hmp_stats, p);
3562 inc_cumulative_runnable_avg(&rq->hmp_stats, p);
3566 dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3568 dec_nr_big_task(&rq->hmp_stats, p);
3569 dec_cumulative_runnable_avg(&rq->hmp_stats, p);
3572 fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3573 u32 new_task_load, u32 new_pred_demand)
3575 s64 task_load_delta = (s64)new_task_load - task_load(p);
3576 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3578 fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
3580 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3583 static inline int task_will_be_throttled(struct task_struct *p)
3588 void _inc_hmp_sched_stats_fair(struct rq *rq,
3589 struct task_struct *p, int change_cra)
3591 inc_nr_big_task(&rq->hmp_stats, p);
3594 #endif /* CONFIG_CFS_BANDWIDTH */
3597 * Reset balance_interval at all sched_domain levels of the given cpu, so that it honors kick.
3600 static inline void reset_balance_interval(int cpu)
3602 struct sched_domain *sd;
3604 if (cpu >= nr_cpu_ids)
3608 for_each_domain(cpu, sd)
3609 sd->balance_interval = 0;
3614 * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal
3615 * cpu as per its demand or priority)
3617 * Returns reason why task needs to be migrated
3619 static inline int migration_needed(struct task_struct *p, int cpu)
3622 struct related_thread_group *grp;
3624 if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
3627 /* No need to migrate task that is about to be throttled */
3628 if (task_will_be_throttled(p))
3631 if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
3632 cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
3633 return UP_MIGRATION;
3635 if (sched_cpu_high_irqload(cpu))
3636 return IRQLOAD_MIGRATION;
3638 nice = task_nice(p);
3640 grp = task_related_thread_group(p);
3642 * Don't assume higher capacity means higher power. If the task
3643 * is running on the power efficient CPU, avoid migrating it
3644 * to a lower capacity cluster.
3646 if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
3647 upmigrate_discouraged(p)) &&
3648 cpu_capacity(cpu) > min_capacity &&
3649 cpu_max_power_cost(cpu) == max_power_cost) {
3651 return DOWN_MIGRATION;
3654 if (!task_will_fit(p, cpu)) {
3656 return UP_MIGRATION;
3664 kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
3666 unsigned long flags;
3669 /* Invoke active balance to force migrate currently running task */
3670 raw_spin_lock_irqsave(&rq->lock, flags);
3671 if (!rq->active_balance) {
3672 rq->active_balance = 1;
3673 rq->push_cpu = new_cpu;
3678 raw_spin_unlock_irqrestore(&rq->lock, flags);
3683 static DEFINE_RAW_SPINLOCK(migration_lock);
3685 static bool do_migration(int reason, int new_cpu, int cpu)
3687 if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
3688 && same_cluster(new_cpu, cpu))
3691 /* Inter cluster high irqload migrations are OK */
3692 return new_cpu != cpu;
3696 * Check if currently running task should be migrated to a better cpu.
3698 * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
3700 void check_for_migration(struct rq *rq, struct task_struct *p)
3702 int cpu = cpu_of(rq), new_cpu;
3703 int active_balance = 0, reason;
3705 reason = migration_needed(p, cpu);
3709 raw_spin_lock(&migration_lock);
3710 new_cpu = select_best_cpu(p, cpu, reason, 0);
3712 if (do_migration(reason, new_cpu, cpu)) {
3713 active_balance = kick_active_balance(rq, p, new_cpu);
3715 mark_reserved(new_cpu);
3718 raw_spin_unlock(&migration_lock);
3721 stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
3722 &rq->active_balance_work);
3725 #ifdef CONFIG_CFS_BANDWIDTH
3727 static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
3729 cfs_rq->hmp_stats.nr_big_tasks = 0;
3730 cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
3731 cfs_rq->hmp_stats.pred_demands_sum = 0;
3734 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3735 struct task_struct *p, int change_cra)
3737 inc_nr_big_task(&cfs_rq->hmp_stats, p);
3739 inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3742 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3743 struct task_struct *p, int change_cra)
3745 dec_nr_big_task(&cfs_rq->hmp_stats, p);
3747 dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3750 static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3751 struct cfs_rq *cfs_rq)
3753 stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
3754 stats->cumulative_runnable_avg +=
3755 cfs_rq->hmp_stats.cumulative_runnable_avg;
3756 stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
3759 static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3760 struct cfs_rq *cfs_rq)
3762 stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
3763 stats->cumulative_runnable_avg -=
3764 cfs_rq->hmp_stats.cumulative_runnable_avg;
3765 stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
3767 BUG_ON(stats->nr_big_tasks < 0 ||
3768 (s64)stats->cumulative_runnable_avg < 0);
3769 BUG_ON((s64)stats->pred_demands_sum < 0);
3772 #else /* CONFIG_CFS_BANDWIDTH */
3774 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3775 struct task_struct *p, int change_cra) { }
3777 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3778 struct task_struct *p, int change_cra) { }
3780 #endif /* CONFIG_CFS_BANDWIDTH */
3782 #else /* CONFIG_SCHED_HMP */
3784 static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
3786 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3787 struct task_struct *p, int change_cra) { }
3789 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3790 struct task_struct *p, int change_cra) { }
3792 #define dec_throttled_cfs_rq_hmp_stats(...)
3793 #define inc_throttled_cfs_rq_hmp_stats(...)
3795 #endif /* CONFIG_SCHED_HMP */
3797 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
3798 #error "load tracking assumes 2^10 as unit"
3801 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
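/*
 * E.g. cap_scale(1024, 512) = 512: scaling a full contribution (1024) by a
 * capacity/frequency factor of one half (512/1024).
 */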
3804 * We can represent the historical contribution to runnable average as the
3805 * coefficients of a geometric series. To do this we sub-divide our runnable
3806 * history into segments of approximately 1ms (1024us); label the segment that
3807 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3809 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3811 * (now) (~1ms ago) (~2ms ago)
3813 * Let u_i denote the fraction of p_i that the entity was runnable.
3815 * We then designate the fractions u_i as our co-efficients, yielding the
3816 * following representation of historical load:
3817 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3819 * We choose y based on the width of a reasonable scheduling period, fixing y^32 = 0.5.
3822 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3823 * approximately half as much as the contribution to load within the last ms (u_0).
3826 * When a period "rolls over" and we have new u_0`, multiplying the previous
3827 * sum again by y is sufficient to update:
3828 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3829 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
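/*
 * Rough numeric example: an entity runnable for the whole of the last two
 * 1024us periods accumulates roughly (1024 + 1024*y) ~= 2026, scaled by its
 * weight, in load_sum; load_avg is then that sum divided by LOAD_AVG_MAX.
 */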
3831 static __always_inline int
3832 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
3833 unsigned long weight, int running, struct cfs_rq *cfs_rq)
3835 u64 delta, scaled_delta, periods;
3837 unsigned int delta_w, scaled_delta_w, decayed = 0;
3838 unsigned long scale_freq, scale_cpu;
3840 delta = now - sa->last_update_time;
3842 * This should only happen when time goes backwards, which it
3843 * unfortunately does during sched clock init when we swap over to TSC.
3845 if ((s64)delta < 0) {
3846 sa->last_update_time = now;
3851 * Use 1024ns as the unit of measurement since it's a reasonable
3852 * approximation of 1us and fast to compute.
3857 sa->last_update_time = now;
3859 scale_freq = arch_scale_freq_capacity(NULL, cpu);
3860 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3861 trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
3863 /* delta_w is the amount already accumulated against our next period */
3864 delta_w = sa->period_contrib;
3865 if (delta + delta_w >= 1024) {
3868 /* how much left for next period will start over, we don't know yet */
3869 sa->period_contrib = 0;
3872 * Now that we know we're crossing a period boundary, figure
3873 * out how much from delta we need to complete the current
3874 * period and accrue it.
3876 delta_w = 1024 - delta_w;
3877 scaled_delta_w = cap_scale(delta_w, scale_freq);
3879 sa->load_sum += weight * scaled_delta_w;
3881 cfs_rq->runnable_load_sum +=
3882 weight * scaled_delta_w;
3886 sa->util_sum += scaled_delta_w * scale_cpu;
3890 /* Figure out how many additional periods this update spans */
3891 periods = delta / 1024;
3894 sa->load_sum = decay_load(sa->load_sum, periods + 1);
3896 cfs_rq->runnable_load_sum =
3897 decay_load(cfs_rq->runnable_load_sum, periods + 1);
3899 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
3901 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
3902 contrib = __compute_runnable_contrib(periods);
3903 contrib = cap_scale(contrib, scale_freq);
3905 sa->load_sum += weight * contrib;
3907 cfs_rq->runnable_load_sum += weight * contrib;
3910 sa->util_sum += contrib * scale_cpu;
3913 /* Remainder of delta accrued against u_0` */
3914 scaled_delta = cap_scale(delta, scale_freq);
3916 sa->load_sum += weight * scaled_delta;
3918 cfs_rq->runnable_load_sum += weight * scaled_delta;
3922 sa->util_sum += scaled_delta * scale_cpu;
3924 sa->period_contrib += delta;
3927 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
3929 cfs_rq->runnable_load_avg =
3930 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
3932 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
3939 * Signed add and clamp on underflow.
3941 * Explicitly do a load-store to ensure the intermediate value never hits
3942 * memory. This allows lockless observations without ever seeing the negative
3945 #define add_positive(_ptr, _val) do { \
3946 typeof(_ptr) ptr = (_ptr); \
3947 typeof(_val) val = (_val); \
3948 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3952 if (val < 0 && res > var) \
3955 WRITE_ONCE(*ptr, res); \
3958 #ifdef CONFIG_FAIR_GROUP_SCHED
3960 * update_tg_load_avg - update the tg's load avg
3961 * @cfs_rq: the cfs_rq whose avg changed
3962 * @force: update regardless of how small the difference
3964 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3965 * However, because tg->load_avg is a global value there are performance
3968 * In order to avoid having to look at the other cfs_rq's, we use a
3969 * differential update where we store the last value we propagated. This in
3970 * turn allows skipping updates if the differential is 'small'.
3972 * Updating tg's load_avg is necessary before update_cfs_share() (which is
3973 * done) and effective_load() (which is not done because it is too costly).
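/*
 * For example, with tg_load_avg_contrib = 6400 the global tg->load_avg is
 * only touched once this cfs_rq's load_avg has drifted by more than
 * 6400/64 = 100 from the last propagated value (unless forced).
 */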
3975 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3977 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3980 * No need to update load_avg for root_task_group as it is not used.
3982 if (cfs_rq->tg == &root_task_group)
3985 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3986 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3987 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3992 * Called within set_task_rq() right before setting a task's cpu. The
3993 * caller only guarantees p->pi_lock is held; no other assumptions,
3994 * including the state of rq->lock, should be made.
3996 void set_task_rq_fair(struct sched_entity *se,
3997 struct cfs_rq *prev, struct cfs_rq *next)
3999 if (!sched_feat(ATTACH_AGE_LOAD))
4003 * We are supposed to update the task to "current" time, then it's up to
4004 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
4005 * getting what the current time is, so simply throw away the out-of-date
4006 * time. This will result in the wakee task being less decayed, but giving
4007 * the wakee more load is not a bad thing.
4009 if (se->avg.last_update_time && prev) {
4010 u64 p_last_update_time;
4011 u64 n_last_update_time;
4013 #ifndef CONFIG_64BIT
4014 u64 p_last_update_time_copy;
4015 u64 n_last_update_time_copy;
4018 p_last_update_time_copy = prev->load_last_update_time_copy;
4019 n_last_update_time_copy = next->load_last_update_time_copy;
4023 p_last_update_time = prev->avg.last_update_time;
4024 n_last_update_time = next->avg.last_update_time;
4026 } while (p_last_update_time != p_last_update_time_copy ||
4027 n_last_update_time != n_last_update_time_copy);
4029 p_last_update_time = prev->avg.last_update_time;
4030 n_last_update_time = next->avg.last_update_time;
4032 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
4033 &se->avg, 0, 0, NULL);
4034 se->avg.last_update_time = n_last_update_time;
4038 /* Take into account change of utilization of a child task group */
4040 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
4042 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4043 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
4045 /* Nothing to update */
4049 /* Set new sched_entity's utilization */
4050 se->avg.util_avg = gcfs_rq->avg.util_avg;
4051 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
4053 /* Update parent cfs_rq utilization */
4054 add_positive(&cfs_rq->avg.util_avg, delta);
4055 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
4058 /* Take into account change of load of a child task group */
4060 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
4062 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4063 long delta, load = gcfs_rq->avg.load_avg;
4066 * If the load of group cfs_rq is null, the load of the
4067 * sched_entity will also be null so we can skip the formula
4072 /* Get tg's load and ensure tg_load > 0 */
4073 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
4075 /* Ensure tg_load >= load and updated with current load*/
4076 tg_load -= gcfs_rq->tg_load_avg_contrib;
4080 * We need to compute a correction term in the case that the
4081 * task group is consuming more CPU than a task of equal
4082 * weight. A task with a weight equal to tg->shares will have
4083 * a load less than or equal to scale_load_down(tg->shares).
4084 * Similarly, the sched_entities that represent the task group
4085 * at parent level, can't have a load higher than
4086 * scale_load_down(tg->shares). And the Sum of sched_entities'
4087 * load must be <= scale_load_down(tg->shares).
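/*
 * Illustration of the clamp below: if this group cfs_rq carries the whole
 * group load (tg_load == load), the scaled se load becomes exactly
 * scale_load_down(tg->shares), never more.
 */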
4089 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
4090 /* scale gcfs_rq's load into tg's shares*/
4091 load *= scale_load_down(gcfs_rq->tg->shares);
4096 delta = load - se->avg.load_avg;
4098 /* Nothing to update */
4102 /* Set new sched_entity's load */
4103 se->avg.load_avg = load;
4104 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
4106 /* Update parent cfs_rq load */
4107 add_positive(&cfs_rq->avg.load_avg, delta);
4108 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
4111 * If the sched_entity is already enqueued, we also have to update the
4112 * runnable load avg.
4115 /* Update parent cfs_rq runnable_load_avg */
4116 add_positive(&cfs_rq->runnable_load_avg, delta);
4117 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
4121 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
4123 cfs_rq->propagate_avg = 1;
4126 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
4128 struct cfs_rq *cfs_rq = group_cfs_rq(se);
4130 if (!cfs_rq->propagate_avg)
4133 cfs_rq->propagate_avg = 0;
4137 /* Update task and its cfs_rq load average */
4138 static inline int propagate_entity_load_avg(struct sched_entity *se)
4140 struct cfs_rq *cfs_rq;
4142 if (entity_is_task(se))
4145 if (!test_and_clear_tg_cfs_propagate(se))
4148 cfs_rq = cfs_rq_of(se);
4150 set_tg_cfs_propagate(cfs_rq);
4152 update_tg_cfs_util(cfs_rq, se);
4153 update_tg_cfs_load(cfs_rq, se);
4158 #else /* CONFIG_FAIR_GROUP_SCHED */
4160 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
4162 static inline int propagate_entity_load_avg(struct sched_entity *se)
4167 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
4169 #endif /* CONFIG_FAIR_GROUP_SCHED */
4171 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
4173 if (&this_rq()->cfs == cfs_rq) {
4175 * There are a few boundary cases this might miss but it should
4176 * get called often enough that this should (hopefully) not be
4177 * a real problem -- added to that it only calls on the local
4178 * CPU, so if we enqueue remotely we'll miss an update, but
4179 * the next tick/schedule should update.
4181 * It will not get called when we go idle, because the idle
4182 * thread is a different class (!fair), nor will the utilization
4183 * number include things like RT tasks.
4185 * As is, the util number is not freq-invariant (we'd have to
4186 * implement arch_scale_freq_capacity() for that).
4190 cpufreq_update_util(rq_of(cfs_rq), 0);
4194 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
4197 * Unsigned subtract and clamp on underflow.
4199 * Explicitly do a load-store to ensure the intermediate value never hits
4200 * memory. This allows lockless observations without ever seeing the negative
4203 #define sub_positive(_ptr, _val) do { \
4204 typeof(_ptr) ptr = (_ptr); \
4205 typeof(*ptr) val = (_val); \
4206 typeof(*ptr) res, var = READ_ONCE(*ptr); \
4210 WRITE_ONCE(*ptr, res); \
4214 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4215 * @now: current time, as per cfs_rq_clock_task()
4216 * @cfs_rq: cfs_rq to update
4217 * @update_freq: should we call cfs_rq_util_change() or will the call do so
4219 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4220 * avg. The immediate corollary is that all (fair) tasks must be attached, see
4221 * post_init_entity_util_avg().
4223 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4225 * Returns true if the load decayed or we removed load.
4227 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4228 * call update_tg_load_avg() when this function returns true.
4231 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4233 struct sched_avg *sa = &cfs_rq->avg;
4234 int decayed, removed = 0, removed_util = 0;
4236 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
4237 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
4238 sub_positive(&sa->load_avg, r);
4239 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
4241 set_tg_cfs_propagate(cfs_rq);
4244 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
4245 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
4246 sub_positive(&sa->util_avg, r);
4247 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
4249 set_tg_cfs_propagate(cfs_rq);
4252 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
4253 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
4255 #ifndef CONFIG_64BIT
4257 cfs_rq->load_last_update_time_copy = sa->last_update_time;
4260 /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
4261 if (cfs_rq == &rq_of(cfs_rq)->cfs)
4262 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
4264 if (update_freq && (decayed || removed_util))
4265 cfs_rq_util_change(cfs_rq);
4267 return decayed || removed;
4271 * Optional action to be done while updating the load average
4273 #define UPDATE_TG 0x1
4274 #define SKIP_AGE_LOAD 0x2
4276 /* Update task and its cfs_rq load average */
4277 static inline void update_load_avg(struct sched_entity *se, int flags)
4279 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4280 u64 now = cfs_rq_clock_task(cfs_rq);
4281 int cpu = cpu_of(rq_of(cfs_rq));
4286 * Track the task load average for carrying it to the new CPU after it is migrated,
4287 * and track the group sched_entity load average for task_h_load calculation in migration
4289 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
4290 __update_load_avg(now, cpu, &se->avg,
4291 se->on_rq * scale_load_down(se->load.weight),
4292 cfs_rq->curr == se, NULL);
4295 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
4296 decayed |= propagate_entity_load_avg(se);
4298 if (decayed && (flags & UPDATE_TG))
4299 update_tg_load_avg(cfs_rq, 0);
4301 if (entity_is_task(se)) {
4302 #ifdef CONFIG_SCHED_WALT
4303 ptr = (void *)&(task_of(se)->ravg);
4305 trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
4310 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4311 * @cfs_rq: cfs_rq to attach to
4312 * @se: sched_entity to attach
4314 * Must call update_cfs_rq_load_avg() before this, since we rely on
4315 * cfs_rq->avg.last_update_time being current.
4317 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4319 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4320 cfs_rq->avg.load_avg += se->avg.load_avg;
4321 cfs_rq->avg.load_sum += se->avg.load_sum;
4322 cfs_rq->avg.util_avg += se->avg.util_avg;
4323 cfs_rq->avg.util_sum += se->avg.util_sum;
4324 set_tg_cfs_propagate(cfs_rq);
4326 cfs_rq_util_change(cfs_rq);
4330 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4331 * @cfs_rq: cfs_rq to detach from
4332 * @se: sched_entity to detach
4334 * Must call update_cfs_rq_load_avg() before this, since we rely on
4335 * cfs_rq->avg.last_update_time being current.
4337 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4340 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
4341 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
4342 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4343 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4344 set_tg_cfs_propagate(cfs_rq);
4346 cfs_rq_util_change(cfs_rq);
4349 /* Add the load generated by se into cfs_rq's load average */
4351 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4353 struct sched_avg *sa = &se->avg;
4355 cfs_rq->runnable_load_avg += sa->load_avg;
4356 cfs_rq->runnable_load_sum += sa->load_sum;
4358 if (!sa->last_update_time) {
4359 attach_entity_load_avg(cfs_rq, se);
4360 update_tg_load_avg(cfs_rq, 0);
4364 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
4366 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4368 cfs_rq->runnable_load_avg =
4369 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
4370 cfs_rq->runnable_load_sum =
4371 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
4374 #ifndef CONFIG_64BIT
4375 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4377 u64 last_update_time_copy;
4378 u64 last_update_time;
4381 last_update_time_copy = cfs_rq->load_last_update_time_copy;
4383 last_update_time = cfs_rq->avg.last_update_time;
4384 } while (last_update_time != last_update_time_copy);
4386 return last_update_time;
4389 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4391 return cfs_rq->avg.last_update_time;
4396 * Synchronize entity load avg of dequeued entity without locking the previous rq.
4399 void sync_entity_load_avg(struct sched_entity *se)
4401 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4402 u64 last_update_time;
4404 last_update_time = cfs_rq_last_update_time(cfs_rq);
4405 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
4409 * Task first catches up with cfs_rq, and then subtracts
4410 * itself from the cfs_rq (task must be off the queue now).
4412 void remove_entity_load_avg(struct sched_entity *se)
4414 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4417 * tasks cannot exit without having gone through wake_up_new_task() ->
4418 * post_init_entity_util_avg() which will have added things to the
4419 * cfs_rq, so we can remove unconditionally.
4421 * Similarly for groups, they will have passed through
4422 * post_init_entity_util_avg() before unregister_sched_fair_group()
4426 sync_entity_load_avg(se);
4427 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
4428 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
4432 * Update the rq's load with the elapsed running time before entering
4433 * idle. If the last scheduled task is not a CFS task, idle_enter will
4434 * be the only way to update the runnable statistic.
4436 void idle_enter_fair(struct rq *this_rq)
4441 * Update the rq's load with the elapsed idle time before a task is
4442 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
4443 * be the only way to update the runnable statistic.
4445 void idle_exit_fair(struct rq *this_rq)
4449 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
4451 return cfs_rq->runnable_load_avg;
4454 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4456 return cfs_rq->avg.load_avg;
4459 static int idle_balance(struct rq *this_rq);
4461 #else /* CONFIG_SMP */
4464 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4469 #define UPDATE_TG 0x0
4470 #define SKIP_AGE_LOAD 0x0
4472 static inline void update_load_avg(struct sched_entity *se, int not_used1){}
4474 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4476 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4477 static inline void remove_entity_load_avg(struct sched_entity *se) {}
4480 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4482 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4484 static inline int idle_balance(struct rq *rq)
4489 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4490 struct task_struct *p, int change_cra) { }
4492 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4493 struct task_struct *p, int change_cra) { }
4495 #endif /* CONFIG_SMP */
4497 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
4499 #ifdef CONFIG_SCHEDSTATS
4500 struct task_struct *tsk = NULL;
4502 if (entity_is_task(se))
4505 if (se->statistics.sleep_start) {
4506 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
4511 if (unlikely(delta > se->statistics.sleep_max))
4512 se->statistics.sleep_max = delta;
4514 se->statistics.sleep_start = 0;
4515 se->statistics.sum_sleep_runtime += delta;
4518 account_scheduler_latency(tsk, delta >> 10, 1);
4519 trace_sched_stat_sleep(tsk, delta);
4522 if (se->statistics.block_start) {
4523 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
4528 if (unlikely(delta > se->statistics.block_max))
4529 se->statistics.block_max = delta;
4531 se->statistics.block_start = 0;
4532 se->statistics.sum_sleep_runtime += delta;
4535 if (tsk->in_iowait) {
4536 se->statistics.iowait_sum += delta;
4537 se->statistics.iowait_count++;
4538 trace_sched_stat_iowait(tsk, delta);
4541 trace_sched_stat_blocked(tsk, delta);
4542 trace_sched_blocked_reason(tsk);
4545 * Blocking time is in units of nanosecs, so shift by
4546 * 20 to get a milliseconds-range estimation of the
4547 * amount of time that the task spent sleeping:
4549 if (unlikely(prof_on == SLEEP_PROFILING)) {
4550 profile_hits(SLEEP_PROFILING,
4551 (void *)get_wchan(tsk),
4554 account_scheduler_latency(tsk, delta >> 10, 0);
4560 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4562 #ifdef CONFIG_SCHED_DEBUG
4563 s64 d = se->vruntime - cfs_rq->min_vruntime;
4568 if (d > 3*sysctl_sched_latency)
4569 schedstat_inc(cfs_rq, nr_spread_over);
4574 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4576 u64 vruntime = cfs_rq->min_vruntime;
4579 * The 'current' period is already promised to the current tasks,
4580 * however the extra weight of the new task will slow them down a
4581 * little; place the new task so that it fits in the slot that
4582 * stays open at the end.
4584 if (initial && sched_feat(START_DEBIT))
4585 vruntime += sched_vslice(cfs_rq, se);
4587 /* sleeps up to a single latency don't count. */
4589 unsigned long thresh = sysctl_sched_latency;
4592 * Halve their sleep time's effect, to allow
4593 * for a gentler effect of sleepers:
4595 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4601 /* ensure we never gain time by being placed backwards. */
4602 se->vruntime = max_vruntime(se->vruntime, vruntime);
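/*
 * Worked example (illustrative sketch, not part of the original code):
 * assuming the default 6ms sysctl_sched_latency and GENTLE_FAIR_SLEEPERS,
 * a task waking from a long sleep has the 6ms threshold halved to 3ms and
 * is placed at roughly cfs_rq->min_vruntime - 3ms. The max_vruntime()
 * clamp above means a task that slept only briefly keeps its own (larger)
 * vruntime instead, so sleeping can never accumulate more than about half
 * a latency period of credit.
 */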
4605 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4608 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4611 * Update the normalized vruntime before updating min_vruntime
4612 * through calling update_curr().
4614 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
4615 se->vruntime += cfs_rq->min_vruntime;
4618 * Update run-time statistics of the 'current'.
4620 update_curr(cfs_rq);
4621 update_load_avg(se, UPDATE_TG);
4622 enqueue_entity_load_avg(cfs_rq, se);
4623 update_cfs_shares(se);
4624 account_entity_enqueue(cfs_rq, se);
4626 if (flags & ENQUEUE_WAKEUP) {
4627 place_entity(cfs_rq, se, 0);
4628 enqueue_sleeper(cfs_rq, se);
4631 update_stats_enqueue(cfs_rq, se);
4632 check_spread(cfs_rq, se);
4633 if (se != cfs_rq->curr)
4634 __enqueue_entity(cfs_rq, se);
4637 if (cfs_rq->nr_running == 1) {
4638 list_add_leaf_cfs_rq(cfs_rq);
4639 check_enqueue_throttle(cfs_rq);
4643 static void __clear_buddies_last(struct sched_entity *se)
4645 for_each_sched_entity(se) {
4646 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4647 if (cfs_rq->last != se)
4650 cfs_rq->last = NULL;
4654 static void __clear_buddies_next(struct sched_entity *se)
4656 for_each_sched_entity(se) {
4657 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4658 if (cfs_rq->next != se)
4661 cfs_rq->next = NULL;
4665 static void __clear_buddies_skip(struct sched_entity *se)
4667 for_each_sched_entity(se) {
4668 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4669 if (cfs_rq->skip != se)
4672 cfs_rq->skip = NULL;
4676 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4678 if (cfs_rq->last == se)
4679 __clear_buddies_last(se);
4681 if (cfs_rq->next == se)
4682 __clear_buddies_next(se);
4684 if (cfs_rq->skip == se)
4685 __clear_buddies_skip(se);
4688 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4691 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4694 * Update run-time statistics of the 'current'.
4696 update_curr(cfs_rq);
4699 * When dequeuing a sched_entity, we must:
4700 * - Update loads to have both entity and cfs_rq synced with now.
4701 * - Subtract its load from the cfs_rq->runnable_avg.
4702 * - Subtract its previous weight from cfs_rq->load.weight.
4703 * - For group entity, update its weight to reflect the new share
4704 * of its group cfs_rq.
4706 update_load_avg(se, UPDATE_TG);
4707 dequeue_entity_load_avg(cfs_rq, se);
4709 update_stats_dequeue(cfs_rq, se);
4710 if (flags & DEQUEUE_SLEEP) {
4711 #ifdef CONFIG_SCHEDSTATS
4712 if (entity_is_task(se)) {
4713 struct task_struct *tsk = task_of(se);
4715 if (tsk->state & TASK_INTERRUPTIBLE)
4716 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
4717 if (tsk->state & TASK_UNINTERRUPTIBLE)
4718 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
4723 clear_buddies(cfs_rq, se);
4725 if (se != cfs_rq->curr)
4726 __dequeue_entity(cfs_rq, se);
4728 account_entity_dequeue(cfs_rq, se);
4731 * Normalize the entity after updating the min_vruntime because the
4732 * update can refer to the ->curr item and we need to reflect this
4733 * movement in our normalized position.
4735 if (!(flags & DEQUEUE_SLEEP))
4736 se->vruntime -= cfs_rq->min_vruntime;
4738 /* return excess runtime on last dequeue */
4739 return_cfs_rq_runtime(cfs_rq);
4741 update_min_vruntime(cfs_rq);
4742 update_cfs_shares(se);
4746 * Preempt the current task with a newly woken task if needed:
4749 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4751 unsigned long ideal_runtime, delta_exec;
4752 struct sched_entity *se;
4755 ideal_runtime = sched_slice(cfs_rq, curr);
4756 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4757 if (delta_exec > ideal_runtime) {
4758 resched_curr(rq_of(cfs_rq));
4760 * The current task ran long enough, ensure it doesn't get
4761 * re-elected due to buddy favours.
4763 clear_buddies(cfs_rq, curr);
4768 * Ensure that a task that missed wakeup preemption by a
4769 * narrow margin doesn't have to wait for a full slice.
4770 * This also mitigates buddy induced latencies under load.
4772 if (delta_exec < sysctl_sched_min_granularity)
4775 se = __pick_first_entity(cfs_rq);
4776 delta = curr->vruntime - se->vruntime;
4781 if (delta > ideal_runtime)
4782 resched_curr(rq_of(cfs_rq));
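/*
 * Worked example (illustrative, assuming default tunables): with two
 * runnable tasks of equal weight and a 6ms sched_latency, sched_slice()
 * gives each an ideal_runtime of roughly 3ms. A current task that has run
 * 3.5ms since it was last picked is rescheduled by the first test above,
 * while one that has run only 0.5ms is left alone by the 0.75ms
 * min_granularity check, even if the leftmost waiter already has a
 * smaller vruntime.
 */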
4786 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4788 /* 'current' is not kept within the tree. */
4791 * Any task has to be enqueued before it gets to execute on
4792 * a CPU. So account for the time it spent waiting on the runqueue.
4795 update_stats_wait_end(cfs_rq, se);
4796 __dequeue_entity(cfs_rq, se);
4797 update_load_avg(se, UPDATE_TG);
4800 update_stats_curr_start(cfs_rq, se);
4802 #ifdef CONFIG_SCHEDSTATS
4804 * Track our maximum slice length, if the CPU's load is at
4805 * least twice that of our own weight (i.e. don't track it
4806 * when there are only lesser-weight tasks around):
4808 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4809 se->statistics.slice_max = max(se->statistics.slice_max,
4810 se->sum_exec_runtime - se->prev_sum_exec_runtime);
4813 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4817 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4820 * Pick the next process, keeping these things in mind, in this order:
4821 * 1) keep things fair between processes/task groups
4822 * 2) pick the "next" process, since someone really wants that to run
4823 * 3) pick the "last" process, for cache locality
4824 * 4) do not run the "skip" process, if something else is available
4826 static struct sched_entity *
4827 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4829 struct sched_entity *left = __pick_first_entity(cfs_rq);
4830 struct sched_entity *se;
4833 * If curr is set we have to see if it's left of the leftmost entity
4834 * still in the tree, provided there was anything in the tree at all.
4836 if (!left || (curr && entity_before(curr, left)))
4839 se = left; /* ideally we run the leftmost entity */
4842 * Avoid running the skip buddy, if running something else can
4843 * be done without getting too unfair.
4845 if (cfs_rq->skip == se) {
4846 struct sched_entity *second;
4849 second = __pick_first_entity(cfs_rq);
4851 second = __pick_next_entity(se);
4852 if (!second || (curr && entity_before(curr, second)))
4856 if (second && wakeup_preempt_entity(second, left) < 1)
4861 * Prefer last buddy, try to return the CPU to a preempted task.
4863 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4867 * Someone really wants this to run. If it's not unfair, run it.
4869 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4872 clear_buddies(cfs_rq, se);
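/*
 * Illustrative trace (not from the original source): suppose the leftmost
 * entity has vruntime 100ms and cfs_rq->next, marked by set_next_buddy(),
 * sits at 100.5ms. wakeup_preempt_entity(next, left) sees a vruntime gap
 * of 0.5ms, below the roughly 1ms default wakeup granularity for a nice-0
 * task, so it returns 0 and the next buddy runs even though it is not the
 * leftmost entity. Had the gap exceeded the granularity, fairness would
 * win and the leftmost entity would be picked instead.
 */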
4877 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4879 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4882 * If still on the runqueue then deactivate_task()
4883 * was not called and update_curr() has to be done:
4886 update_curr(cfs_rq);
4888 /* throttle cfs_rqs exceeding runtime */
4889 check_cfs_rq_runtime(cfs_rq);
4891 check_spread(cfs_rq, prev);
4893 update_stats_wait_start(cfs_rq, prev);
4894 /* Put 'current' back into the tree. */
4895 __enqueue_entity(cfs_rq, prev);
4896 /* in !on_rq case, update occurred at dequeue */
4897 update_load_avg(prev, 0);
4899 cfs_rq->curr = NULL;
4903 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4906 * Update run-time statistics of the 'current'.
4908 update_curr(cfs_rq);
4911 * Ensure that runnable average is periodically updated.
4913 update_load_avg(curr, UPDATE_TG);
4914 update_cfs_shares(curr);
4916 #ifdef CONFIG_SCHED_HRTICK
4918 * queued ticks are scheduled to match the slice, so don't bother
4919 * validating it and just reschedule.
4922 resched_curr(rq_of(cfs_rq));
4926 * don't let the period tick interfere with the hrtick preemption
4928 if (!sched_feat(DOUBLE_TICK) &&
4929 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4933 if (cfs_rq->nr_running > 1)
4934 check_preempt_tick(cfs_rq, curr);
4938 /**************************************************
4939 * CFS bandwidth control machinery
4942 #ifdef CONFIG_CFS_BANDWIDTH
4944 #ifdef HAVE_JUMP_LABEL
4945 static struct static_key __cfs_bandwidth_used;
4947 static inline bool cfs_bandwidth_used(void)
4949 return static_key_false(&__cfs_bandwidth_used);
4952 void cfs_bandwidth_usage_inc(void)
4954 static_key_slow_inc(&__cfs_bandwidth_used);
4957 void cfs_bandwidth_usage_dec(void)
4959 static_key_slow_dec(&__cfs_bandwidth_used);
4961 #else /* HAVE_JUMP_LABEL */
4962 static bool cfs_bandwidth_used(void)
4967 void cfs_bandwidth_usage_inc(void) {}
4968 void cfs_bandwidth_usage_dec(void) {}
4969 #endif /* HAVE_JUMP_LABEL */
4972 * default period for cfs group bandwidth.
4973 * default: 0.1s, units: nanoseconds
4975 static inline u64 default_cfs_period(void)
4977 return 100000000ULL;
4980 static inline u64 sched_cfs_bandwidth_slice(void)
4982 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
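/*
 * Worked example (illustrative, assuming the default 5ms bandwidth slice):
 * with quota = 20ms per 100ms period, each call to assign_cfs_rq_runtime()
 * pulls roughly one slice at a time from the global pool into the local
 * cfs_rq. After four full slices the pool is empty, further requests fail,
 * and the hierarchy ends up throttled until the period timer refills
 * cfs_b->runtime for the next period.
 */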
4986 * Replenish runtime according to assigned quota and update expiration time.
4987 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4988 * additional synchronization around rq->lock.
4990 * requires cfs_b->lock
4992 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4996 if (cfs_b->quota == RUNTIME_INF)
4999 now = sched_clock_cpu(smp_processor_id());
5000 cfs_b->runtime = cfs_b->quota;
5001 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
5004 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5006 return &tg->cfs_bandwidth;
5009 /* rq_clock_task() normalized against any time this cfs_rq has spent throttled */
5010 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5012 if (unlikely(cfs_rq->throttle_count))
5013 return cfs_rq->throttled_clock_task;
5015 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
5018 /* returns 0 on failure to allocate runtime */
5019 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5021 struct task_group *tg = cfs_rq->tg;
5022 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
5023 u64 amount = 0, min_amount, expires;
5025 /* note: this is a positive sum as runtime_remaining <= 0 */
5026 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
5028 raw_spin_lock(&cfs_b->lock);
5029 if (cfs_b->quota == RUNTIME_INF)
5030 amount = min_amount;
5032 start_cfs_bandwidth(cfs_b);
5034 if (cfs_b->runtime > 0) {
5035 amount = min(cfs_b->runtime, min_amount);
5036 cfs_b->runtime -= amount;
5040 expires = cfs_b->runtime_expires;
5041 raw_spin_unlock(&cfs_b->lock);
5043 cfs_rq->runtime_remaining += amount;
5045 * we may have advanced our local expiration to account for allowed
5046 * spread between our sched_clock and the one on which runtime was issued.
5049 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
5050 cfs_rq->runtime_expires = expires;
5052 return cfs_rq->runtime_remaining > 0;
5056 * Note: This depends on the synchronization provided by sched_clock and the
5057 * fact that rq->clock snapshots this value.
5059 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5061 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5063 /* if the deadline is ahead of our clock, nothing to do */
5064 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
5067 if (cfs_rq->runtime_remaining < 0)
5071 * If the local deadline has passed we have to consider the
5072 * possibility that our sched_clock is 'fast' and the global deadline
5073 * has not truly expired.
5075 * Fortunately we can determine whether this is the case by checking
5076 * whether the global deadline has advanced. It is valid to compare
5077 * cfs_b->runtime_expires without any locks since we only care about
5078 * exact equality, so a partial write will still work.
5081 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
5082 /* extend local deadline, drift is bounded above by 2 ticks */
5083 cfs_rq->runtime_expires += TICK_NSEC;
5085 /* global deadline is ahead, expiration has passed */
5086 cfs_rq->runtime_remaining = 0;
5090 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5092 /* dock delta_exec before expiring quota (as it could span periods) */
5093 cfs_rq->runtime_remaining -= delta_exec;
5094 expire_cfs_rq_runtime(cfs_rq);
5096 if (likely(cfs_rq->runtime_remaining > 0))
5100 * if we're unable to extend our runtime we resched so that the active
5101 * hierarchy can be throttled
5103 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5104 resched_curr(rq_of(cfs_rq));
5107 static __always_inline
5108 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5110 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5113 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5116 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5118 return cfs_bandwidth_used() && cfs_rq->throttled;
5121 #ifdef CONFIG_SCHED_HMP
5123 * Check if task is part of a hierarchy where some cfs_rq does not have any runtime left.
5126 * We can't rely on throttled_hierarchy() to do this test, as
5127 * cfs_rq->throttle_count will not be updated yet when this function is called
5128 * from scheduler_tick()
5130 static int task_will_be_throttled(struct task_struct *p)
5132 struct sched_entity *se = &p->se;
5133 struct cfs_rq *cfs_rq;
5135 if (!cfs_bandwidth_used())
5138 for_each_sched_entity(se) {
5139 cfs_rq = cfs_rq_of(se);
5140 if (!cfs_rq->runtime_enabled)
5142 if (cfs_rq->runtime_remaining <= 0)
5150 /* check whether cfs_rq, or any parent, is throttled */
5151 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5153 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5157 * Ensure that neither of the group entities corresponding to src_cpu or
5158 * dest_cpu are members of a throttled hierarchy when performing group
5159 * load-balance operations.
5161 static inline int throttled_lb_pair(struct task_group *tg,
5162 int src_cpu, int dest_cpu)
5164 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5166 src_cfs_rq = tg->cfs_rq[src_cpu];
5167 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5169 return throttled_hierarchy(src_cfs_rq) ||
5170 throttled_hierarchy(dest_cfs_rq);
5173 /* updated child weight may affect parent so we have to do this bottom up */
5174 static int tg_unthrottle_up(struct task_group *tg, void *data)
5176 struct rq *rq = data;
5177 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5179 cfs_rq->throttle_count--;
5181 if (!cfs_rq->throttle_count) {
5182 /* adjust cfs_rq_clock_task() */
5183 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
5184 cfs_rq->throttled_clock_task;
5191 static int tg_throttle_down(struct task_group *tg, void *data)
5193 struct rq *rq = data;
5194 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5196 /* group is entering throttled state, stop time */
5197 if (!cfs_rq->throttle_count)
5198 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5199 cfs_rq->throttle_count++;
5204 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
5206 struct rq *rq = rq_of(cfs_rq);
5207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5208 struct sched_entity *se;
5209 long task_delta, dequeue = 1;
5212 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5214 /* freeze hierarchy runnable averages while throttled */
5216 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5219 task_delta = cfs_rq->h_nr_running;
5220 for_each_sched_entity(se) {
5221 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5222 /* throttled entity or throttle-on-deactivate */
5227 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5228 qcfs_rq->h_nr_running -= task_delta;
5229 dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
5231 if (qcfs_rq->load.weight)
5236 sub_nr_running(rq, task_delta);
5237 dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
5240 cfs_rq->throttled = 1;
5241 cfs_rq->throttled_clock = rq_clock(rq);
5242 raw_spin_lock(&cfs_b->lock);
5243 empty = list_empty(&cfs_b->throttled_cfs_rq);
5246 * Add to the _head_ of the list, so that an already-started
5247 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
5248 * not running add to the tail so that later runqueues don't get starved.
5250 if (cfs_b->distribute_running)
5251 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5253 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5256 * If we're the first throttled task, make sure the bandwidth timer is running.
5260 start_cfs_bandwidth(cfs_b);
5262 raw_spin_unlock(&cfs_b->lock);
5264 /* Log effect on hmp stats after throttling */
5265 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5266 sched_irqload(cpu_of(rq)),
5267 power_cost(cpu_of(rq), 0),
5268 cpu_temp(cpu_of(rq)));
5271 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5273 struct rq *rq = rq_of(cfs_rq);
5274 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5275 struct sched_entity *se;
5278 struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
5280 se = cfs_rq->tg->se[cpu_of(rq)];
5282 cfs_rq->throttled = 0;
5284 update_rq_clock(rq);
5286 raw_spin_lock(&cfs_b->lock);
5287 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5288 list_del_rcu(&cfs_rq->throttled_list);
5289 raw_spin_unlock(&cfs_b->lock);
5291 /* update hierarchical throttle state */
5292 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5294 if (!cfs_rq->load.weight)
5297 task_delta = cfs_rq->h_nr_running;
5298 for_each_sched_entity(se) {
5302 cfs_rq = cfs_rq_of(se);
5304 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
5305 cfs_rq->h_nr_running += task_delta;
5306 inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
5308 if (cfs_rq_throttled(cfs_rq))
5313 add_nr_running(rq, task_delta);
5314 inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
5317 /* determine whether we need to wake up a potentially idle cpu */
5318 if (rq->curr == rq->idle && rq->cfs.nr_running)
5321 /* Log effect on hmp stats after un-throttling */
5322 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5323 sched_irqload(cpu_of(rq)),
5324 power_cost(cpu_of(rq), 0),
5325 cpu_temp(cpu_of(rq)));
5328 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
5329 u64 remaining, u64 expires)
5331 struct cfs_rq *cfs_rq;
5333 u64 starting_runtime = remaining;
5336 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5338 struct rq *rq = rq_of(cfs_rq);
5340 raw_spin_lock(&rq->lock);
5341 if (!cfs_rq_throttled(cfs_rq))
5344 runtime = -cfs_rq->runtime_remaining + 1;
5345 if (runtime > remaining)
5346 runtime = remaining;
5347 remaining -= runtime;
5349 cfs_rq->runtime_remaining += runtime;
5350 cfs_rq->runtime_expires = expires;
5352 /* we check whether we're throttled above */
5353 if (cfs_rq->runtime_remaining > 0)
5354 unthrottle_cfs_rq(cfs_rq);
5357 raw_spin_unlock(&rq->lock);
5364 return starting_runtime - remaining;
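/*
 * Worked example (illustrative numbers): with 5ms left in the pool and two
 * throttled cfs_rqs whose runtime_remaining is -3ms and -4ms, the first is
 * given 3ms + 1ns (just enough to go positive) and is unthrottled; the
 * second receives only the roughly 2ms that remain, stays non-positive and
 * therefore throttled, and the function reports about 5ms as the amount
 * handed out so the caller can subtract it from cfs_b->runtime.
 */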
5368 * Responsible for refilling a task_group's bandwidth and unthrottling its
5369 * cfs_rqs as appropriate. If there has been no activity within the last
5370 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5371 * used to track this state.
5373 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5375 u64 runtime, runtime_expires;
5378 /* no need to continue the timer with no bandwidth constraint */
5379 if (cfs_b->quota == RUNTIME_INF)
5380 goto out_deactivate;
5382 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5383 cfs_b->nr_periods += overrun;
5386 * idle depends on !throttled (for the case of a large deficit), and if
5387 * we're going inactive then everything else can be deferred
5389 if (cfs_b->idle && !throttled)
5390 goto out_deactivate;
5392 __refill_cfs_bandwidth_runtime(cfs_b);
5395 /* mark as potentially idle for the upcoming period */
5400 /* account preceding periods in which throttling occurred */
5401 cfs_b->nr_throttled += overrun;
5403 runtime_expires = cfs_b->runtime_expires;
5406 * This check is repeated as we are holding onto the new bandwidth while
5407 * we unthrottle. This can potentially race with an unthrottled group
5408 * trying to acquire new bandwidth from the global pool. This can result
5409 * in us over-using our runtime if it is all used during this loop, but
5410 * only by limited amounts in that extreme case.
5412 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
5413 runtime = cfs_b->runtime;
5414 cfs_b->distribute_running = 1;
5415 raw_spin_unlock(&cfs_b->lock);
5416 /* we can't nest cfs_b->lock while distributing bandwidth */
5417 runtime = distribute_cfs_runtime(cfs_b, runtime,
5419 raw_spin_lock(&cfs_b->lock);
5421 cfs_b->distribute_running = 0;
5422 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5424 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5428 * While we are ensured activity in the period following an
5429 * unthrottle, this also covers the case in which the new bandwidth is
5430 * insufficient to cover the existing bandwidth deficit. (Forcing the
5431 * timer to remain active while there are any throttled entities.)
5441 /* a cfs_rq won't donate quota below this amount */
5442 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5443 /* minimum remaining period time to redistribute slack quota */
5444 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5445 /* how long we wait to gather additional slack before distributing */
5446 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
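/*
 * Worked example (illustrative): a cfs_rq that dequeues its last task with
 * 3ms of local runtime left keeps min_cfs_rq_runtime (1ms) and returns 2ms
 * to the global pool in __return_cfs_rq_runtime(). If the pool then holds
 * more than one slice and other cfs_rqs are throttled, the slack timer is
 * armed for 5ms, unless a quota refresh is already due within the next 7ms
 * (slack period plus min_bandwidth_expiration).
 */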
5449 * Are we near the end of the current quota period?
5451 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
5452 * hrtimer base being cleared by hrtimer_start. In the case of
5453 * migrate_hrtimers, base is never cleared, so we are fine.
5455 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5457 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5460 /* if the call-back is running a quota refresh is already occurring */
5461 if (hrtimer_callback_running(refresh_timer))
5464 /* is a quota refresh about to occur? */
5465 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5466 if (remaining < min_expire)
5472 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5474 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5476 /* if there's a quota refresh soon don't bother with slack */
5477 if (runtime_refresh_within(cfs_b, min_left))
5480 hrtimer_start(&cfs_b->slack_timer,
5481 ns_to_ktime(cfs_bandwidth_slack_period),
5485 /* we know any runtime found here is valid as update_curr() precedes return */
5486 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5488 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5489 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5491 if (slack_runtime <= 0)
5494 raw_spin_lock(&cfs_b->lock);
5495 if (cfs_b->quota != RUNTIME_INF &&
5496 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
5497 cfs_b->runtime += slack_runtime;
5499 /* we are under rq->lock, defer unthrottling using a timer */
5500 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5501 !list_empty(&cfs_b->throttled_cfs_rq))
5502 start_cfs_slack_bandwidth(cfs_b);
5504 raw_spin_unlock(&cfs_b->lock);
5506 /* even if it's not valid for return we don't want to try again */
5507 cfs_rq->runtime_remaining -= slack_runtime;
5510 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5512 if (!cfs_bandwidth_used())
5515 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5518 __return_cfs_rq_runtime(cfs_rq);
5522 * This is done with a timer (instead of inline with bandwidth return) since
5523 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5525 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5527 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5530 /* confirm we're still not at a refresh boundary */
5531 raw_spin_lock(&cfs_b->lock);
5532 if (cfs_b->distribute_running) {
5533 raw_spin_unlock(&cfs_b->lock);
5537 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5538 raw_spin_unlock(&cfs_b->lock);
5542 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5543 runtime = cfs_b->runtime;
5545 expires = cfs_b->runtime_expires;
5547 cfs_b->distribute_running = 1;
5549 raw_spin_unlock(&cfs_b->lock);
5554 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
5556 raw_spin_lock(&cfs_b->lock);
5557 if (expires == cfs_b->runtime_expires)
5558 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5559 cfs_b->distribute_running = 0;
5560 raw_spin_unlock(&cfs_b->lock);
5564 * When a group wakes up we want to make sure that its quota is not already
5565 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
5566 * runtime as update_curr() throttling cannot trigger until it's on-rq.
5568 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5570 if (!cfs_bandwidth_used())
5573 /* Synchronize hierarchical throttle counter: */
5574 if (unlikely(!cfs_rq->throttle_uptodate)) {
5575 struct rq *rq = rq_of(cfs_rq);
5576 struct cfs_rq *pcfs_rq;
5577 struct task_group *tg;
5579 cfs_rq->throttle_uptodate = 1;
5581 /* Get closest up-to-date node, because leaves go first: */
5582 for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
5583 pcfs_rq = tg->cfs_rq[cpu_of(rq)];
5584 if (pcfs_rq->throttle_uptodate)
5588 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5589 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5593 /* an active group must be handled by the update_curr()->put() path */
5594 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5597 /* ensure the group is not already throttled */
5598 if (cfs_rq_throttled(cfs_rq))
5601 /* update runtime allocation */
5602 account_cfs_rq_runtime(cfs_rq, 0);
5603 if (cfs_rq->runtime_remaining <= 0)
5604 throttle_cfs_rq(cfs_rq);
5607 /* conditionally throttle active cfs_rq's from put_prev_entity() */
5608 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5610 if (!cfs_bandwidth_used())
5613 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5617 * it's possible for a throttled entity to be forced into a running
5618 * state (e.g. set_curr_task), in this case we're finished.
5620 if (cfs_rq_throttled(cfs_rq))
5623 throttle_cfs_rq(cfs_rq);
5627 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5629 struct cfs_bandwidth *cfs_b =
5630 container_of(timer, struct cfs_bandwidth, slack_timer);
5632 do_sched_cfs_slack_timer(cfs_b);
5634 return HRTIMER_NORESTART;
5637 extern const u64 max_cfs_quota_period;
5639 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5641 struct cfs_bandwidth *cfs_b =
5642 container_of(timer, struct cfs_bandwidth, period_timer);
5647 raw_spin_lock(&cfs_b->lock);
5649 overrun = hrtimer_forward_now(timer, cfs_b->period);
5654 u64 new, old = ktime_to_ns(cfs_b->period);
5656 new = (old * 147) / 128; /* ~115% */
5657 new = min(new, max_cfs_quota_period);
5659 cfs_b->period = ns_to_ktime(new);
5661 /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
5662 cfs_b->quota *= new;
5663 cfs_b->quota = div64_u64(cfs_b->quota, old);
5665 pr_warn_ratelimited(
5666 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
5668 div_u64(new, NSEC_PER_USEC),
5669 div_u64(cfs_b->quota, NSEC_PER_USEC));
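/*
 * Worked arithmetic (illustrative): 147/128 is roughly 1.148, so a 100us
 * period that keeps overrunning grows to about 114us, then about 131us,
 * and so on, capped at max_cfs_quota_period. The quota is multiplied by
 * the same new/old ratio just above, so the group's allowed bandwidth
 * fraction (quota/period) stays effectively unchanged.
 */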
5671 /* reset count so we don't come right back in here */
5675 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5678 cfs_b->period_active = 0;
5679 raw_spin_unlock(&cfs_b->lock);
5681 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5684 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5686 raw_spin_lock_init(&cfs_b->lock);
5688 cfs_b->quota = RUNTIME_INF;
5689 cfs_b->period = ns_to_ktime(default_cfs_period());
5691 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5692 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5693 cfs_b->period_timer.function = sched_cfs_period_timer;
5694 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5695 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5696 cfs_b->distribute_running = 0;
5699 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5701 cfs_rq->runtime_enabled = 0;
5702 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5703 init_cfs_rq_hmp_stats(cfs_rq);
5706 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5708 lockdep_assert_held(&cfs_b->lock);
5710 if (!cfs_b->period_active) {
5711 cfs_b->period_active = 1;
5712 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5713 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5717 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5719 /* init_cfs_bandwidth() was not called */
5720 if (!cfs_b->throttled_cfs_rq.next)
5723 hrtimer_cancel(&cfs_b->period_timer);
5724 hrtimer_cancel(&cfs_b->slack_timer);
5727 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5729 struct cfs_rq *cfs_rq;
5731 for_each_leaf_cfs_rq(rq, cfs_rq) {
5732 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
5734 raw_spin_lock(&cfs_b->lock);
5735 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5736 raw_spin_unlock(&cfs_b->lock);
5740 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5742 struct cfs_rq *cfs_rq;
5744 for_each_leaf_cfs_rq(rq, cfs_rq) {
5745 if (!cfs_rq->runtime_enabled)
5749 * clock_task is not advancing so we just need to make sure
5750 * there's some valid quota amount
5752 cfs_rq->runtime_remaining = 1;
5754 * Offline rq is schedulable till cpu is completely disabled
5755 * in take_cpu_down(), so we prevent new cfs throttling here.
5757 cfs_rq->runtime_enabled = 0;
5759 if (cfs_rq_throttled(cfs_rq))
5760 unthrottle_cfs_rq(cfs_rq);
5764 #else /* CONFIG_CFS_BANDWIDTH */
5765 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5767 return rq_clock_task(rq_of(cfs_rq));
5770 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5771 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5772 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5773 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5775 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5780 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5785 static inline int throttled_lb_pair(struct task_group *tg,
5786 int src_cpu, int dest_cpu)
5791 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5793 #ifdef CONFIG_FAIR_GROUP_SCHED
5794 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5797 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5801 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5802 static inline void update_runtime_enabled(struct rq *rq) {}
5803 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5805 #endif /* CONFIG_CFS_BANDWIDTH */
5807 /**************************************************
5808 * CFS operations on tasks:
5811 #ifdef CONFIG_SCHED_HRTICK
5812 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5814 struct sched_entity *se = &p->se;
5815 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5817 WARN_ON(task_rq(p) != rq);
5819 if (rq->cfs.h_nr_running > 1) {
5820 u64 slice = sched_slice(cfs_rq, se);
5821 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5822 s64 delta = slice - ran;
5829 hrtick_start(rq, delta);
5834 * called from enqueue/dequeue and updates the hrtick when the
5835 * current task is from our class.
5837 static void hrtick_update(struct rq *rq)
5839 struct task_struct *curr = rq->curr;
5841 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5844 hrtick_start_fair(rq, curr);
5846 #else /* !CONFIG_SCHED_HRTICK */
5848 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5852 static inline void hrtick_update(struct rq *rq)
5858 static bool __cpu_overutilized(int cpu, int delta);
5859 static bool cpu_overutilized(int cpu);
5860 unsigned long boosted_cpu_util(int cpu);
5862 #define boosted_cpu_util(cpu) cpu_util_freq(cpu)
5866 * The enqueue_task method is called before nr_running is
5867 * increased. Here we update the fair scheduling stats and
5868 * then put the task into the rbtree:
5871 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5873 struct cfs_rq *cfs_rq;
5874 struct sched_entity *se = &p->se;
5876 int task_new = flags & ENQUEUE_WAKEUP_NEW;
5880 * If in_iowait is set, the code below may not trigger any cpufreq
5881 * utilization updates, so do it here explicitly with the IOWAIT flag
5885 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
5887 for_each_sched_entity(se) {
5890 cfs_rq = cfs_rq_of(se);
5891 enqueue_entity(cfs_rq, se, flags);
5894 * end evaluation on encountering a throttled cfs_rq
5896 * note: in the case of encountering a throttled cfs_rq we will
5897 * post the final h_nr_running increment below.
5899 if (cfs_rq_throttled(cfs_rq))
5901 cfs_rq->h_nr_running++;
5902 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5904 flags = ENQUEUE_WAKEUP;
5907 for_each_sched_entity(se) {
5908 cfs_rq = cfs_rq_of(se);
5909 cfs_rq->h_nr_running++;
5910 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5912 if (cfs_rq_throttled(cfs_rq))
5915 update_load_avg(se, UPDATE_TG);
5916 update_cfs_shares(se);
5920 add_nr_running(rq, 1);
5921 inc_rq_hmp_stats(rq, p, 1);
5927 * Update SchedTune accounting.
5929 * We do it before updating the CPU capacity to ensure the
5930 * boost value of the current task is accounted for in the
5931 * selection of the OPP.
5933 * We do it also in the case where we enqueue a throttled task;
5934 * we could argue that a throttled task should not boost a CPU, however:
5936 * a) properly implementing CPU boosting considering throttled
5937 * tasks will increase a lot the complexity of the solution
5938 * b) it's not easy to quantify the benefits introduced by
5939 * such a more complex solution.
5940 * Thus, for the time being we go for the simple solution and boost
5941 * also for throttled RQs.
5943 schedtune_enqueue_task(p, cpu_of(rq));
5945 if (energy_aware() && !se) {
5946 if (!task_new && !rq->rd->overutilized &&
5947 cpu_overutilized(rq->cpu)) {
5948 rq->rd->overutilized = true;
5949 trace_sched_overutilized(true);
5953 #endif /* CONFIG_SMP */
5957 static void set_next_buddy(struct sched_entity *se);
5960 * The dequeue_task method is called before nr_running is
5961 * decreased. We remove the task from the rbtree and
5962 * update the fair scheduling stats:
5964 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5966 struct cfs_rq *cfs_rq;
5967 struct sched_entity *se = &p->se;
5968 int task_sleep = flags & DEQUEUE_SLEEP;
5970 for_each_sched_entity(se) {
5971 cfs_rq = cfs_rq_of(se);
5972 dequeue_entity(cfs_rq, se, flags);
5975 * end evaluation on encountering a throttled cfs_rq
5977 * note: in the case of encountering a throttled cfs_rq we will
5978 * post the final h_nr_running decrement below.
5980 if (cfs_rq_throttled(cfs_rq))
5982 cfs_rq->h_nr_running--;
5983 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
5985 /* Don't dequeue parent if it has other entities besides us */
5986 if (cfs_rq->load.weight) {
5987 /* Avoid re-evaluating load for this entity: */
5988 se = parent_entity(se);
5990 * Bias pick_next to pick a task from this cfs_rq, as
5991 * p is sleeping when it is within its sched_slice.
5993 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5997 flags |= DEQUEUE_SLEEP;
6000 for_each_sched_entity(se) {
6001 cfs_rq = cfs_rq_of(se);
6002 cfs_rq->h_nr_running--;
6003 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
6005 if (cfs_rq_throttled(cfs_rq))
6008 update_load_avg(se, UPDATE_TG);
6009 update_cfs_shares(se);
6013 sub_nr_running(rq, 1);
6014 dec_rq_hmp_stats(rq, p, 1);
6020 * Update SchedTune accounting
6022 * We do it before updating the CPU capacity to ensure the
6023 * boost value of the current task is accounted for in the
6024 * selection of the OPP.
6026 schedtune_dequeue_task(p, cpu_of(rq));
6028 #endif /* CONFIG_SMP */
6036 * per rq 'load' array crap; XXX kill this.
6040 * The exact cpuload at various idx values, calculated at every tick would be
6041 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
6043 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
6044 * on the nth tick when cpu may be busy, then we have:
6045 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6046 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
6048 * decay_load_missed() below does efficient calculation of
6049 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6050 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
6052 * The calculation is approximated on a 128 point scale.
6053 * degrade_zero_ticks is the number of ticks after which load at any
6054 * particular idx is approximated to be zero.
6055 * degrade_factor is a precomputed table, a row for each load idx.
6056 * Each column corresponds to degradation factor for a power of two ticks,
6057 * based on 128 point scale.
6059 * row 2, col 3 (=12) says that the degradation at load idx 2 after
6060 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
6062 * With this power of 2 load factors, we can degrade the load n times
6063 * by looking at 1 bits in n and doing as many mult/shift instead of
6064 * n mult/shifts needed by the exact degradation.
6066 #define DEGRADE_SHIFT 7
6067 static const unsigned char
6068 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
6069 static const unsigned char
6070 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
6071 {0, 0, 0, 0, 0, 0, 0, 0},
6072 {64, 32, 8, 0, 0, 0, 0, 0},
6073 {96, 72, 40, 12, 1, 0, 0, 0},
6074 {112, 98, 75, 43, 15, 1, 0, 0},
6075 {120, 112, 98, 76, 45, 16, 2, 0} };
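/*
 * Worked example (illustrative): decaying a load of 1024 at idx 2 across
 * 5 missed ticks. 5 is binary 101, so decay_load_missed() below applies
 * the 1-tick column and the 4-tick column:
 *
 *	1024 * 96/128 * 40/128 = 240
 *
 * which is close to the exact (3/4)^5 * 1024 = 243. Anything beyond
 * degrade_zero_ticks[2] = 32 missed ticks is simply treated as zero.
 */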
6078 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
6079 * would be when CPU is idle and so we just decay the old load without
6080 * adding any new load.
6082 static unsigned long
6083 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
6087 if (!missed_updates)
6090 if (missed_updates >= degrade_zero_ticks[idx])
6094 return load >> missed_updates;
6096 while (missed_updates) {
6097 if (missed_updates % 2)
6098 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
6100 missed_updates >>= 1;
6107 * Update rq->cpu_load[] statistics. This function is usually called every
6108 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
6109 * every tick. We fix it up based on jiffies.
6111 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
6112 unsigned long pending_updates)
6116 this_rq->nr_load_updates++;
6118 /* Update our load: */
6119 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
6120 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
6121 unsigned long old_load, new_load;
6123 /* scale is effectively 1 << i now, and >> i divides by scale */
6125 old_load = this_rq->cpu_load[i];
6126 old_load = decay_load_missed(old_load, pending_updates - 1, i);
6127 new_load = this_load;
6129 * Round up the averaging division if load is increasing. This
6130 * prevents us from getting stuck on 9 if the load is 10, for example.
6133 if (new_load > old_load)
6134 new_load += scale - 1;
6136 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
6139 sched_avg_update(this_rq);
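/*
 * Worked example (illustrative): for idx i = 2 (scale = 4), an old
 * cpu_load[2] of 400 and an instantaneous load of 800 give
 *
 *	new_load = 800 + (4 - 1) = 803		(round up, load is rising)
 *	cpu_load[2] = (400 * 3 + 803) >> 2 = 500
 *
 * so the higher-indexed entries drift toward the current load in smaller
 * steps each tick, giving progressively longer-term averages.
 */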
6142 /* Used instead of source_load when we know the type == 0 */
6143 static unsigned long weighted_cpuload(const int cpu)
6145 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
6148 #ifdef CONFIG_NO_HZ_COMMON
6150 * There is no sane way to deal with nohz on smp when using jiffies because the
6151 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
6152 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
6154 * Therefore we cannot use the delta approach from the regular tick since that
6155 * would seriously skew the load calculation. However we'll make do for those
6156 * updates happening while idle (nohz_idle_balance) or coming out of idle
6157 * (tick_nohz_idle_exit).
6159 * This means we might still be one tick off for nohz periods.
6163 * Called from nohz_idle_balance() to update the load ratings before doing the
6166 static void update_idle_cpu_load(struct rq *this_rq)
6168 unsigned long curr_jiffies = READ_ONCE(jiffies);
6169 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6170 unsigned long pending_updates;
6173 * bail if there's load or we're actually up-to-date.
6175 if (load || curr_jiffies == this_rq->last_load_update_tick)
6178 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6179 this_rq->last_load_update_tick = curr_jiffies;
6181 __update_cpu_load(this_rq, load, pending_updates);
6185 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
6187 void update_cpu_load_nohz(void)
6189 struct rq *this_rq = this_rq();
6190 unsigned long curr_jiffies = READ_ONCE(jiffies);
6191 unsigned long pending_updates;
6193 if (curr_jiffies == this_rq->last_load_update_tick)
6196 raw_spin_lock(&this_rq->lock);
6197 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6198 if (pending_updates) {
6199 this_rq->last_load_update_tick = curr_jiffies;
6201 * We were idle; this means load 0. The current load might be
6202 * !0 due to remote wakeups and the like.
6204 __update_cpu_load(this_rq, 0, pending_updates);
6206 raw_spin_unlock(&this_rq->lock);
6208 #endif /* CONFIG_NO_HZ */
6211 * Called from scheduler_tick()
6213 void update_cpu_load_active(struct rq *this_rq)
6215 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6217 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
6219 this_rq->last_load_update_tick = jiffies;
6220 __update_cpu_load(this_rq, load, 1);
6224 * Return a low guess at the load of a migration-source cpu weighted
6225 * according to the scheduling class and "nice" value.
6227 * We want to under-estimate the load of migration sources, to
6228 * balance conservatively.
6230 static unsigned long source_load(int cpu, int type)
6232 struct rq *rq = cpu_rq(cpu);
6233 unsigned long total = weighted_cpuload(cpu);
6235 if (type == 0 || !sched_feat(LB_BIAS))
6238 return min(rq->cpu_load[type-1], total);
6242 * Return a high guess at the load of a migration-target cpu weighted
6243 * according to the scheduling class and "nice" value.
6245 static unsigned long target_load(int cpu, int type)
6247 struct rq *rq = cpu_rq(cpu);
6248 unsigned long total = weighted_cpuload(cpu);
6250 if (type == 0 || !sched_feat(LB_BIAS))
6253 return max(rq->cpu_load[type-1], total);
6257 static unsigned long cpu_avg_load_per_task(int cpu)
6259 struct rq *rq = cpu_rq(cpu);
6260 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
6261 unsigned long load_avg = weighted_cpuload(cpu);
6264 return load_avg / nr_running;
6269 static void record_wakee(struct task_struct *p)
6272 * Rough decay (wiping) for cost saving; don't worry
6273 * about the boundary, a really active task won't care
6276 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6277 current->wakee_flips >>= 1;
6278 current->wakee_flip_decay_ts = jiffies;
6281 if (current->last_wakee != p) {
6282 current->last_wakee = p;
6283 current->wakee_flips++;
6287 static void task_waking_fair(struct task_struct *p)
6289 struct sched_entity *se = &p->se;
6290 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6293 #ifndef CONFIG_64BIT
6294 u64 min_vruntime_copy;
6297 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6299 min_vruntime = cfs_rq->min_vruntime;
6300 } while (min_vruntime != min_vruntime_copy);
6302 min_vruntime = cfs_rq->min_vruntime;
6305 se->vruntime -= min_vruntime;
6309 #ifdef CONFIG_FAIR_GROUP_SCHED
6311 * effective_load() calculates the load change as seen from the root_task_group
6313 * Adding load to a group doesn't make a group heavier, but can cause movement
6314 * of group shares between cpus. Assuming the shares were perfectly aligned one
6315 * can calculate the shift in shares.
6317 * Calculate the effective load difference if @wl is added (subtracted) to @tg
6318 * on this @cpu and results in a total addition (subtraction) of @wg to the
6319 * total group weight.
6321 * Given a runqueue weight distribution (rw_i) we can compute a shares
6322 * distribution (s_i) using:
6324 * s_i = rw_i / \Sum rw_j (1)
6326 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
6327 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
6328 * shares distribution (s_i):
6330 * rw_i = { 2, 4, 1, 0 }
6331 * s_i = { 2/7, 4/7, 1/7, 0 }
6333 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
6334 * task used to run on and the CPU the waker is running on), we need to
6335 * compute the effect of waking a task on either CPU and, in case of a sync
6336 * wakeup, compute the effect of the current task going to sleep.
6338 * So for a change of @wl to the local @cpu with an overall group weight change
6339 * of @wl we can compute the new shares distribution (s'_i) using:
6341 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
6343 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
6344 * differences in waking a task to CPU 0. The additional task changes the
6345 * weight and shares distributions like:
6347 * rw'_i = { 3, 4, 1, 0 }
6348 * s'_i = { 3/8, 4/8, 1/8, 0 }
6350 * We can then compute the difference in effective weight by using:
6352 * dw_i = S * (s'_i - s_i) (3)
6354 * Where 'S' is the group weight as seen by its parent.
6356 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
6357 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
6358 * 4/7) times the weight of the group.
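/*
 * Worked number (illustrative): taking the group weight S = 1024 in the
 * example above, waking the task on CPU 0 changes its effective load by
 * 1024 * 5/56, roughly +91, while CPU 1 sees 1024 * 4/56, roughly -73;
 * these are the kinds of shifts wake_affine() feeds into its load
 * comparison.
 */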
6360 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6362 struct sched_entity *se = tg->se[cpu];
6364 if (!tg->parent) /* the trivial, non-cgroup case */
6367 for_each_sched_entity(se) {
6368 struct cfs_rq *cfs_rq = se->my_q;
6369 long W, w = cfs_rq_load_avg(cfs_rq);
6374 * W = @wg + \Sum rw_j
6376 W = wg + atomic_long_read(&tg->load_avg);
6378 /* Ensure \Sum rw_j >= rw_i */
6379 W -= cfs_rq->tg_load_avg_contrib;
6388 * wl = S * s'_i; see (2)
6391 wl = (w * (long)tg->shares) / W;
6396 * Per the above, wl is the new se->load.weight value; since
6397 * those are clipped to [MIN_SHARES, ...) do so now. See
6398 * calc_cfs_shares().
6400 if (wl < MIN_SHARES)
6404 * wl = dw_i = S * (s'_i - s_i); see (3)
6406 wl -= se->avg.load_avg;
6409 * Recursively apply this logic to all parent groups to compute
6410 * the final effective load change on the root group. Since
6411 * only the @tg group gets extra weight, all parent groups can
6412 * only redistribute existing shares. @wl is the shift in shares
6413 * resulting from this level per the above.
6422 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6430 * Returns the current capacity of cpu after applying both
6431 * cpu and freq scaling.
6433 unsigned long capacity_curr_of(int cpu)
6435 return cpu_rq(cpu)->cpu_capacity_orig *
6436 arch_scale_freq_capacity(NULL, cpu)
6437 >> SCHED_CAPACITY_SHIFT;
6441 struct sched_group *sg_top;
6442 struct sched_group *sg_cap;
6450 struct task_struct *task;
6464 static int cpu_util_wake(int cpu, struct task_struct *p);
6467 * __cpu_norm_util() returns the cpu util relative to a specific capacity,
6468 * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
6469 * energy calculations.
6471 * Since util is a scale-invariant utilization defined as:
6473 * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
6475 * the normalized util can be found using the specific capacity.
6477 * capacity = capacity_orig * curr_freq/max_freq
6479 * norm_util = running_time/time ~ util/capacity
6481 static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
6483 if (util >= capacity)
6484 return SCHED_CAPACITY_SCALE;
6486 return (util << SCHED_CAPACITY_SHIFT)/capacity;
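/*
 * Worked example (illustrative): a cpu with util = 300 running at a
 * capacity of 600 yields (300 << 10) / 600 = 512, i.e. a 50% busy ratio on
 * the [0..SCHED_CAPACITY_SCALE] scale; anything at or above the capacity
 * saturates to SCHED_CAPACITY_SCALE (1024).
 */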
6489 static unsigned long group_max_util(struct energy_env *eenv)
6491 unsigned long max_util = 0;
6495 for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
6496 util = cpu_util_wake(cpu, eenv->task);
6499 * If we are looking at the target CPU specified by the eenv,
6500 * then we should add the (estimated) utilization of the task
6501 * assuming we will wake it up on that CPU.
6503 if (unlikely(cpu == eenv->trg_cpu))
6504 util += eenv->util_delta;
6506 max_util = max(max_util, util);
6513 * group_norm_util() returns the approximated group util relative to its
6514 * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
6515 * in energy calculations.
6517 * Since task executions may or may not overlap in time in the group the true
6518 * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
6519 * when iterating over all CPUs in the group.
6520 * The latter estimate is used as it leads to a more pessimistic energy
6521 * estimate (more busy).
6524 long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
6526 unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
6527 unsigned long util, util_sum = 0;
6530 for_each_cpu(cpu, sched_group_cpus(sg)) {
6531 util = cpu_util_wake(cpu, eenv->task);
6534 * If we are looking at the target CPU specified by the eenv,
6535 * then we should add the (estimated) utilization of the task
6536 * assuming we will wake it up on that CPU.
6538 if (unlikely(cpu == eenv->trg_cpu))
6539 util += eenv->util_delta;
6541 util_sum += __cpu_norm_util(util, capacity);
6544 return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
6547 static int find_new_capacity(struct energy_env *eenv,
6548 const struct sched_group_energy * const sge)
6550 int idx, max_idx = sge->nr_cap_states - 1;
6551 unsigned long util = group_max_util(eenv);
6553 /* default is max_cap if we don't find a match */
6554 eenv->cap_idx = max_idx;
6556 for (idx = 0; idx < sge->nr_cap_states; idx++) {
6557 if (sge->cap_states[idx].cap >= util) {
6558 eenv->cap_idx = idx;
6563 return eenv->cap_idx;
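/*
 * Illustrative example (hypothetical capacity table): with cap_states caps
 * of {150, 300, 600, 1024} and a group_max_util() of 350, the loop above
 * selects index 2 (600), the lowest capacity state that covers the
 * expected utilization; if the utilization exceeds even the largest state,
 * the default of max_idx is kept.
 */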
6566 static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
6568 int i, state = INT_MAX;
6569 int src_in_grp, dst_in_grp;
6572 /* Find the shallowest idle state in the sched group. */
6573 for_each_cpu(i, sched_group_cpus(sg))
6574 state = min(state, idle_get_state_idx(cpu_rq(i)));
6576 /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
6579 src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
6580 dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
6581 if (src_in_grp == dst_in_grp) {
6582 /* both CPUs under consideration are in the same group or in
6583 * neither group; migration should leave the idle state the same.
6589 * Try to estimate if a deeper idle state is
6590 * achievable when we move the task.
6592 for_each_cpu(i, sched_group_cpus(sg)) {
6593 grp_util += cpu_util_wake(i, eenv->task);
6594 if (unlikely(i == eenv->trg_cpu))
6595 grp_util += eenv->util_delta;
6599 ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
6600 /* after moving, this group is at most partly
6601 * occupied, so it should have some idle time.
6603 int max_idle_state_idx = sg->sge->nr_idle_states - 2;
6604 int new_state = grp_util * max_idle_state_idx;
6606 /* group will have no util, use lowest state */
6607 new_state = max_idle_state_idx + 1;
6609 /* for partially idle, linearly map util to idle
6610 * states, excluding the lowest one. This does not
6611 * correspond to the state we expect to enter in
6612 * reality, but it gives an indication of what might happen.
6614 new_state = min(max_idle_state_idx, (int)
6615 (new_state / sg->sgc->max_capacity));
6616 new_state = max_idle_state_idx - new_state;
6620 /* After moving, the group will be fully occupied
6621 * so assume it will not be idle at all.
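/*
 * Illustrative example of the idle-state estimate above (values invented):
 * with nr_idle_states = 4, max_idle_state_idx = 2, a single-CPU group with
 * max_capacity = 1024 and grp_util = 512 after the move:
 *   new_state = 512 * 2 = 1024;  1024 / 1024 = 1;  state = 2 - 1 = 1
 * i.e. a half-busy group maps to the middle idle state. grp_util = 0 maps
 * to the deepest state (max_idle_state_idx + 1 = 3), while a group busy
 * right up to its capacity maps to index 0, the shallowest state.
 */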
6630 * sched_group_energy(): Computes the absolute energy consumption of cpus
6631 * belonging to the sched_group including shared resources shared only by
6632 * members of the group. Iterates over all cpus in the hierarchy below the
6633 * sched_group starting from the bottom working its way up before going to
6634 * the next cpu until all cpus are covered at all levels. The current
6635 * implementation is likely to gather the same util statistics multiple times.
6636 * This can probably be done in a faster but more complex way.
6637 * Note: sched_group_energy() may fail when racing with sched_domain updates.
6639 static int sched_group_energy(struct energy_env *eenv)
6641 struct cpumask visit_cpus;
6642 u64 total_energy = 0;
6645 WARN_ON(!eenv->sg_top->sge);
6647 cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
6648 /* If a cpu is hotplugged in while we are in this function,
6649 * it does not appear in the existing visit_cpus mask
6650 * which came from the sched_group pointer of the
6651 * sched_domain pointed at by sd_ea for either the prev
6652 * or next cpu and was dereferenced in __energy_diff.
6653 * Since we will dereference sd_scs later as we iterate
6654 * through the CPUs we expect to visit, new CPUs can
6655 * be present which are not in the visit_cpus mask.
6656 * Guard this with cpu_count.
6658 cpu_count = cpumask_weight(&visit_cpus);
6660 while (!cpumask_empty(&visit_cpus)) {
6661 struct sched_group *sg_shared_cap = NULL;
6662 int cpu = cpumask_first(&visit_cpus);
6663 struct sched_domain *sd;
6666 * Is the group utilization affected by cpus outside this sched_group?
6668 * This sd may have groups with cpus which were not present
6669 * when we took visit_cpus.
6671 sd = rcu_dereference(per_cpu(sd_scs, cpu));
6673 if (sd && sd->parent)
6674 sg_shared_cap = sd->parent->groups;
6676 for_each_domain(cpu, sd) {
6677 struct sched_group *sg = sd->groups;
6679 /* Has this sched_domain already been visited? */
6680 if (sd->child && group_first_cpu(sg) != cpu)
6684 unsigned long group_util;
6685 int sg_busy_energy, sg_idle_energy;
6686 int cap_idx, idle_idx;
6688 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
6689 eenv->sg_cap = sg_shared_cap;
6693 cap_idx = find_new_capacity(eenv, sg->sge);
6695 if (sg->group_weight == 1) {
6696 /* Remove capacity of src CPU (before task move) */
6697 if (eenv->trg_cpu == eenv->src_cpu &&
6698 cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
6699 eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
6700 eenv->cap.delta -= eenv->cap.before;
6702 /* Add capacity of dst CPU (after task move) */
6703 if (eenv->trg_cpu == eenv->dst_cpu &&
6704 cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
6705 eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
6706 eenv->cap.delta += eenv->cap.after;
6710 idle_idx = group_idle_state(eenv, sg);
6711 group_util = group_norm_util(eenv, sg);
6713 sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
6714 sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
6715 * sg->sge->idle_states[idle_idx].power);
6717 total_energy += sg_busy_energy + sg_idle_energy;
6721 * cpu_count here is the number of
6722 * cpus we expect to visit in this
6723 * calculation. If we race against
6724 * hotplug, we can have extra cpus
6725 * added to the groups we are
6726 * iterating which do not appear in
6727 * the visit_cpus mask. In that case
6728 * we are not able to calculate energy
6729 * without restarting so we will bail
6730 * out and use prev_cpu this time.
6734 cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
6738 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
6741 } while (sg = sg->next, sg != sd->groups);
6745 * If we raced with hotplug and got an sd NULL-pointer;
6746 * returning a wrong energy estimation is better than
6747 * entering an infinite loop.
6748 * Specifically: If a cpu is unplugged after we took
6749 * the visit_cpus mask, it no longer has an sd_scs
6750 * pointer, so when we dereference it, we get NULL.
6752 if (cpumask_test_cpu(cpu, &visit_cpus))
6755 cpumask_clear_cpu(cpu, &visit_cpus);
6759 eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
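/*
 * Illustrative example of the accumulation above (power numbers invented):
 * for one group with group_norm_util() = 614, busy power = 400 and idle
 * power = 10:
 *   sg_busy_energy = 614 * 400          = 245600
 *   sg_idle_energy = (1024 - 614) * 10  =   4100
 * The >> SCHED_CAPACITY_SHIFT above scales the sum back down, so this group
 * contributes (245600 + 4100) / 1024 ~= 243 units to eenv->energy.
 */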
6763 static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
6765 return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
6768 static inline unsigned long task_util(struct task_struct *p);
6771 * energy_diff(): Estimate the energy impact of changing the utilization
6772 * distribution. eenv specifies the change: utilisation amount, source, and
6773 * destination cpu. Source or destination cpu may be -1 in which case the
6774 * utilization is removed from or added to the system (e.g. task wake-up). If
6775 * both are specified, the utilization is migrated.
6777 static inline int __energy_diff(struct energy_env *eenv)
6779 struct sched_domain *sd;
6780 struct sched_group *sg;
6781 int sd_cpu = -1, energy_before = 0, energy_after = 0;
6784 struct energy_env eenv_before = {
6785 .util_delta = task_util(eenv->task),
6786 .src_cpu = eenv->src_cpu,
6787 .dst_cpu = eenv->dst_cpu,
6788 .trg_cpu = eenv->src_cpu,
6789 .nrg = { 0, 0, 0, 0},
6794 if (eenv->src_cpu == eenv->dst_cpu)
6797 sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
6798 sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
6801 return 0; /* Error */
6806 if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
6807 eenv_before.sg_top = eenv->sg_top = sg;
6809 if (sched_group_energy(&eenv_before))
6810 return 0; /* Invalid result abort */
6811 energy_before += eenv_before.energy;
6813 /* Keep track of SRC cpu (before) capacity */
6814 eenv->cap.before = eenv_before.cap.before;
6815 eenv->cap.delta = eenv_before.cap.delta;
6817 if (sched_group_energy(eenv))
6818 return 0; /* Invalid result abort */
6819 energy_after += eenv->energy;
6821 } while (sg = sg->next, sg != sd->groups);
6823 eenv->nrg.before = energy_before;
6824 eenv->nrg.after = energy_after;
6825 eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
6827 #ifndef CONFIG_SCHED_TUNE
6828 trace_sched_energy_diff(eenv->task,
6829 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6830 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6831 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6832 eenv->nrg.delta, eenv->payoff);
6835 * Dead-zone margin preventing too many migrations.
6838 margin = eenv->nrg.before >> 6; /* ~1.56% */
6840 diff = eenv->nrg.after - eenv->nrg.before;
6842 eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
6844 return eenv->nrg.diff;
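/*
 * Illustrative example of the dead-zone above (numbers invented): with
 * nrg.before = 1000 the margin is 1000 >> 6 = 15. An estimated nrg.after
 * of 1008 gives |diff| = 8 < 15, so the difference is reported as 0 and
 * the ~0.8% saving is not considered worth a migration.
 */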
6847 #ifdef CONFIG_SCHED_TUNE
6849 struct target_nrg schedtune_target_nrg;
6851 #ifdef CONFIG_CGROUP_SCHEDTUNE
6852 extern bool schedtune_initialized;
6853 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6856 * System energy normalization
6857 * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
6858 * corresponding to the specified energy variation.
6861 normalize_energy(int energy_diff)
6865 #ifdef CONFIG_CGROUP_SCHEDTUNE
6866 /* during early setup, we don't know the extents */
6867 if (unlikely(!schedtune_initialized))
6868 return energy_diff < 0 ? -1 : 1 ;
6869 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6871 #ifdef CONFIG_SCHED_DEBUG
6875 /* Check for boundaries */
6876 max_delta = schedtune_target_nrg.max_power;
6877 max_delta -= schedtune_target_nrg.min_power;
6878 WARN_ON(abs(energy_diff) >= max_delta);
6882 /* Do scaling using positive numbers to increase the range */
6883 normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
6885 /* Scale by energy magnitude */
6886 normalized_nrg <<= SCHED_CAPACITY_SHIFT;
6888 /* Normalize on max energy for target platform */
6889 normalized_nrg = reciprocal_divide(
6890 normalized_nrg, schedtune_target_nrg.rdiv);
6892 return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
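/*
 * Illustrative example (power range invented): if the platform's
 * max_power - min_power span is 800, schedtune_target_nrg.rdiv encodes a
 * division by 800, so an energy_diff of 50 is normalized to
 * (50 << SCHED_CAPACITY_SHIFT) / 800 = 64, i.e. ~6% of the full
 * [0..SCHED_CAPACITY_SCALE] range; the sign is restored on return.
 */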
6896 energy_diff(struct energy_env *eenv)
6898 int boost = schedtune_task_boost(eenv->task);
6901 /* Compute "absolute" energy diff */
6902 __energy_diff(eenv);
6904 /* Return energy diff when boost margin is 0 */
6906 trace_sched_energy_diff(eenv->task,
6907 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6908 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6909 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6910 0, -eenv->nrg.diff);
6911 return eenv->nrg.diff;
6914 /* Compute normalized energy diff */
6915 nrg_delta = normalize_energy(eenv->nrg.diff);
6916 eenv->nrg.delta = nrg_delta;
6918 eenv->payoff = schedtune_accept_deltas(
6923 trace_sched_energy_diff(eenv->task,
6924 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6925 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6926 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6927 eenv->nrg.delta, eenv->payoff);
6930 * When SchedTune is enabled, the energy_diff() function will return
6931 * the computed energy payoff value. Since the energy_diff() return
6932 * value is expected to be negative by its callers, this evaluation
6933 * function returns a negative value each time the evaluation returns a
6934 * positive payoff, which is the condition for the acceptance of
6935 * a scheduling decision.
6937 return -eenv->payoff;
6939 #else /* CONFIG_SCHED_TUNE */
6940 #define energy_diff(eenv) __energy_diff(eenv)
6944 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
6945 * A waker of many should wake a different task than the one last awakened
6946 * at a frequency roughly N times higher than one of its wakees. In order
6947 * to determine whether we should let the load spread vs consolidating to
6948 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
6949 * partner, and a factor of llc_size higher frequency in the other. With
6950 * both conditions met, we can be relatively sure that the relationship is
6951 * non-monogamous, with partner count exceeding socket size. Waker/wakee
6952 * being client/server, worker/dispatcher, interrupt source or whatever is
6953 * irrelevant; the spread criterion is that apparent partner count exceeds socket size.
6955 static int wake_wide(struct task_struct *p, int sibling_count_hint)
6957 unsigned int master = current->wakee_flips;
6958 unsigned int slave = p->wakee_flips;
6959 int llc_size = this_cpu_read(sd_llc_size);
6961 if (sibling_count_hint >= llc_size)
6965 swap(master, slave);
6966 if (slave < llc_size || master < slave * llc_size)
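/*
 * Illustrative example (flip counts invented): with llc_size = 4, a waker
 * whose wakee_flips = 40 and a wakee whose wakee_flips = 6, neither
 * condition above holds (6 >= 4 and 40 >= 6 * 4), so wake_wide() reports
 * an M:N relationship and the wakeup is allowed to spread instead of being
 * pulled onto the waker's LLC.
 */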
6971 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6972 int prev_cpu, int sync)
6974 s64 this_load, load;
6975 s64 this_eff_load, prev_eff_load;
6977 struct task_group *tg;
6978 unsigned long weight;
6982 this_cpu = smp_processor_id();
6983 load = source_load(prev_cpu, idx);
6984 this_load = target_load(this_cpu, idx);
6987 * If sync wakeup then subtract the (maximum possible)
6988 * effect of the currently running task from the load
6989 * of the current CPU:
6992 tg = task_group(current);
6993 weight = current->se.avg.load_avg;
6995 this_load += effective_load(tg, this_cpu, -weight, -weight);
6996 load += effective_load(tg, prev_cpu, 0, -weight);
7000 weight = p->se.avg.load_avg;
7003 * In low-load situations, where prev_cpu is idle and this_cpu is idle
7004 * due to the sync cause above having dropped this_load to 0, we'll
7005 * always have an imbalance, but there's really nothing you can do
7006 * about that, so that's good too.
7008 * Otherwise check if either cpus are near enough in load to allow this
7009 * task to be woken on this_cpu.
7011 this_eff_load = 100;
7012 this_eff_load *= capacity_of(prev_cpu);
7014 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
7015 prev_eff_load *= capacity_of(this_cpu);
7017 if (this_load > 0) {
7018 this_eff_load *= this_load +
7019 effective_load(tg, this_cpu, weight, weight);
7021 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
7024 balanced = this_eff_load <= prev_eff_load;
7026 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
7031 schedstat_inc(sd, ttwu_move_affine);
7032 schedstat_inc(p, se.statistics.nr_wakeups_affine);
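/*
 * Illustrative example of the comparison above (numbers invented; assuming
 * the task runs in the root group, so effective_load() reduces to the raw
 * weight delta): with imbalance_pct = 125, equal CPU capacities (which
 * cancel out), this_load = 200 after the sync adjustment, prev_cpu load =
 * 300 and task weight = 100:
 *   this_eff_load = 100 * (200 + 100) = 30000
 *   prev_eff_load = 112 * (300 + 0)   = 33600
 * so the wakeup is considered balanced and the affine wakeup is allowed.
 */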
7037 static inline unsigned long task_util(struct task_struct *p)
7039 return p->se.avg.util_avg;
7042 static inline unsigned long boosted_task_util(struct task_struct *task);
7044 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
7046 unsigned long capacity = capacity_of(cpu);
7048 util += boosted_task_util(p);
7050 return (capacity * 1024) > (util * capacity_margin);
7053 static inline bool task_fits_max(struct task_struct *p, int cpu)
7055 unsigned long capacity = capacity_of(cpu);
7056 unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7058 if (capacity == max_capacity)
7061 if (capacity * capacity_margin > max_capacity * 1024)
7064 return __task_fits(p, cpu, 0);
7067 static bool __cpu_overutilized(int cpu, int delta)
7069 return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
7072 static bool cpu_overutilized(int cpu)
7074 return __cpu_overutilized(cpu, 0);
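/*
 * Illustrative example (assuming the default capacity_margin of 1280, i.e.
 * roughly 20% headroom is required): a CPU with capacity_of() = 430 and
 * cpu_util() + delta = 350 is overutilized since
 * 430 * 1024 = 440320 < 350 * 1280 = 448000, while a util of 340 would
 * still fit (340 * 1280 = 435200).
 */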
7077 #ifdef CONFIG_SCHED_TUNE
7079 struct reciprocal_value schedtune_spc_rdiv;
7082 schedtune_margin(unsigned long signal, long boost)
7084 long long margin = 0;
7087 * Signal proportional compensation (SPC)
7089 * The Boost (B) value is used to compute a Margin (M) which is
7090 * proportional to the complement of the original Signal (S):
7091 * M = B * (SCHED_CAPACITY_SCALE - S)
7092 * The obtained M could be used by the caller to "boost" S.
7095 margin = SCHED_CAPACITY_SCALE - signal;
7098 margin = -signal * boost;
7100 margin = reciprocal_divide(margin, schedtune_spc_rdiv);
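/*
 * Illustrative worked example (assuming schedtune_spc_rdiv encodes a
 * division by 100, i.e. boost is a percentage): for signal = 200 and
 * boost = 10, margin = (1024 - 200) * 10 / 100 = 82, so the boosted
 * signal becomes 200 + 82 = 282. For a negative boost the margin is
 * instead proportional to the signal itself and is applied with a
 * negative sign, e.g. boost = -10 on the same signal gives 200 - 20 = 180.
 */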
7108 schedtune_cpu_margin(unsigned long util, int cpu)
7110 int boost = schedtune_cpu_boost(cpu);
7115 return schedtune_margin(util, boost);
7119 schedtune_task_margin(struct task_struct *task)
7121 int boost = schedtune_task_boost(task);
7128 util = task_util(task);
7129 margin = schedtune_margin(util, boost);
7134 #else /* CONFIG_SCHED_TUNE */
7137 schedtune_cpu_margin(unsigned long util, int cpu)
7143 schedtune_task_margin(struct task_struct *task)
7148 #endif /* CONFIG_SCHED_TUNE */
7151 boosted_cpu_util(int cpu)
7153 unsigned long util = cpu_util_freq(cpu);
7154 long margin = schedtune_cpu_margin(util, cpu);
7156 trace_sched_boost_cpu(cpu, util, margin);
7158 return util + margin;
7161 static inline unsigned long
7162 boosted_task_util(struct task_struct *task)
7164 unsigned long util = task_util(task);
7165 long margin = schedtune_task_margin(task);
7167 trace_sched_boost_task(task, util, margin);
7169 return util + margin;
7172 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
7174 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
7178 * find_idlest_group finds and returns the least busy CPU group within the
7181 * Assumes p is allowed on at least one CPU in sd.
7183 static struct sched_group *
7184 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
7185 int this_cpu, int sd_flag)
7187 struct sched_group *idlest = NULL, *group = sd->groups;
7188 struct sched_group *most_spare_sg = NULL;
7189 unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
7190 unsigned long most_spare = 0, this_spare = 0;
7191 int load_idx = sd->forkexec_idx;
7192 int imbalance = 100 + (sd->imbalance_pct-100)/2;
7194 if (sd_flag & SD_BALANCE_WAKE)
7195 load_idx = sd->wake_idx;
7198 unsigned long load, avg_load, spare_cap, max_spare_cap;
7202 /* Skip over this group if it has no CPUs allowed */
7203 if (!cpumask_intersects(sched_group_cpus(group),
7204 tsk_cpus_allowed(p)))
7207 local_group = cpumask_test_cpu(this_cpu,
7208 sched_group_cpus(group));
7211 * Tally up the load of all CPUs in the group and find
7212 * the group containing the CPU with most spare capacity.
7217 for_each_cpu(i, sched_group_cpus(group)) {
7218 /* Bias balancing toward cpus of our domain */
7220 load = source_load(i, load_idx);
7222 load = target_load(i, load_idx);
7226 spare_cap = capacity_spare_wake(i, p);
7228 if (spare_cap > max_spare_cap)
7229 max_spare_cap = spare_cap;
7232 /* Adjust by relative CPU capacity of the group */
7233 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
7236 this_load = avg_load;
7237 this_spare = max_spare_cap;
7239 if (avg_load < min_load) {
7240 min_load = avg_load;
7244 if (most_spare < max_spare_cap) {
7245 most_spare = max_spare_cap;
7246 most_spare_sg = group;
7249 } while (group = group->next, group != sd->groups);
7252 * The cross-over point between using spare capacity or least load
7253 * is too conservative for high utilization tasks on partially
7254 * utilized systems if we require spare_capacity > task_util(p),
7255 * so we allow for some task stuffing by using
7256 * spare_capacity > task_util(p)/2.
7258 * Spare capacity can't be used for fork because the utilization has
7259 * not been set yet, we must first select a rq to compute the initial
7262 if (sd_flag & SD_BALANCE_FORK)
7265 if (this_spare > task_util(p) / 2 &&
7266 imbalance*this_spare > 100*most_spare)
7268 else if (most_spare > task_util(p) / 2)
7269 return most_spare_sg;
7272 if (!idlest || 100*this_load < imbalance*min_load)
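/*
 * Illustrative example of the spare-capacity cross-over above (numbers
 * invented): for task_util(p) = 200, imbalance = 112, this_spare = 150 and
 * most_spare = 120, the local group wins because 150 > 200/2 and
 * 112 * 150 = 16800 > 100 * 120 = 12000, so NULL is returned and the task
 * stays local. With this_spare = 80 instead, most_spare_sg would be
 * returned since most_spare = 120 > 200/2.
 */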
7278 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
7281 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7283 unsigned long load, min_load = ULONG_MAX;
7284 unsigned int min_exit_latency = UINT_MAX;
7285 u64 latest_idle_timestamp = 0;
7286 int least_loaded_cpu = this_cpu;
7287 int shallowest_idle_cpu = -1;
7290 /* Check if we have any choice: */
7291 if (group->group_weight == 1)
7292 return cpumask_first(sched_group_cpus(group));
7294 /* Traverse only the allowed CPUs */
7295 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
7297 struct rq *rq = cpu_rq(i);
7298 struct cpuidle_state *idle = idle_get_state(rq);
7299 if (idle && idle->exit_latency < min_exit_latency) {
7301 * We give priority to a CPU whose idle state
7302 * has the smallest exit latency irrespective
7303 * of any idle timestamp.
7305 min_exit_latency = idle->exit_latency;
7306 latest_idle_timestamp = rq->idle_stamp;
7307 shallowest_idle_cpu = i;
7308 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7309 rq->idle_stamp > latest_idle_timestamp) {
7311 * If equal or no active idle state, then
7312 * the most recently idled CPU might have
7315 latest_idle_timestamp = rq->idle_stamp;
7316 shallowest_idle_cpu = i;
7318 } else if (shallowest_idle_cpu == -1) {
7319 load = weighted_cpuload(i);
7320 if (load < min_load || (load == min_load && i == this_cpu)) {
7322 least_loaded_cpu = i;
7327 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7330 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
7331 int cpu, int prev_cpu, int sd_flag)
7334 int wu = sd_flag & SD_BALANCE_WAKE;
7338 schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
7339 schedstat_inc(this_rq(), eas_stats.cas_attempts);
7342 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
7346 struct sched_group *group;
7347 struct sched_domain *tmp;
7351 schedstat_inc(sd, eas_stats.cas_attempts);
7353 if (!(sd->flags & sd_flag)) {
7358 group = find_idlest_group(sd, p, cpu, sd_flag);
7364 new_cpu = find_idlest_group_cpu(group, p, cpu);
7365 if (new_cpu == cpu) {
7366 /* Now try balancing at a lower domain level of cpu */
7371 /* Now try balancing at a lower domain level of new_cpu */
7372 cpu = cas_cpu = new_cpu;
7373 weight = sd->span_weight;
7375 for_each_domain(cpu, tmp) {
7376 if (weight <= tmp->span_weight)
7378 if (tmp->flags & sd_flag)
7381 /* while loop will break here if sd == NULL */
7384 if (wu && (cas_cpu >= 0)) {
7385 schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
7386 schedstat_inc(this_rq(), eas_stats.cas_count);
7393 * Try and locate an idle CPU in the sched_domain.
7395 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7397 struct sched_domain *sd;
7398 struct sched_group *sg;
7399 int best_idle_cpu = -1;
7400 int best_idle_cstate = INT_MAX;
7401 unsigned long best_idle_capacity = ULONG_MAX;
7403 schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
7404 schedstat_inc(this_rq(), eas_stats.sis_attempts);
7406 if (!sysctl_sched_cstate_aware) {
7407 if (idle_cpu(target)) {
7408 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
7409 schedstat_inc(this_rq(), eas_stats.sis_idle);
7414 * If the previous cpu is cache affine and idle, don't be stupid.
7416 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
7417 schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
7418 schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
7423 if (!(current->flags & PF_WAKE_UP_IDLE) &&
7424 !(p->flags & PF_WAKE_UP_IDLE))
7428 * Otherwise, iterate the domains and find an eligible idle cpu.
7430 sd = rcu_dereference(per_cpu(sd_llc, target));
7431 for_each_lower_domain(sd) {
7435 if (!cpumask_intersects(sched_group_cpus(sg),
7436 tsk_cpus_allowed(p)))
7439 if (sysctl_sched_cstate_aware) {
7440 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7441 int idle_idx = idle_get_state_idx(cpu_rq(i));
7442 unsigned long new_usage = boosted_task_util(p);
7443 unsigned long capacity_orig = capacity_orig_of(i);
7445 if (new_usage > capacity_orig || !idle_cpu(i))
7448 if (i == target && new_usage <= capacity_curr_of(target)) {
7449 schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
7450 schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
7451 schedstat_inc(sd, eas_stats.sis_suff_cap);
7455 if (idle_idx < best_idle_cstate &&
7456 capacity_orig <= best_idle_capacity) {
7458 best_idle_cstate = idle_idx;
7459 best_idle_capacity = capacity_orig;
7463 for_each_cpu(i, sched_group_cpus(sg)) {
7464 if (i == target || !idle_cpu(i))
7468 target = cpumask_first_and(sched_group_cpus(sg),
7469 tsk_cpus_allowed(p));
7470 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
7471 schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
7472 schedstat_inc(sd, eas_stats.sis_idle_cpu);
7477 } while (sg != sd->groups);
7480 if (best_idle_cpu >= 0)
7481 target = best_idle_cpu;
7484 schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
7485 schedstat_inc(this_rq(), eas_stats.sis_count);
7491 * cpu_util_wake: Compute cpu utilization with any contributions from
7492 * the waking task p removed. check_for_migration() looks for a better CPU of
7493 * rq->curr. For that case we should return cpu util with contributions from
7494 * currently running task p removed.
7496 static int cpu_util_wake(int cpu, struct task_struct *p)
7498 unsigned long util, capacity;
7500 #ifdef CONFIG_SCHED_WALT
7502 * WALT does not decay idle tasks in the same manner
7503 * as PELT, so it makes little sense to subtract task
7504 * utilization from cpu utilization. Instead just use
7505 * cpu_util for this case.
7507 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7508 p->state == TASK_WAKING)
7509 return cpu_util(cpu);
7511 /* Task has no contribution or is new */
7512 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
7513 return cpu_util(cpu);
7515 capacity = capacity_orig_of(cpu);
7516 util = max_t(long, cpu_util(cpu) - task_util(p), 0);
7518 return (util >= capacity) ? capacity : util;
7521 static int start_cpu(bool boosted)
7523 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7525 return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
7528 static inline int find_best_target(struct task_struct *p, int *backup_cpu,
7529 bool boosted, bool prefer_idle)
7531 unsigned long best_idle_min_cap_orig = ULONG_MAX;
7532 unsigned long min_util = boosted_task_util(p);
7533 unsigned long target_capacity = ULONG_MAX;
7534 unsigned long min_wake_util = ULONG_MAX;
7535 unsigned long target_max_spare_cap = 0;
7536 unsigned long best_active_util = ULONG_MAX;
7537 int best_idle_cstate = INT_MAX;
7538 struct sched_domain *sd;
7539 struct sched_group *sg;
7540 int best_active_cpu = -1;
7541 int best_idle_cpu = -1;
7542 int target_cpu = -1;
7547 schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
7548 schedstat_inc(this_rq(), eas_stats.fbt_attempts);
7550 /* Find start CPU based on boost value */
7551 cpu = start_cpu(boosted);
7553 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
7554 schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
7558 /* Find SD for the start CPU */
7559 sd = rcu_dereference(per_cpu(sd_ea, cpu));
7561 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
7562 schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
7566 /* Scan CPUs in all SDs */
7569 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7570 unsigned long capacity_curr = capacity_curr_of(i);
7571 unsigned long capacity_orig = capacity_orig_of(i);
7572 unsigned long wake_util, new_util;
7577 if (walt_cpu_high_irqload(i))
7581 * p's blocked utilization is still accounted for on prev_cpu
7582 * so prev_cpu will receive a negative bias due to the double
7583 * accounting. However, the blocked utilization may be zero.
7585 wake_util = cpu_util_wake(i, p);
7586 new_util = wake_util + task_util(p);
7589 * Ensure minimum capacity to grant the required boost.
7590 * The target CPU can be already at a capacity level higher
7591 * than the one required to boost the task.
7593 new_util = max(min_util, new_util);
7594 if (new_util > capacity_orig)
7598 * Case A) Latency sensitive tasks
7600 * Unconditionally favoring tasks that prefer an idle CPU, to improve latency. Looking for:
7604 * - an idle CPU, whatever its idle_state is, since
7605 * the first CPUs we explore are more likely to be
7606 * reserved for latency sensitive tasks.
7607 * - a non idle CPU where the task fits in its current
7608 * capacity and has the maximum spare capacity.
7609 * - a non idle CPU with lower contention from other
7610 * tasks and running at the lowest possible OPP.
7612 * The last two goals try to favor a non idle CPU
7613 * where the task can run as if it is "almost alone".
7614 * A maximum spare capacity CPU is favoured since
7615 * the task already fits into that CPU's capacity
7616 * without waiting for an OPP chance.
7618 * The following code path is the only one in the CPUs
7619 * exploration loop which is always used by
7620 * prefer_idle tasks. It exits the loop with either a
7621 * best_active_cpu or a target_cpu which should
7622 * represent an optimal choice for latency sensitive
7628 * Case A.1: IDLE CPU
7629 * Return the first IDLE CPU we find.
7632 schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
7633 schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
7635 trace_sched_find_best_target(p,
7636 prefer_idle, min_util,
7638 best_active_cpu, i);
7644 * Case A.2: Target ACTIVE CPU
7645 * Favor CPUs with max spare capacity.
7647 if ((capacity_curr > new_util) &&
7648 (capacity_orig - new_util > target_max_spare_cap)) {
7649 target_max_spare_cap = capacity_orig - new_util;
7653 if (target_cpu != -1)
7658 * Case A.3: Backup ACTIVE CPU
7660 * - lower utilization due to other tasks
7661 * - lower utilization with the task in
7663 if (wake_util > min_wake_util)
7665 if (new_util > best_active_util)
7667 min_wake_util = wake_util;
7668 best_active_util = new_util;
7669 best_active_cpu = i;
7676 * For non latency sensitive tasks, skip CPUs that
7677 * will be overutilized by moving the task there.
7679 * The goal here is to remain in EAS mode as long as
7680 * possible at least for !prefer_idle tasks.
7682 if ((new_util * capacity_margin) >
7683 (capacity_orig * SCHED_CAPACITY_SCALE))
7687 * Case B) Non latency sensitive tasks on IDLE CPUs.
7689 * Find an optimal backup IDLE CPU for non latency sensitive tasks by:
7693 * - minimizing the capacity_orig,
7694 * i.e. preferring LITTLE CPUs
7695 * - favoring shallowest idle states
7696 * i.e. avoid to wakeup deep-idle CPUs
7698 * The following code path is used by non latency
7699 * sensitive tasks if IDLE CPUs are available. If at
7700 * least one such CPU is available, it sets best_idle_cpu to the most
7701 * suitable idle CPU to be used when no suitable ACTIVE CPU is found.
7704 * If idle CPUs are available, favour these CPUs to
7705 * improve performance by spreading tasks.
7706 * Indeed, the energy_diff() computed by the caller
7707 * will take care to ensure the minimization of energy
7708 * consumptions without affecting performance.
7711 int idle_idx = idle_get_state_idx(cpu_rq(i));
7713 /* Select idle CPU with lower cap_orig */
7714 if (capacity_orig > best_idle_min_cap_orig)
7718 * Skip CPUs in deeper idle state, but only
7719 * if they are also less energy efficient.
7720 * IOW, prefer a deep IDLE LITTLE CPU vs a
7721 * shallow idle big CPU.
7723 if (sysctl_sched_cstate_aware &&
7724 best_idle_cstate <= idle_idx)
7727 /* Keep track of best idle CPU */
7728 best_idle_min_cap_orig = capacity_orig;
7729 best_idle_cstate = idle_idx;
7735 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
7737 * Pack tasks in the most energy efficient capacities.
7739 * This task packing strategy prefers more energy
7740 * efficient CPUs (i.e. pack on smaller maximum
7741 * capacity CPUs) while also trying to spread tasks to
7742 * run them all at the lower OPP.
7744 * This assumes for example that it's more energy
7745 * efficient to run two tasks on two CPUs at a lower
7746 * OPP than packing both on a single CPU but running
7747 * that CPU at a higher OPP.
7749 * Thus, this case keeps track of the CPU with the
7750 * smallest maximum capacity and highest spare maximum capacity.
7754 /* Favor CPUs with smaller capacity */
7755 if (capacity_orig > target_capacity)
7758 /* Favor CPUs with maximum spare capacity */
7759 if ((capacity_orig - new_util) < target_max_spare_cap)
7762 target_max_spare_cap = capacity_orig - new_util;
7763 target_capacity = capacity_orig;
7767 } while (sg = sg->next, sg != sd->groups);
7770 * For non latency sensitive tasks, cases B and C in the previous loop,
7771 * we pick the best IDLE CPU only if we were not able to find a target ACTIVE CPU.
7774 * Policy priorities:
7776 * - prefer_idle tasks:
7778 * a) IDLE CPU available, we return immediately
7779 * b) ACTIVE CPU where task fits and has the bigger maximum spare
7780 * capacity (i.e. target_cpu)
7781 * c) ACTIVE CPU with less contention due to other tasks
7782 * (i.e. best_active_cpu)
7784 * - NON prefer_idle tasks:
7786 * a) ACTIVE CPU: target_cpu
7787 * b) IDLE CPU: best_idle_cpu
7789 if (target_cpu == -1)
7790 target_cpu = prefer_idle
7794 *backup_cpu = prefer_idle
7798 trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
7799 best_idle_cpu, best_active_cpu,
7802 schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
7803 schedstat_inc(this_rq(), eas_stats.fbt_count);
7809 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
7810 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
7812 * In that case WAKE_AFFINE doesn't make sense and we'll let
7813 * BALANCE_WAKE sort things out.
7815 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
7817 long min_cap, max_cap;
7819 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
7820 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7822 /* Minimum capacity is close to max, no need to abort wake_affine */
7823 if (max_cap - min_cap < max_cap >> 3)
7826 /* Bring task utilization in sync with prev_cpu */
7827 sync_entity_load_avg(&p->se);
7829 return min_cap * 1024 < task_util(p) * capacity_margin;
7832 static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
7834 struct sched_domain *sd;
7835 int target_cpu = prev_cpu, tmp_target, tmp_backup;
7836 bool boosted, prefer_idle;
7838 schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
7839 schedstat_inc(this_rq(), eas_stats.secb_attempts);
7841 if (sysctl_sched_sync_hint_enable && sync) {
7842 int cpu = smp_processor_id();
7844 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
7845 schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
7846 schedstat_inc(this_rq(), eas_stats.secb_sync);
7852 #ifdef CONFIG_CGROUP_SCHEDTUNE
7853 boosted = schedtune_task_boost(p) > 0;
7854 prefer_idle = schedtune_prefer_idle(p) > 0;
7856 boosted = get_sysctl_sched_cfs_boost() > 0;
7860 sync_entity_load_avg(&p->se);
7862 sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
7863 /* Find a cpu with sufficient capacity */
7864 tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
7868 if (tmp_target >= 0) {
7869 target_cpu = tmp_target;
7870 if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
7871 schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
7872 schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
7877 if (target_cpu != prev_cpu) {
7879 struct energy_env eenv = {
7880 .util_delta = task_util(p),
7881 .src_cpu = prev_cpu,
7882 .dst_cpu = target_cpu,
7884 .trg_cpu = target_cpu,
7888 #ifdef CONFIG_SCHED_WALT
7889 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7890 p->state == TASK_WAKING)
7891 delta = task_util(p);
7893 /* Not enough spare capacity on previous cpu */
7894 if (__cpu_overutilized(prev_cpu, delta)) {
7895 schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
7896 schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
7900 if (energy_diff(&eenv) >= 0) {
7901 /* No energy saving for target_cpu, try backup */
7902 target_cpu = tmp_backup;
7903 eenv.dst_cpu = target_cpu;
7904 eenv.trg_cpu = target_cpu;
7905 if (tmp_backup < 0 ||
7906 tmp_backup == prev_cpu ||
7907 energy_diff(&eenv) >= 0) {
7908 schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
7909 schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
7910 target_cpu = prev_cpu;
7915 schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
7916 schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
7920 schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
7921 schedstat_inc(this_rq(), eas_stats.secb_count);
7930 * select_task_rq_fair: Select target runqueue for the waking task in domains
7931 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
7932 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
7934 * Balances load by selecting the idlest cpu in the idlest group, or under
7935 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
7937 * Returns the target cpu number.
7939 * preempt must be disabled.
7942 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7943 int sibling_count_hint)
7945 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
7946 int cpu = smp_processor_id();
7947 int new_cpu = prev_cpu;
7948 int want_affine = 0;
7949 int sync = wake_flags & WF_SYNC;
7951 #ifdef CONFIG_SCHED_HMP
7952 return select_best_cpu(p, prev_cpu, 0, sync);
7955 if (sd_flag & SD_BALANCE_WAKE) {
7957 want_affine = !wake_wide(p, sibling_count_hint) &&
7958 !wake_cap(p, cpu, prev_cpu) &&
7959 cpumask_test_cpu(cpu, &p->cpus_allowed);
7962 if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
7963 return select_energy_cpu_brute(p, prev_cpu, sync);
7966 for_each_domain(cpu, tmp) {
7967 if (!(tmp->flags & SD_LOAD_BALANCE))
7971 * If both cpu and prev_cpu are part of this domain,
7972 * cpu is a valid SD_WAKE_AFFINE target.
7974 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
7975 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
7980 if (tmp->flags & sd_flag)
7982 else if (!want_affine)
7987 sd = NULL; /* Prefer wake_affine over balance flags */
7988 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
7992 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
7994 * We're going to need the task's util for capacity_spare_wake
7995 * in find_idlest_group. Sync it up to prev_cpu's last_update_time.
7998 sync_entity_load_avg(&p->se);
8002 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
8003 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8006 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
8014 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
8015 * cfs_rq_of(p) references at time of call are still valid and identify the
8016 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
8017 * other assumptions, including the state of rq->lock, should be made.
8019 static void migrate_task_rq_fair(struct task_struct *p)
8022 * We are supposed to update the task to "current" time, so that it's up to
8023 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
8024 * getting what the current time is, so simply throw away the out-of-date
8025 * time. This will result in the wakee task being less decayed, but giving
8026 * the wakee more load sounds not bad.
8028 remove_entity_load_avg(&p->se);
8030 /* Tell new CPU we are migrated */
8031 p->se.avg.last_update_time = 0;
8033 /* We have migrated, no longer consider this task hot */
8034 p->se.exec_start = 0;
8037 static void task_dead_fair(struct task_struct *p)
8039 remove_entity_load_avg(&p->se);
8042 #define task_fits_max(p, cpu) true
8043 #endif /* CONFIG_SMP */
8045 static unsigned long
8046 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
8048 unsigned long gran = sysctl_sched_wakeup_granularity;
8051 * Since it's curr that is running now, convert the gran from real-time
8052 * to virtual-time in its units.
8054 * By using 'se' instead of 'curr' we penalize light tasks, so
8055 * they get preempted easier. That is, if 'se' < 'curr' then
8056 * the resulting gran will be larger, therefore penalizing the
8057 * lighter, if otoh 'se' > 'curr' then the resulting gran will
8058 * be smaller, again penalizing the lighter task.
8060 * This is especially important for buddies when the leftmost
8061 * task is higher priority than the buddy.
8063 return calc_delta_fair(gran, se);
8067 * Should 'se' preempt 'curr'.
8081 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
8083 s64 gran, vdiff = curr->vruntime - se->vruntime;
8088 gran = wakeup_gran(curr, se);
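/*
 * Illustrative example (numbers invented): if the woken entity's vruntime
 * trails curr's by vdiff = 3,000,000 ns and wakeup_gran() converts the
 * 1 ms wakeup granularity into a virtual gran of 1,000,000 ns for this
 * entity, then vdiff > gran and wakeup_preempt_entity() returns 1, i.e.
 * the wakeup should preempt curr. A vdiff of 500,000 ns would return 0.
 */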
8095 static void set_last_buddy(struct sched_entity *se)
8097 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8100 for_each_sched_entity(se)
8101 cfs_rq_of(se)->last = se;
8104 static void set_next_buddy(struct sched_entity *se)
8106 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8109 for_each_sched_entity(se)
8110 cfs_rq_of(se)->next = se;
8113 static void set_skip_buddy(struct sched_entity *se)
8115 for_each_sched_entity(se)
8116 cfs_rq_of(se)->skip = se;
8120 * Preempt the current task with a newly woken task if needed:
8122 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
8124 struct task_struct *curr = rq->curr;
8125 struct sched_entity *se = &curr->se, *pse = &p->se;
8126 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8127 int scale = cfs_rq->nr_running >= sched_nr_latency;
8128 int next_buddy_marked = 0;
8130 if (unlikely(se == pse))
8134 * This is possible from callers such as attach_tasks(), in which we
8135 * unconditionally check_preempt_curr() after an enqueue (which may have
8136 * led to a throttle). This both saves work and prevents false
8137 * next-buddy nomination below.
8139 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8142 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
8143 set_next_buddy(pse);
8144 next_buddy_marked = 1;
8148 * We can come here with TIF_NEED_RESCHED already set from new task
8151 * Note: this also catches the edge-case of curr being in a throttled
8152 * group (e.g. via set_curr_task), since update_curr() (in the
8153 * enqueue of curr) will have resulted in resched being set. This
8154 * prevents us from potentially nominating it as a false LAST_BUDDY
8157 if (test_tsk_need_resched(curr))
8160 /* Idle tasks are by definition preempted by non-idle tasks. */
8161 if (unlikely(curr->policy == SCHED_IDLE) &&
8162 likely(p->policy != SCHED_IDLE))
8166 * Batch and idle tasks do not preempt non-idle tasks (their preemption
8167 * is driven by the tick):
8169 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8172 find_matching_se(&se, &pse);
8173 update_curr(cfs_rq_of(se));
8175 if (wakeup_preempt_entity(se, pse) == 1) {
8177 * Bias pick_next to pick the sched entity that is
8178 * triggering this preemption.
8180 if (!next_buddy_marked)
8181 set_next_buddy(pse);
8190 * Only set the backward buddy when the current task is still
8191 * on the rq. This can happen when a wakeup gets interleaved
8192 * with schedule on the ->pre_schedule() or idle_balance()
8193 * point, either of which can drop the rq lock.
8195 * Also, during early boot the idle thread is in the fair class,
8196 * for obvious reasons it's a bad idea to schedule back to it.
8198 if (unlikely(!se->on_rq || curr == rq->idle))
8201 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
8205 static struct task_struct *
8206 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
8208 struct cfs_rq *cfs_rq = &rq->cfs;
8209 struct sched_entity *se;
8210 struct task_struct *p;
8214 #ifdef CONFIG_FAIR_GROUP_SCHED
8215 if (!cfs_rq->nr_running)
8218 if (prev->sched_class != &fair_sched_class)
8222 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8223 * likely that a next task is from the same cgroup as the current.
8225 * Therefore attempt to avoid putting and setting the entire cgroup
8226 * hierarchy, only change the part that actually changes.
8230 struct sched_entity *curr = cfs_rq->curr;
8233 * Since we got here without doing put_prev_entity() we also
8234 * have to consider cfs_rq->curr. If it is still a runnable
8235 * entity, update_curr() will update its vruntime, otherwise
8236 * forget we've ever seen it.
8240 update_curr(cfs_rq);
8245 * This call to check_cfs_rq_runtime() will do the
8246 * throttle and dequeue its entity in the parent(s).
8247 * Therefore the 'simple' nr_running test will indeed
8250 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8254 se = pick_next_entity(cfs_rq, curr);
8255 cfs_rq = group_cfs_rq(se);
8261 * Since we haven't yet done put_prev_entity and if the selected task
8262 * is a different task than we started out with, try and touch the
8263 * least amount of cfs_rqs.
8266 struct sched_entity *pse = &prev->se;
8268 while (!(cfs_rq = is_same_group(se, pse))) {
8269 int se_depth = se->depth;
8270 int pse_depth = pse->depth;
8272 if (se_depth <= pse_depth) {
8273 put_prev_entity(cfs_rq_of(pse), pse);
8274 pse = parent_entity(pse);
8276 if (se_depth >= pse_depth) {
8277 set_next_entity(cfs_rq_of(se), se);
8278 se = parent_entity(se);
8282 put_prev_entity(cfs_rq, pse);
8283 set_next_entity(cfs_rq, se);
8286 if (hrtick_enabled(rq))
8287 hrtick_start_fair(rq, p);
8289 rq->misfit_task = !task_fits_max(p, rq->cpu);
8296 if (!cfs_rq->nr_running)
8299 put_prev_task(rq, prev);
8302 se = pick_next_entity(cfs_rq, NULL);
8303 set_next_entity(cfs_rq, se);
8304 cfs_rq = group_cfs_rq(se);
8309 if (hrtick_enabled(rq))
8310 hrtick_start_fair(rq, p);
8312 rq->misfit_task = !task_fits_max(p, rq->cpu);
8317 rq->misfit_task = 0;
8319 * This is OK, because current is on_cpu, which avoids it being picked
8320 * for load-balance and preemption/IRQs are still disabled avoiding
8321 * further scheduler activity on it and we're being very careful to
8322 * re-start the picking loop.
8324 lockdep_unpin_lock(&rq->lock);
8325 new_tasks = idle_balance(rq);
8326 lockdep_pin_lock(&rq->lock);
8328 * Because idle_balance() releases (and re-acquires) rq->lock, it is
8329 * possible for any higher priority task to appear. In that case we
8330 * must re-start the pick_next_entity() loop.
8342 * Account for a descheduled task:
8344 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8346 struct sched_entity *se = &prev->se;
8347 struct cfs_rq *cfs_rq;
8349 for_each_sched_entity(se) {
8350 cfs_rq = cfs_rq_of(se);
8351 put_prev_entity(cfs_rq, se);
8356 * sched_yield() is very simple
8358 * The magic of dealing with the ->skip buddy is in pick_next_entity.
8360 static void yield_task_fair(struct rq *rq)
8362 struct task_struct *curr = rq->curr;
8363 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8364 struct sched_entity *se = &curr->se;
8367 * Are we the only task in the tree?
8369 if (unlikely(rq->nr_running == 1))
8372 clear_buddies(cfs_rq, se);
8374 if (curr->policy != SCHED_BATCH) {
8375 update_rq_clock(rq);
8377 * Update run-time statistics of the 'current'.
8379 update_curr(cfs_rq);
8381 * Tell update_rq_clock() that we've just updated,
8382 * so we don't do microscopic update in schedule()
8383 * and double the fastpath cost.
8385 rq_clock_skip_update(rq, true);
8391 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
8393 struct sched_entity *se = &p->se;
8395 /* throttled hierarchies are not runnable */
8396 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8399 /* Tell the scheduler that we'd really like pse to run next. */
8402 yield_task_fair(rq);
8408 /**************************************************
8409 * Fair scheduling class load-balancing methods.
8413 * The purpose of load-balancing is to achieve the same basic fairness the
8414 * per-cpu scheduler provides, namely provide a proportional amount of compute
8415 * time to each task. This is expressed in the following equation:
8417 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8419 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
8420 * W_i,0 is defined as:
8422 * W_i,0 = \Sum_j w_i,j (2)
8424 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
8425 * is derived from the nice value as per prio_to_weight[].
8427 * The weight average is an exponential decay average of the instantaneous
8430 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8432 * C_i is the compute capacity of cpu i, typically it is the
8433 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8434 * can also include other factors [XXX].
8436 * To achieve this balance we define a measure of imbalance which follows
8437 * directly from (1):
8439 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
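 * For example (weights invented): with two CPUs of equal capacity C = 1024,
 * W_i = 2048 and W_j = 1024, avg(W/C) = 1.5 and
 *
 *   imb_i,j = max{ 1.5, 2 } - min{ 1.5, 1 } = 1
 *
 * Moving ~512 units of weight from cpu i to cpu j drives this imbalance
 * towards 0.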
8441 * We then move tasks around to minimize the imbalance. In the continuous
8442 * function space it is obvious this converges, in the discrete case we get
8443 * a few fun cases generally called infeasible weight scenarios.
8446 * - infeasible weights;
8447 * - local vs global optima in the discrete case. ]
8452 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8453 * for all i,j solution, we create a tree of cpus that follows the hardware
8454 * topology where each level pairs two lower groups (or better). This results
8455 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
8456 * tree to only the first of the previous level and we decrease the frequency
8457 * of load-balance at each level in inverse proportion to the number of cpus in the groups.
 *   log_2 n     1     n
 *  \Sum       { --- * --- * 2^i } = O(n)                            (5)
 *   i = 0      2^i   2^i
 *                               `- size of each group
 *                   |         |   `- number of cpus doing load-balance
 *                   |         `- frequency of load-balance at this level
 *                   `- sum over all levels
8470 * Coupled with a limit on how many tasks we can migrate every balance pass,
8471 * this makes (5) the runtime complexity of the balancer.
8473 * An important property here is that each CPU is still (indirectly) connected
8474 * to every other cpu in at most O(log n) steps:
8476 * The adjacency matrix of the resulting graph is given by:
 *           log_2 n
 * A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)     (6)
 *           k = 0
8482 * And you'll find that:
8484 * A^(log_2 n)_i,j != 0 for all i,j (7)
8486 * Showing there's indeed a path between every cpu in at most O(log n) steps.
8487 * The task movement gives a factor of O(m), giving a convergence complexity
8490 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
8495 * In order to avoid CPUs going idle while there's still work to do, new idle
8496 * balancing is more aggressive and has the newly idle cpu iterate up the domain
8497 * tree itself instead of relying on other CPUs to bring it work.
8499 * This adds some complexity to both (5) and (8) but it reduces the total idle time.
8507 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----                                (9)
 *                                 S_k
 *
 * Where
 *
 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                  (10)
8517 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
8519 * The big problem is S_k, it's a global sum needed to compute a local (W_i) property.
8522 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8523 * rewrite all of this once again.]
8526 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8528 enum fbq_type { regular, remote, all };
8537 #define LBF_ALL_PINNED 0x01
8538 #define LBF_NEED_BREAK 0x02
8539 #define LBF_DST_PINNED 0x04
8540 #define LBF_SOME_PINNED 0x08
8541 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
8542 #define LBF_IGNORE_BIG_TASKS 0x100
8543 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
8544 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
8547 struct sched_domain *sd;
8555 struct cpumask *dst_grpmask;
8557 enum cpu_idle_type idle;
8559 unsigned int src_grp_nr_running;
8560 /* The set of CPUs under consideration for load-balancing */
8561 struct cpumask *cpus;
8562 unsigned int busiest_grp_capacity;
8563 unsigned int busiest_nr_running;
8568 unsigned int loop_break;
8569 unsigned int loop_max;
8571 enum fbq_type fbq_type;
8572 enum group_type busiest_group_type;
8573 struct list_head tasks;
8574 enum sched_boost_policy boost_policy;
8578 * Is this task likely cache-hot:
8580 static int task_hot(struct task_struct *p, struct lb_env *env)
8584 lockdep_assert_held(&env->src_rq->lock);
8586 if (p->sched_class != &fair_sched_class)
8589 if (unlikely(p->policy == SCHED_IDLE))
8593 * Buddy candidates are cache hot:
8595 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8596 (&p->se == cfs_rq_of(&p->se)->next ||
8597 &p->se == cfs_rq_of(&p->se)->last))
8600 if (sysctl_sched_migration_cost == -1)
8602 if (sysctl_sched_migration_cost == 0)
8605 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
8607 return delta < (s64)sysctl_sched_migration_cost;
8610 #ifdef CONFIG_NUMA_BALANCING
8612 * Returns 1, if task migration degrades locality
8613 * Returns 0, if task migration improves locality i.e migration preferred.
8614 * Returns -1, if task migration is not affected by locality.
8616 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8618 struct numa_group *numa_group = rcu_dereference(p->numa_group);
8619 unsigned long src_faults, dst_faults;
8620 int src_nid, dst_nid;
8622 if (!static_branch_likely(&sched_numa_balancing))
8625 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8628 src_nid = cpu_to_node(env->src_cpu);
8629 dst_nid = cpu_to_node(env->dst_cpu);
8631 if (src_nid == dst_nid)
8634 /* Migrating away from the preferred node is always bad. */
8635 if (src_nid == p->numa_preferred_nid) {
8636 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8642 /* Encourage migration to the preferred node. */
8643 if (dst_nid == p->numa_preferred_nid)
8647 src_faults = group_faults(p, src_nid);
8648 dst_faults = group_faults(p, dst_nid);
8650 src_faults = task_faults(p, src_nid);
8651 dst_faults = task_faults(p, dst_nid);
8654 return dst_faults < src_faults;
8658 static inline int migrate_degrades_locality(struct task_struct *p,
8666 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8669 int can_migrate_task(struct task_struct *p, struct lb_env *env)
8672 int twf, group_cpus;
8674 lockdep_assert_held(&env->src_rq->lock);
8677 * We do not migrate tasks that are:
8678 * 1) throttled_lb_pair, or
8679 * 2) cannot be migrated to this CPU due to cpus_allowed, or
8680 * 3) running (obviously), or
8681 * 4) are cache-hot on their current CPU.
8683 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
8686 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
8689 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
8691 env->flags |= LBF_SOME_PINNED;
8694 * Remember if this task can be migrated to any other cpu in
8695 * our sched_group. We may want to revisit it if we couldn't
8696 * meet load balance goals by pulling other tasks on src_cpu.
8698 * Also avoid computing new_dst_cpu if we have already computed
8699 * one in current iteration.
8701 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
8704 /* Prevent re-selecting dst_cpu via env's cpus */
8705 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8706 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
8707 env->flags |= LBF_DST_PINNED;
8708 env->new_dst_cpu = cpu;
8716 /* Record that we found at least one task that could run on dst_cpu */
8717 env->flags &= ~LBF_ALL_PINNED;
8719 if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
8720 if (nr_big_tasks(env->src_rq) && !is_big_task(p))
8723 if (env->boost_policy == SCHED_BOOST_ON_BIG &&
8724 !task_sched_boost(p))
8728 twf = task_will_fit(p, env->dst_cpu);
8731 * Attempt to not pull tasks that don't fit. We may get lucky and find
8732 * one that actually fits.
8734 if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
8737 if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
8738 !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
8742 * Group imbalance can sometimes cause work to be pulled across groups
8743 * even though the group could have managed the imbalance on its own.
8744 * Prevent inter-cluster migrations for big tasks when the number of
8745 * tasks is lower than the capacity of the group.
8747 group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
8748 SCHED_CAPACITY_SCALE);
8749 if (!twf && env->busiest_nr_running <= group_cpus)
8752 if (task_running(env->src_rq, p)) {
8753 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
8758 * Aggressive migration if:
8759 * 1) IDLE or NEWLY_IDLE balance.
8760 * 2) destination numa is preferred
8761 * 3) task is cache cold, or
8762 * 4) too many balance attempts have failed.
8764 tsk_cache_hot = migrate_degrades_locality(p, env);
8765 if (tsk_cache_hot == -1)
8766 tsk_cache_hot = task_hot(p, env);
8768 if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
8769 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
8770 if (tsk_cache_hot == 1) {
8771 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
8772 schedstat_inc(p, se.statistics.nr_forced_migrations);
8777 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
8782 * detach_task() -- detach the task for the migration specified in env
8784 static void detach_task(struct task_struct *p, struct lb_env *env)
8786 lockdep_assert_held(&env->src_rq->lock);
8788 p->on_rq = TASK_ON_RQ_MIGRATING;
8789 deactivate_task(env->src_rq, p, 0);
8790 double_lock_balance(env->src_rq, env->dst_rq);
8791 set_task_cpu(p, env->dst_cpu);
8792 if (task_in_related_thread_group(p))
8793 env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
8794 double_unlock_balance(env->src_rq, env->dst_rq);
8798 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
8799 * part of active balancing operations within "domain".
8801 * Returns a task if successful and NULL otherwise.
8803 static struct task_struct *detach_one_task(struct lb_env *env)
8805 struct task_struct *p, *n;
8807 lockdep_assert_held(&env->src_rq->lock);
8809 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
8810 if (!can_migrate_task(p, env))
8813 detach_task(p, env);
8816 * Right now, this is only the second place where
8817 * lb_gained[env->idle] is updated (other is detach_tasks)
8818 * so we can safely collect stats here rather than
8819 * inside detach_tasks().
8821 schedstat_inc(env->sd, lb_gained[env->idle]);
8828 static const unsigned int sched_nr_migrate_break = 32;
8831 * detach_tasks() -- tries to detach up to imbalance weighted load from
8832 * busiest_rq, as part of a balancing operation within domain "sd".
8834 * Returns number of detached tasks if successful and 0 otherwise.
8836 static int detach_tasks(struct lb_env *env)
8838 struct list_head *tasks = &env->src_rq->cfs_tasks;
8839 struct task_struct *p;
8842 int orig_loop = env->loop;
8844 lockdep_assert_held(&env->src_rq->lock);
8846 if (env->imbalance <= 0)
8849 if (!same_cluster(env->dst_cpu, env->src_cpu))
8850 env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
8852 if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
8853 env->flags |= LBF_IGNORE_BIG_TASKS;
8856 while (!list_empty(tasks)) {
8858 * We don't want to steal all, otherwise we may be treated likewise,
8859 * which could at worst lead to a livelock crash.
8861 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
8864 p = list_first_entry(tasks, struct task_struct, se.group_node);
8867 /* We've more or less seen every task there is, call it quits */
8868 if (env->loop > env->loop_max)
8871 /* take a breather every nr_migrate tasks */
8872 if (env->loop > env->loop_break) {
8873 env->loop_break += sched_nr_migrate_break;
8874 env->flags |= LBF_NEED_BREAK;
8878 if (!can_migrate_task(p, env))
8881 load = task_h_load(p);
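/*
 * Two cheap filters before committing to the pull: with the LB_MIN
 * feature, very light tasks (h_load below 16) are not worth migrating
 * unless balancing has already failed, and a task whose weighted load
 * is more than twice the remaining imbalance is skipped so that a
 * single pull does not grossly overshoot the target.
 */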
8883 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8886 if ((load / 2) > env->imbalance)
8889 detach_task(p, env);
8890 list_add(&p->se.group_node, &env->tasks);
8893 env->imbalance -= load;
8895 #ifdef CONFIG_PREEMPT
8897 * NEWIDLE balancing is a source of latency, so preemptible
8898 * kernels will stop after the first task is detached to minimize
8899 * the critical section.
8901 if (env->idle == CPU_NEWLY_IDLE)
8906 * We only want to steal up to the prescribed amount of weighted load.
8909 if (env->imbalance <= 0)
8914 list_move_tail(&p->se.group_node, tasks);
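/*
 * If the restrictive first pass (skipping big tasks and tasks that
 * prefer another cluster) detached nothing, drop those filters and
 * walk the task list again from the beginning.
 */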
8917 if (env->flags & (LBF_IGNORE_BIG_TASKS |
8918 LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
8919 tasks = &env->src_rq->cfs_tasks;
8920 env->flags &= ~(LBF_IGNORE_BIG_TASKS |
8921 LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
8922 env->loop = orig_loop;
8927 * Right now, this is one of only two places we collect this stat
8928 * so we can safely collect detach_one_task() stats here rather
8929 * than inside detach_one_task().
8931 schedstat_add(env->sd, lb_gained[env->idle], detached);
8937 * attach_task() -- attach the task detached by detach_task() to its new rq.
8939 static void attach_task(struct rq *rq, struct task_struct *p)
8941 lockdep_assert_held(&rq->lock);
8943 BUG_ON(task_rq(p) != rq);
8944 activate_task(rq, p, 0);
8945 p->on_rq = TASK_ON_RQ_QUEUED;
8946 check_preempt_curr(rq, p, 0);
8950 * attach_one_task() -- attaches the task returned from detach_one_task() to
8953 static void attach_one_task(struct rq *rq, struct task_struct *p)
8955 raw_spin_lock(&rq->lock);
8957 raw_spin_unlock(&rq->lock);
8961 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
8964 static void attach_tasks(struct lb_env *env)
8966 struct list_head *tasks = &env->tasks;
8967 struct task_struct *p;
8969 raw_spin_lock(&env->dst_rq->lock);
8971 while (!list_empty(tasks)) {
8972 p = list_first_entry(tasks, struct task_struct, se.group_node);
8973 list_del_init(&p->se.group_node);
8975 attach_task(env->dst_rq, p);
8978 raw_spin_unlock(&env->dst_rq->lock);
8981 #ifdef CONFIG_FAIR_GROUP_SCHED
8982 static void update_blocked_averages(int cpu)
8984 struct rq *rq = cpu_rq(cpu);
8985 struct cfs_rq *cfs_rq;
8986 unsigned long flags;
8988 raw_spin_lock_irqsave(&rq->lock, flags);
8989 update_rq_clock(rq);
8992 * Iterates the task_group tree in a bottom up fashion, see
8993 * list_add_leaf_cfs_rq() for details.
8995 for_each_leaf_cfs_rq(rq, cfs_rq) {
8996 /* throttled entities do not contribute to load */
8997 if (throttled_hierarchy(cfs_rq))
9000 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
9002 update_tg_load_avg(cfs_rq, 0);
9004 /* Propagate pending load changes to the parent */
9005 if (cfs_rq->tg->se[cpu])
9006 update_load_avg(cfs_rq->tg->se[cpu], 0);
9008 raw_spin_unlock_irqrestore(&rq->lock, flags);
9012 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9013 * This needs to be done in a top-down fashion because the load of a child
9014 * group is a fraction of its parent's load.
9016 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9018 struct rq *rq = rq_of(cfs_rq);
9019 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9020 unsigned long now = jiffies;
9023 if (cfs_rq->last_h_load_update == now)
9026 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9027 for_each_sched_entity(se) {
9028 cfs_rq = cfs_rq_of(se);
9029 WRITE_ONCE(cfs_rq->h_load_next, se);
9030 if (cfs_rq->last_h_load_update == now)
9035 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9036 cfs_rq->last_h_load_update = now;
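/*
 * Walk back down the recorded path: each level's h_load is the
 * parent's h_load scaled by this entity's share of the parent's
 * load, i.e. h_load * se->avg.load_avg / (cfs_rq_load_avg() + 1).
 */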
9039 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9040 load = cfs_rq->h_load;
9041 load = div64_ul(load * se->avg.load_avg,
9042 cfs_rq_load_avg(cfs_rq) + 1);
9043 cfs_rq = group_cfs_rq(se);
9044 cfs_rq->h_load = load;
9045 cfs_rq->last_h_load_update = now;
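/*
 * A task's hierarchical load is then its own load_avg scaled by its
 * cfs_rq's h_load relative to that cfs_rq's total load average.
 */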
9049 static unsigned long task_h_load(struct task_struct *p)
9051 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9053 update_cfs_rq_h_load(cfs_rq);
9054 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9055 cfs_rq_load_avg(cfs_rq) + 1);
9058 static inline void update_blocked_averages(int cpu)
9060 struct rq *rq = cpu_rq(cpu);
9061 struct cfs_rq *cfs_rq = &rq->cfs;
9062 unsigned long flags;
9064 raw_spin_lock_irqsave(&rq->lock, flags);
9065 update_rq_clock(rq);
9066 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
9067 raw_spin_unlock_irqrestore(&rq->lock, flags);
9070 static unsigned long task_h_load(struct task_struct *p)
9072 return p->se.avg.load_avg;
9076 /********** Helpers for find_busiest_group ************************/
9079 * sg_lb_stats - stats of a sched_group required for load_balancing
9081 struct sg_lb_stats {
9082 unsigned long avg_load; /* Avg load across the CPUs of the group */
9083 unsigned long group_load; /* Total load over the CPUs of the group */
9084 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
9085 unsigned long load_per_task;
9086 unsigned long group_capacity;
9087 unsigned long group_util; /* Total utilization of the group */
9088 unsigned int sum_nr_running; /* Nr tasks running in the group */
9089 #ifdef CONFIG_SCHED_HMP
9090 unsigned long sum_nr_big_tasks;
9091 u64 group_cpu_load; /* Scaled load of all CPUs of the group */
9093 unsigned int idle_cpus;
9094 unsigned int group_weight;
9095 enum group_type group_type;
9096 int group_no_capacity;
9097 int group_misfit_task; /* A cpu has a task too big for its capacity */
9098 #ifdef CONFIG_NUMA_BALANCING
9099 unsigned int nr_numa_running;
9100 unsigned int nr_preferred_running;
9105 * sd_lb_stats - Structure to store the statistics of a sched_domain
9106 * during load balancing.
9108 struct sd_lb_stats {
9109 struct sched_group *busiest; /* Busiest group in this sd */
9110 struct sched_group *local; /* Local group in this sd */
9111 unsigned long total_load; /* Total load of all groups in sd */
9112 unsigned long total_capacity; /* Total capacity of all groups in sd */
9113 unsigned long avg_load; /* Average load across all groups in sd */
9115 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
9116 struct sg_lb_stats local_stat; /* Statistics of the local group */
9119 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9122 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9123 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9124 * We must however clear busiest_stat::avg_load because
9125 * update_sd_pick_busiest() reads this before assignment.
9127 *sds = (struct sd_lb_stats){
9131 .total_capacity = 0UL,
9134 .sum_nr_running = 0,
9135 .group_type = group_other,
9136 #ifdef CONFIG_SCHED_HMP
9137 .sum_nr_big_tasks = 0UL,
9138 .group_cpu_load = 0ULL,
9144 #ifdef CONFIG_SCHED_HMP
9147 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9149 int local_cpu, busiest_cpu;
9150 int local_capacity, busiest_capacity;
9151 int local_pwr_cost, busiest_pwr_cost;
9153 int boost = sched_boost();
9155 if (!sysctl_sched_restrict_cluster_spill ||
9156 boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
9159 local_cpu = group_first_cpu(sds->local);
9160 busiest_cpu = group_first_cpu(sds->busiest);
9162 local_capacity = cpu_max_possible_capacity(local_cpu);
9163 busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
9165 local_pwr_cost = cpu_max_power_cost(local_cpu);
9166 busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
9168 if (local_pwr_cost <= busiest_pwr_cost)
9171 if (local_capacity > busiest_capacity &&
9172 sds->busiest_stat.sum_nr_big_tasks)
9175 nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
9176 if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
9177 (sds->busiest_stat.sum_nr_running <
9178 nr_cpus * sysctl_sched_spill_nr_run))
9184 #else /* CONFIG_SCHED_HMP */
9187 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9192 #endif /* CONFIG_SCHED_HMP */
9195 * get_sd_load_idx - Obtain the load index for a given sched domain.
9196 * @sd: The sched_domain whose load_idx is to be obtained.
9197 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
9199 * Return: The load index.
9201 static inline int get_sd_load_idx(struct sched_domain *sd,
9202 enum cpu_idle_type idle)
9208 load_idx = sd->busy_idx;
9211 case CPU_NEWLY_IDLE:
9212 load_idx = sd->newidle_idx;
9215 load_idx = sd->idle_idx;
9222 static unsigned long scale_rt_capacity(int cpu)
9224 struct rq *rq = cpu_rq(cpu);
9225 u64 total, used, age_stamp, avg;
9229 * Since we're reading these variables without serialization make sure
9230 * we read them once before doing sanity checks on them.
9232 age_stamp = READ_ONCE(rq->age_stamp);
9233 avg = READ_ONCE(rq->rt_avg);
9234 delta = __rq_clock_broken(rq) - age_stamp;
9236 if (unlikely(delta < 0))
9239 total = sched_avg_period() + delta;
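/*
 * 'used' is the average capacity consumed by RT (and, below, deadline)
 * activity over the rt_avg window; what remains of SCHED_CAPACITY_SCALE
 * is what this CPU can offer to CFS.
 */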
9241 used = div_u64(avg, total);
9244 * deadline bandwidth is defined at system level so we must
9245 * weight this bandwidth with the max capacity of the system.
9246 * As a reminder, avg_bw is 20 bits wide and
9247 * scale_cpu_capacity is 10 bits wide.
9249 used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
9251 if (likely(used < SCHED_CAPACITY_SCALE))
9252 return SCHED_CAPACITY_SCALE - used;
9257 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
9259 raw_spin_lock_init(&mcc->lock);
9264 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9266 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
9267 struct sched_group *sdg = sd->groups;
9268 struct max_cpu_capacity *mcc;
9269 unsigned long max_capacity;
9271 unsigned long flags;
9273 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9275 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
9277 raw_spin_lock_irqsave(&mcc->lock, flags);
9278 max_capacity = mcc->val;
9279 max_cap_cpu = mcc->cpu;
9281 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
9282 (max_capacity < capacity)) {
9283 mcc->val = capacity;
9285 #ifdef CONFIG_SCHED_DEBUG
9286 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9287 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
9292 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9294 skip_unlock: __attribute__ ((unused));
9295 capacity *= scale_rt_capacity(cpu);
9296 capacity >>= SCHED_CAPACITY_SHIFT;
9301 cpu_rq(cpu)->cpu_capacity = capacity;
9302 sdg->sgc->capacity = capacity;
9303 sdg->sgc->max_capacity = capacity;
9304 sdg->sgc->min_capacity = capacity;
9307 void update_group_capacity(struct sched_domain *sd, int cpu)
9309 struct sched_domain *child = sd->child;
9310 struct sched_group *group, *sdg = sd->groups;
9311 unsigned long capacity, max_capacity, min_capacity;
9312 unsigned long interval;
9314 interval = msecs_to_jiffies(sd->balance_interval);
9315 interval = clamp(interval, 1UL, max_load_balance_interval);
9316 sdg->sgc->next_update = jiffies + interval;
9319 update_cpu_capacity(sd, cpu);
9325 min_capacity = ULONG_MAX;
9327 if (child->flags & SD_OVERLAP) {
9329 * SD_OVERLAP domains cannot assume that child groups
9330 * span the current group.
9333 for_each_cpu(cpu, sched_group_cpus(sdg)) {
9334 struct sched_group_capacity *sgc;
9335 struct rq *rq = cpu_rq(cpu);
9337 if (cpumask_test_cpu(cpu, cpu_isolated_mask))
9340 * build_sched_domains() -> init_sched_groups_capacity()
9341 * gets here before we've attached the domains to the runqueues.
9344 * Use capacity_of(), which is set irrespective of domains
9345 * in update_cpu_capacity().
9347 * This prevents capacity from being 0 and
9348 * causing divide-by-zero issues on boot.
9350 if (unlikely(!rq->sd)) {
9351 capacity += capacity_of(cpu);
9353 sgc = rq->sd->groups->sgc;
9354 capacity += sgc->capacity;
9357 max_capacity = max(capacity, max_capacity);
9358 min_capacity = min(capacity, min_capacity);
9362 * !SD_OVERLAP domains can assume that child groups
9363 * span the current group.
9366 group = child->groups;
9368 struct sched_group_capacity *sgc = group->sgc;
9370 cpumask_t *cpus = sched_group_cpus(group);
9372 /* Revisit this later. This won't work for MT domain */
9373 if (!cpu_isolated(cpumask_first(cpus))) {
9374 capacity += sgc->capacity;
9375 max_capacity = max(sgc->max_capacity, max_capacity);
9376 min_capacity = min(sgc->min_capacity, min_capacity);
9378 group = group->next;
9379 } while (group != child->groups);
9382 sdg->sgc->capacity = capacity;
9383 sdg->sgc->max_capacity = max_capacity;
9384 sdg->sgc->min_capacity = min_capacity;
9388 * Check whether the capacity of the rq has been noticeably reduced by side
9389 * activity. The imbalance_pct is used for the threshold.
9390 * Return true if the capacity is reduced.
9393 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9395 return ((rq->cpu_capacity * sd->imbalance_pct) <
9396 (rq->cpu_capacity_orig * 100));
9400 * Group imbalance indicates (and tries to solve) the problem where balancing
9401 * groups is inadequate due to tsk_cpus_allowed() constraints.
9403 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
9404 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
9407 * { 0 1 2 3 } { 4 5 6 7 }
9410 * If we were to balance group-wise we'd place two tasks in the first group and
9411 * two tasks in the second group. Clearly this is undesired as it will overload
9412 * cpu 3 and leave one of the cpus in the second group unused.
9414 * The current solution to this issue is detecting the skew in the first group
9415 * by noticing the lower domain failed to reach balance and had difficulty
9416 * moving tasks due to affinity constraints.
9418 * When this is so detected, this group becomes a candidate for busiest; see
9419 * update_sd_pick_busiest(). And calculate_imbalance() and
9420 * find_busiest_group() avoid some of the usual balance conditions to allow it
9421 * to create an effective group imbalance.
9423 * This is a somewhat tricky proposition since the next run might not find the
9424 * group imbalance and decide the groups need to be balanced again. A most
9425 * subtle and fragile situation.
9428 static inline int sg_imbalanced(struct sched_group *group)
9430 return group->sgc->imbalance;
9434 * group_has_capacity returns true if the group has spare capacity that could
9435 * be used by some tasks.
9436 * We consider that a group has spare capacity if the number of tasks is
9437 * smaller than the number of CPUs or if the utilization is lower than the
9438 * available capacity for CFS tasks.
9439 * For the latter, we use a threshold to stabilize the state, to take into
9440 * account the variance of the tasks' load and to return true if the available
9441 * capacity is meaningful for the load balancer.
9442 * As an example, an available capacity of 1% can appear but it doesn't bring
9443 * any benefit to the load balancer.
9446 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
9448 if (sgs->sum_nr_running < sgs->group_weight)
9451 if ((sgs->group_capacity * 100) >
9452 (sgs->group_util * env->sd->imbalance_pct))
9459 * group_is_overloaded returns true if the group has more tasks than it can handle.
9461 * group_is_overloaded is not equal to !group_has_capacity because a group
9462 * with the exact right number of tasks has no more spare capacity but is not
9463 * overloaded, so both group_has_capacity and group_is_overloaded return false.
9467 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
9469 if (sgs->sum_nr_running <= sgs->group_weight)
9472 if ((sgs->group_capacity * 100) <
9473 (sgs->group_util * env->sd->imbalance_pct))
9481 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
9482 * per-cpu capacity than sched_group ref.
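 * The comparison is biased by (capacity_margin - SCHED_LOAD_SCALE), so sg
 * only qualifies when its max per-cpu capacity falls short of ref's by more
 * than that margin, rather than by rounding noise.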
9485 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
9487 return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
9488 ref->sgc->max_capacity;
9492 group_type group_classify(struct sched_group *group,
9493 struct sg_lb_stats *sgs, struct lb_env *env)
9495 if (sgs->group_no_capacity)
9496 return group_overloaded;
9498 if (sg_imbalanced(group))
9499 return group_imbalanced;
9501 if (sgs->group_misfit_task)
9502 return group_misfit_task;
9507 #ifdef CONFIG_NO_HZ_COMMON
9509 * idle load balancing data
9510 * - used by the nohz balance, but we want it available here
9511 * so that we can see which CPUs have no tick.
9514 cpumask_var_t idle_cpus_mask;
9516 unsigned long next_balance; /* in jiffy units */
9517 } nohz ____cacheline_aligned;
9519 static inline void update_cpu_stats_if_tickless(struct rq *rq)
9521 /* only called from update_sg_lb_stats when irqs are disabled */
9522 if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
9523 /* rate limit updates to at most once per jiffy */
9524 if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
9527 raw_spin_lock(&rq->lock);
9528 update_rq_clock(rq);
9529 update_idle_cpu_load(rq);
9530 update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
9531 raw_spin_unlock(&rq->lock);
9536 static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
9540 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9541 * @env: The load balancing environment.
9542 * @group: sched_group whose statistics are to be updated.
9543 * @load_idx: Load index of sched_domain of this_cpu for load calc.
9544 * @local_group: Does group contain this_cpu.
9545 * @sgs: variable to hold the statistics for this group.
9546 * @overload: Indicate more than one runnable task for any CPU.
9547 * @overutilized: Indicate overutilization for any CPU.
9549 static inline void update_sg_lb_stats(struct lb_env *env,
9550 struct sched_group *group, int load_idx,
9551 int local_group, struct sg_lb_stats *sgs,
9552 bool *overload, bool *overutilized)
9557 memset(sgs, 0, sizeof(*sgs));
9559 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
9560 struct rq *rq = cpu_rq(i);
9562 trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
9567 if (cpu_isolated(i))
9570 /* if we are entering idle and there are CPUs with
9571 * their tick stopped, do an update for them
9573 if (env->idle == CPU_NEWLY_IDLE)
9574 update_cpu_stats_if_tickless(rq);
9576 /* Bias balancing toward cpus of our domain */
9578 load = target_load(i, load_idx);
9580 load = source_load(i, load_idx);
9582 sgs->group_load += load;
9583 sgs->group_util += cpu_util(i);
9584 sgs->sum_nr_running += rq->cfs.h_nr_running;
9586 nr_running = rq->nr_running;
9590 #ifdef CONFIG_SCHED_HMP
9591 sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
9592 sgs->group_cpu_load += cpu_load(i);
9595 #ifdef CONFIG_NUMA_BALANCING
9596 sgs->nr_numa_running += rq->nr_numa_running;
9597 sgs->nr_preferred_running += rq->nr_preferred_running;
9599 sgs->sum_weighted_load += weighted_cpuload(i);
9601 * No need to call idle_cpu() if nr_running is not 0
9603 if (!nr_running && idle_cpu(i))
9606 if (energy_aware() && cpu_overutilized(i)) {
9607 *overutilized = true;
9608 if (!sgs->group_misfit_task && rq->misfit_task)
9609 sgs->group_misfit_task = capacity_of(i);
9613 /* Isolated CPU has no weight */
9614 if (!group->group_weight) {
9615 sgs->group_capacity = 0;
9617 sgs->group_no_capacity = 1;
9618 sgs->group_type = group_other;
9619 sgs->group_weight = group->group_weight;
9621 /* Adjust by relative CPU capacity of the group */
9622 sgs->group_capacity = group->sgc->capacity;
9623 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
9624 sgs->group_capacity;
9626 sgs->group_weight = group->group_weight;
9628 sgs->group_no_capacity = group_is_overloaded(env, sgs);
9629 sgs->group_type = group_classify(group, sgs, env);
9632 if (sgs->sum_nr_running)
9633 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
9636 #ifdef CONFIG_SCHED_HMP
9637 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9638 struct sd_lb_stats *sds,
9639 struct sched_group *sg,
9640 struct sg_lb_stats *sgs)
9642 if (env->idle != CPU_NOT_IDLE &&
9643 cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
9644 if (sgs->sum_nr_big_tasks >
9645 sds->busiest_stat.sum_nr_big_tasks) {
9646 env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
9654 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9655 struct sd_lb_stats *sds,
9656 struct sched_group *sg,
9657 struct sg_lb_stats *sgs)
9664 * update_sd_pick_busiest - return 1 on busiest group
9665 * @env: The load balancing environment.
9666 * @sds: sched_domain statistics
9667 * @sg: sched_group candidate to be checked for being the busiest
9668 * @sgs: sched_group statistics
9670 * Determine if @sg is a busier group than the previously selected
9673 * Return: %true if @sg is a busier group than the previously selected
9674 * busiest group. %false otherwise.
9676 static bool update_sd_pick_busiest(struct lb_env *env,
9677 struct sd_lb_stats *sds,
9678 struct sched_group *sg,
9679 struct sg_lb_stats *sgs)
9681 struct sg_lb_stats *busiest = &sds->busiest_stat;
9683 if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
9686 if (sgs->group_type > busiest->group_type)
9689 if (sgs->group_type < busiest->group_type)
9692 if (energy_aware()) {
9694 * Candidate sg doesn't face any serious load-balance problems
9695 * so don't pick it if the local sg is already filled up.
9697 if (sgs->group_type == group_other &&
9698 !group_has_capacity(env, &sds->local_stat))
9701 if (sgs->avg_load <= busiest->avg_load)
9704 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9708 * Candidate sg has no more than one task per CPU and
9709 * has higher per-CPU capacity. Migrating tasks to less
9710 * capable CPUs may harm throughput. Maximize throughput,
9711 * power/energy consequences are not considered.
9713 if (sgs->sum_nr_running <= sgs->group_weight &&
9714 group_smaller_cpu_capacity(sds->local, sg))
9719 /* This is the busiest node in its class. */
9720 if (!(env->sd->flags & SD_ASYM_PACKING))
9724 * ASYM_PACKING needs to move all the work to the lowest
9725 * numbered CPUs in the group, therefore mark all groups
9726 * higher than ourself as busy.
9728 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
9732 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
9739 #ifdef CONFIG_NUMA_BALANCING
9740 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9742 if (sgs->sum_nr_running > sgs->nr_numa_running)
9744 if (sgs->sum_nr_running > sgs->nr_preferred_running)
9749 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9751 if (rq->nr_running > rq->nr_numa_running)
9753 if (rq->nr_running > rq->nr_preferred_running)
9758 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9763 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9767 #endif /* CONFIG_NUMA_BALANCING */
9769 #define lb_sd_parent(sd) \
9770 (sd->parent && sd->parent->groups != sd->parent->groups->next)
9773 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
9774 * @env: The load balancing environment.
9775 * @sds: variable to hold the statistics for this sched_domain.
9777 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9779 struct sched_domain *child = env->sd->child;
9780 struct sched_group *sg = env->sd->groups;
9781 struct sg_lb_stats tmp_sgs;
9782 int load_idx, prefer_sibling = 0;
9783 bool overload = false, overutilized = false;
9785 if (child && child->flags & SD_PREFER_SIBLING)
9788 load_idx = get_sd_load_idx(env->sd, env->idle);
9791 struct sg_lb_stats *sgs = &tmp_sgs;
9794 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
9797 sgs = &sds->local_stat;
9799 if (env->idle != CPU_NEWLY_IDLE ||
9800 time_after_eq(jiffies, sg->sgc->next_update))
9801 update_group_capacity(env->sd, env->dst_cpu);
9804 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
9805 &overload, &overutilized);
9811 * In case the child domain prefers tasks go to siblings
9812 * first, lower the sg capacity so that we'll try
9813 * and move all the excess tasks away. We lower the capacity
9814 * of a group only if the local group has the capacity to fit
9815 * these excess tasks. The extra check prevents the case where
9816 * you always pull from the heaviest group when it is already
9817 * under-utilized (possible when a large-weight task outweighs
9818 * the rest of the tasks on the system).
9820 if (prefer_sibling && sds->local &&
9821 group_has_capacity(env, &sds->local_stat) &&
9822 (sgs->sum_nr_running > 1)) {
9823 sgs->group_no_capacity = 1;
9824 sgs->group_type = group_classify(sg, sgs, env);
9828 * Ignore task groups with misfit tasks if local group has no
9829 * capacity or if per-cpu capacity isn't higher.
9831 if (energy_aware() &&
9832 sgs->group_type == group_misfit_task &&
9833 (!group_has_capacity(env, &sds->local_stat) ||
9834 !group_smaller_cpu_capacity(sg, sds->local)))
9835 sgs->group_type = group_other;
9837 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9839 sds->busiest_stat = *sgs;
9840 env->busiest_nr_running = sgs->sum_nr_running;
9841 env->busiest_grp_capacity = sgs->group_capacity;
9845 /* Now, start updating sd_lb_stats */
9846 sds->total_load += sgs->group_load;
9847 sds->total_capacity += sgs->group_capacity;
9850 } while (sg != env->sd->groups);
9852 if (env->sd->flags & SD_NUMA)
9853 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9855 env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9857 if (!lb_sd_parent(env->sd)) {
9858 /* update overload indicator if we are at root domain */
9859 if (env->dst_rq->rd->overload != overload)
9860 env->dst_rq->rd->overload = overload;
9862 /* Update over-utilization (tipping point, U >= 0) indicator */
9863 if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
9864 env->dst_rq->rd->overutilized = overutilized;
9865 trace_sched_overutilized(overutilized);
9868 if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
9869 env->dst_rq->rd->overutilized = true;
9870 trace_sched_overutilized(true);
9877 * check_asym_packing - Check to see if the group is packed into the
9880 * This is primarily intended to be used at the sibling level. Some
9881 * cores like POWER7 prefer to use lower numbered SMT threads. In the
9882 * case of POWER7, it can move to lower SMT modes only when higher
9883 * threads are idle. When in lower SMT modes, the threads will
9884 * perform better since they share less core resources. Hence when we
9885 * have idle threads, we want them to be the higher ones.
9887 * This packing function is run on idle threads. It checks to see if
9888 * the busiest CPU in this domain (core in the P7 case) has a higher
9889 * CPU number than the packing function is being run on. Here we are
9890 * assuming that a lower CPU number corresponds to a lower SMT thread number.
9893 * Return: 1 when packing is required and a task should be moved to
9894 * this CPU. The amount of the imbalance is returned in *imbalance.
9896 * @env: The load balancing environment.
9897 * @sds: Statistics of the sched_domain which is to be packed
9899 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9903 if (!(env->sd->flags & SD_ASYM_PACKING))
9909 busiest_cpu = group_first_cpu(sds->busiest);
9910 if (env->dst_cpu > busiest_cpu)
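/*
 * Express the imbalance as the busiest group's entire load: avg_load
 * is per unit of capacity, so scale it back up by the group's capacity.
 */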
9913 env->imbalance = DIV_ROUND_CLOSEST(
9914 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9915 SCHED_CAPACITY_SCALE);
9921 * fix_small_imbalance - Calculate the minor imbalance that exists
9922 * amongst the groups of a sched_domain, during load balancing.
9924 * @env: The load balancing environment.
9925 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9928 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9930 unsigned long tmp, capa_now = 0, capa_move = 0;
9931 unsigned int imbn = 2;
9932 unsigned long scaled_busy_load_per_task;
9933 struct sg_lb_stats *local, *busiest;
9935 local = &sds->local_stat;
9936 busiest = &sds->busiest_stat;
9938 if (!local->sum_nr_running)
9939 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9940 else if (busiest->load_per_task > local->load_per_task)
9943 scaled_busy_load_per_task =
9944 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9945 busiest->group_capacity;
9947 if (busiest->avg_load + scaled_busy_load_per_task >=
9948 local->avg_load + (scaled_busy_load_per_task * imbn)) {
9949 env->imbalance = busiest->load_per_task;
9954 * OK, we don't have enough imbalance to justify moving tasks,
9955 * however we may be able to increase total CPU capacity used by moving them.
9959 capa_now += busiest->group_capacity *
9960 min(busiest->load_per_task, busiest->avg_load);
9961 capa_now += local->group_capacity *
9962 min(local->load_per_task, local->avg_load);
9963 capa_now /= SCHED_CAPACITY_SCALE;
9965 /* Amount of load we'd subtract */
9966 if (busiest->avg_load > scaled_busy_load_per_task) {
9967 capa_move += busiest->group_capacity *
9968 min(busiest->load_per_task,
9969 busiest->avg_load - scaled_busy_load_per_task);
9972 /* Amount of load we'd add */
9973 if (busiest->avg_load * busiest->group_capacity <
9974 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9975 tmp = (busiest->avg_load * busiest->group_capacity) /
9976 local->group_capacity;
9978 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9979 local->group_capacity;
9981 capa_move += local->group_capacity *
9982 min(local->load_per_task, local->avg_load + tmp);
9983 capa_move /= SCHED_CAPACITY_SCALE;
9985 /* Move if we gain throughput */
9986 if (capa_move > capa_now)
9987 env->imbalance = busiest->load_per_task;
9991 * calculate_imbalance - Calculate the amount of imbalance present within the
9992 * groups of a given sched_domain during load balance.
9993 * @env: load balance environment
9994 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
9996 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9998 unsigned long max_pull, load_above_capacity = ~0UL;
9999 struct sg_lb_stats *local, *busiest;
10001 local = &sds->local_stat;
10002 busiest = &sds->busiest_stat;
10004 if (busiest->group_type == group_imbalanced) {
10006 * In the group_imb case we cannot rely on group-wide averages
10007 * to ensure cpu-load equilibrium, look at wider averages. XXX
10009 busiest->load_per_task =
10010 min(busiest->load_per_task, sds->avg_load);
10014 * In the presence of smp nice balancing, certain scenarios can have
10015 * max load less than avg load (as we skip the groups at or below
10016 * its cpu_capacity, while calculating max_load).
10018 if (busiest->avg_load <= sds->avg_load ||
10019 local->avg_load >= sds->avg_load) {
10020 if (energy_aware()) {
10021 /* Misfitting tasks should be migrated in any case */
10022 if (busiest->group_type == group_misfit_task) {
10023 env->imbalance = busiest->group_misfit_task;
10028 * Busiest group is overloaded, local is not, use the spare
10029 * cycles to maximize throughput
10031 if (busiest->group_type == group_overloaded &&
10032 local->group_type <= group_misfit_task) {
10033 env->imbalance = busiest->load_per_task;
10038 env->imbalance = 0;
10039 return fix_small_imbalance(env, sds);
10043 * If there aren't any idle cpus, avoid creating some.
10045 if (busiest->group_type == group_overloaded &&
10046 local->group_type == group_overloaded) {
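/*
 * Estimate the surplus: treat each runnable task as wanting a full
 * unit of capacity and consider whatever exceeds the group's total
 * capacity as load we are willing to pull away; if there is no such
 * surplus this term imposes no limit (~0UL).
 */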
10047 load_above_capacity = busiest->sum_nr_running *
10049 if (load_above_capacity > busiest->group_capacity)
10050 load_above_capacity -= busiest->group_capacity;
10052 load_above_capacity = ~0UL;
10056 * We're trying to get all the cpus to the average_load, so we don't
10057 * want to push ourselves above the average load, nor do we wish to
10058 * reduce the max loaded cpu below the average load. At the same time,
10059 * we also don't want to reduce the group load below the group capacity
10060 * (so that we can implement power-savings policies etc). Thus we look
10061 * for the minimum possible imbalance.
10063 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
10065 /* How much load to actually move to equalise the imbalance */
10066 env->imbalance = min(
10067 max_pull * busiest->group_capacity,
10068 (sds->avg_load - local->avg_load) * local->group_capacity
10069 ) / SCHED_CAPACITY_SCALE;
10071 /* Boost imbalance to allow misfit task to be balanced. */
10072 if (energy_aware() && busiest->group_type == group_misfit_task)
10073 env->imbalance = max_t(long, env->imbalance,
10074 busiest->group_misfit_task);
10077 * If *imbalance is less than the average load per runnable task
10078 * there is no guarantee that any tasks will be moved, so we'll have
10079 * a think about bumping its value to force at least one task to be moved.
10082 if (env->imbalance < busiest->load_per_task)
10083 return fix_small_imbalance(env, sds);
10086 /******* find_busiest_group() helpers end here *********************/
10089 * find_busiest_group - Returns the busiest group within the sched_domain
10090 * if there is an imbalance. If there isn't an imbalance, and
10091 * the user has opted for power-savings, it returns a group whose
10092 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
10093 * such a group exists.
10095 * Also calculates the amount of weighted load which should be moved
10096 * to restore balance.
10098 * @env: The load balancing environment.
10100 * Return: - The busiest group if imbalance exists.
10101 * - If no imbalance and user has opted for power-savings balance,
10102 * return the least loaded group whose CPUs can be
10103 * put to idle by rebalancing its tasks onto our group.
10105 static struct sched_group *find_busiest_group(struct lb_env *env)
10107 struct sg_lb_stats *local, *busiest;
10108 struct sd_lb_stats sds;
10110 init_sd_lb_stats(&sds);
10113 * Compute the various statistics relevant to load balancing at
10116 update_sd_lb_stats(env, &sds);
10118 if (energy_aware() && !env->dst_rq->rd->overutilized)
10121 local = &sds.local_stat;
10122 busiest = &sds.busiest_stat;
10124 /* ASYM feature bypasses nice load balance check */
10125 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
10126 check_asym_packing(env, &sds))
10127 return sds.busiest;
10129 /* There is no busy sibling group to pull tasks from */
10130 if (!sds.busiest || busiest->sum_nr_running == 0)
10133 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10134 goto force_balance;
10136 if (bail_inter_cluster_balance(env, &sds))
10139 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
10140 / sds.total_capacity;
10143 * If the busiest group is imbalanced the below checks don't
10144 * work because they assume all things are equal, which typically
10145 * isn't true due to cpus_allowed constraints and the like.
10147 if (busiest->group_type == group_imbalanced)
10148 goto force_balance;
10151 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
10152 * capacities from resulting in underutilization due to avg_load.
10154 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
10155 busiest->group_no_capacity)
10156 goto force_balance;
10158 /* Misfitting tasks should be dealt with regardless of the avg load */
10159 if (energy_aware() && busiest->group_type == group_misfit_task) {
10160 goto force_balance;
10164 * If the local group is busier than the selected busiest group
10165 * don't try and pull any tasks.
10167 if (local->avg_load >= busiest->avg_load)
10171 * Don't pull any tasks if this group is already above the domain average load.
10174 if (local->avg_load >= sds.avg_load)
10177 if (env->idle == CPU_IDLE) {
10179 * This cpu is idle. If the busiest group is not overloaded
10180 * and there is no imbalance between this and busiest group
10181 * wrt idle cpus, it is balanced. The imbalance becomes
10182 * significant if the diff is greater than 1, otherwise we
10183 * might end up just moving the imbalance to another group
10185 if ((busiest->group_type != group_overloaded) &&
10186 (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
10187 !group_smaller_cpu_capacity(sds.busiest, sds.local))
10191 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
10192 * imbalance_pct to be conservative.
10194 if (100 * busiest->avg_load <=
10195 env->sd->imbalance_pct * local->avg_load)
10200 env->busiest_group_type = busiest->group_type;
10201 /* Looks like there is an imbalance. Compute it */
10202 calculate_imbalance(env, &sds);
10203 return sds.busiest;
10206 env->imbalance = 0;
10210 #ifdef CONFIG_SCHED_HMP
10211 static struct rq *find_busiest_queue_hmp(struct lb_env *env,
10212 struct sched_group *group)
10214 struct rq *busiest = NULL, *busiest_big = NULL;
10215 u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
10216 int max_nr_big = 0, nr_big;
10217 bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
10221 cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
10223 for_each_cpu(i, &cpus) {
10224 struct rq *rq = cpu_rq(i);
10225 u64 cumulative_runnable_avg =
10226 rq->hmp_stats.cumulative_runnable_avg;
10228 if (!cpumask_test_cpu(i, env->cpus))
10233 nr_big = nr_big_tasks(rq);
10234 if (nr_big > max_nr_big ||
10235 (nr_big > 0 && nr_big == max_nr_big &&
10236 cumulative_runnable_avg > max_runnable_avg_big)) {
10237 max_runnable_avg_big = cumulative_runnable_avg;
10239 max_nr_big = nr_big;
10244 if (cumulative_runnable_avg > max_runnable_avg) {
10245 max_runnable_avg = cumulative_runnable_avg;
10251 return busiest_big;
10253 env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
10257 static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
10258 struct sched_group *group)
10265 * find_busiest_queue - find the busiest runqueue among the cpus in group.
10267 static struct rq *find_busiest_queue(struct lb_env *env,
10268 struct sched_group *group)
10270 struct rq *busiest = NULL, *rq;
10271 unsigned long busiest_load = 0, busiest_capacity = 1;
10274 #ifdef CONFIG_SCHED_HMP
10275 return find_busiest_queue_hmp(env, group);
10278 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
10279 unsigned long capacity, wl;
10283 rt = fbq_classify_rq(rq);
10286 * We classify groups/runqueues into three groups:
10287 * - regular: there are !numa tasks
10288 * - remote: there are numa tasks that run on the 'wrong' node
10289 * - all: there is no distinction
10291 * In order to avoid migrating ideally placed numa tasks,
10292 * ignore those when there are better options.
10294 * If we ignore the actual busiest queue to migrate another
10295 * task, the next balance pass can still reduce the busiest
10296 * queue by moving tasks around inside the node.
10298 * If we cannot move enough load due to this classification
10299 * the next pass will adjust the group classification and
10300 * allow migration of more tasks.
10302 * Both cases only affect the total convergence complexity.
10304 if (rt > env->fbq_type)
10307 capacity = capacity_of(i);
10309 wl = weighted_cpuload(i);
10312 * When comparing with imbalance, use weighted_cpuload()
10313 * which is not scaled with the cpu capacity.
10316 if (rq->nr_running == 1 && wl > env->imbalance &&
10317 !check_cpu_capacity(rq, env->sd) &&
10318 env->busiest_group_type != group_misfit_task)
10322 * For the load comparisons with the other cpus, consider
10323 * the weighted_cpuload() scaled with the cpu capacity, so
10324 * that the load can be moved away from the cpu that is
10325 * potentially running at a lower capacity.
10327 * Thus we're looking for max(wl_i / capacity_i), crosswise
10328 * multiplication to rid ourselves of the division works out
10329 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
10330 * our previous maximum.
10332 if (wl * busiest_capacity > busiest_load * capacity) {
10334 busiest_capacity = capacity;
10343 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
10344 * the exact figure doesn't matter so long as it is large enough.
10346 #define MAX_PINNED_INTERVAL 16
10348 /* Working cpumask for load_balance and load_balance_newidle. */
10349 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
10351 #define NEED_ACTIVE_BALANCE_THRESHOLD 10
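/*
 * Active balancing (migrating the currently running task) is a last
 * resort: on top of sd->cache_nice_tries, this many further failed
 * balance attempts are required before need_active_balance() gives in.
 */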
10353 static int need_active_balance(struct lb_env *env)
10355 struct sched_domain *sd = env->sd;
10357 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10360 if (env->idle == CPU_NEWLY_IDLE) {
10363 * ASYM_PACKING needs to force migrate tasks from busy but
10364 * higher numbered CPUs in order to pack all tasks in the
10365 * lowest numbered CPUs.
10367 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
10372 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
10373 * It's worth migrating the task if the src_cpu's capacity is reduced
10374 * because of other sched_class activity or IRQs, provided more capacity stays
10375 * available on dst_cpu.
10377 if ((env->idle != CPU_NOT_IDLE) &&
10378 (env->src_rq->cfs.h_nr_running == 1)) {
10379 if ((check_cpu_capacity(env->src_rq, sd)) &&
10380 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
10384 if (energy_aware() &&
10385 (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
10386 ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
10387 env->src_rq->cfs.h_nr_running == 1 &&
10388 cpu_overutilized(env->src_cpu) &&
10389 !cpu_overutilized(env->dst_cpu)) {
10393 return unlikely(sd->nr_balance_failed >
10394 sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
10397 static int group_balance_cpu_not_isolated(struct sched_group *sg)
10401 cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
10402 cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
10403 return cpumask_first(&cpus);
10406 static int should_we_balance(struct lb_env *env)
10408 struct sched_group *sg = env->sd->groups;
10409 struct cpumask *sg_cpus, *sg_mask;
10410 int cpu, balance_cpu = -1;
10413 * In the newly idle case, we will allow all the cpus
10414 * to do the newly idle load balance.
10416 if (env->idle == CPU_NEWLY_IDLE)
10419 sg_cpus = sched_group_cpus(sg);
10420 sg_mask = sched_group_mask(sg);
10421 /* Try to find first idle cpu */
10422 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
10423 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
10431 if (balance_cpu == -1)
10432 balance_cpu = group_balance_cpu_not_isolated(sg);
10435 * First idle cpu or the first cpu (busiest) in this sched group
10436 * is eligible for doing load balancing at this and higher domains.
10438 return balance_cpu == env->dst_cpu;
10442 * Check this_cpu to ensure it is balanced within domain. Attempt to move
10443 * tasks if there is an imbalance.
10445 static int load_balance(int this_cpu, struct rq *this_rq,
10446 struct sched_domain *sd, enum cpu_idle_type idle,
10447 int *continue_balancing)
10449 int ld_moved = 0, cur_ld_moved, active_balance = 0;
10450 struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
10451 struct sched_group *group = NULL;
10452 struct rq *busiest = NULL;
10453 unsigned long flags;
10454 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
10456 struct lb_env env = {
10458 .dst_cpu = this_cpu,
10460 .dst_grpmask = sched_group_cpus(sd->groups),
10462 .loop_break = sched_nr_migrate_break,
10465 .tasks = LIST_HEAD_INIT(env.tasks),
10469 .busiest_nr_running = 0,
10470 .busiest_grp_capacity = 0,
10471 .boost_policy = sched_boost_policy(),
10475 * For NEWLY_IDLE load_balancing, we don't need to consider
10476 * other cpus in our group
10478 if (idle == CPU_NEWLY_IDLE)
10479 env.dst_grpmask = NULL;
10481 cpumask_copy(cpus, cpu_active_mask);
10483 schedstat_inc(sd, lb_count[idle]);
10486 if (!should_we_balance(&env)) {
10487 *continue_balancing = 0;
10491 group = find_busiest_group(&env);
10493 schedstat_inc(sd, lb_nobusyg[idle]);
10497 busiest = find_busiest_queue(&env, group);
10499 schedstat_inc(sd, lb_nobusyq[idle]);
10503 BUG_ON(busiest == env.dst_rq);
10505 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
10507 env.src_cpu = busiest->cpu;
10508 env.src_rq = busiest;
10511 if (busiest->nr_running > 1) {
10513 * Attempt to move tasks. If find_busiest_group has found
10514 * an imbalance but busiest->nr_running <= 1, the group is
10515 * still unbalanced. ld_moved simply stays zero, so it is
10516 * correctly treated as an imbalance.
10518 env.flags |= LBF_ALL_PINNED;
10519 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
10522 raw_spin_lock_irqsave(&busiest->lock, flags);
10523 update_rq_clock(busiest);
10525 /* The world might have changed. Validate assumptions */
10526 if (busiest->nr_running <= 1) {
10527 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10528 env.flags &= ~LBF_ALL_PINNED;
10533 * cur_ld_moved - load moved in current iteration
10534 * ld_moved - cumulative load moved across iterations
10536 cur_ld_moved = detach_tasks(&env);
10539 * We've detached some tasks from busiest_rq. Every
10540 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
10541 * unlock busiest->lock, and we are able to be sure
10542 * that nobody can manipulate the tasks in parallel.
10543 * See task_rq_lock() family for the details.
10546 raw_spin_unlock(&busiest->lock);
10548 if (cur_ld_moved) {
10549 attach_tasks(&env);
10550 ld_moved += cur_ld_moved;
10553 local_irq_restore(flags);
10555 if (env.flags & LBF_NEED_BREAK) {
10556 env.flags &= ~LBF_NEED_BREAK;
10561 * Revisit (affine) tasks on src_cpu that couldn't be moved to
10562 * us and move them to an alternate dst_cpu in our sched_group
10563 * where they can run. The upper limit on how many times we
10564 * iterate on the same src_cpu depends on the number of cpus in our sched_group.
10567 * This changes load balance semantics a bit on who can move
10568 * load to a given_cpu. In addition to the given_cpu itself
10569 * (or an ilb_cpu acting on its behalf where given_cpu is
10570 * nohz-idle), we now have balance_cpu in a position to move
10571 * load to given_cpu. In rare situations, this may cause
10572 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
10573 * _independently_ and at the _same_ time to move some load to
10574 * given_cpu) causing excess load to be moved to given_cpu.
10575 * This, however, should not happen often in practice and
10576 * moreover subsequent load balance cycles should correct the
10577 * excess load moved.
10579 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
10581 /* Prevent dst_cpu from being re-selected via env's cpus */
10582 cpumask_clear_cpu(env.dst_cpu, env.cpus);
10584 env.dst_rq = cpu_rq(env.new_dst_cpu);
10585 env.dst_cpu = env.new_dst_cpu;
10586 env.flags &= ~LBF_DST_PINNED;
10588 env.loop_break = sched_nr_migrate_break;
10591 * Go back to "more_balance" rather than "redo" since we
10592 * need to continue with the same src_cpu.
10598 * We failed to reach balance because of affinity.
10601 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10603 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
10604 *group_imbalance = 1;
10607 /* All tasks on this runqueue were pinned by CPU affinity */
10608 if (unlikely(env.flags & LBF_ALL_PINNED)) {
10609 cpumask_clear_cpu(cpu_of(busiest), cpus);
10610 if (!cpumask_empty(cpus)) {
10612 env.loop_break = sched_nr_migrate_break;
10615 goto out_all_pinned;
10621 if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
10622 schedstat_inc(sd, lb_failed[idle]);
10625 * Increment the failure counter only on periodic balance.
10626 * We do not want newidle balance, which can be very
10627 * frequent, to pollute the failure counter, causing
10628 * excessive cache_hot migrations and active balances.
10630 if (idle != CPU_NEWLY_IDLE &&
10631 !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
10632 if (env.src_grp_nr_running > 1)
10633 sd->nr_balance_failed++;
10636 if (need_active_balance(&env)) {
10637 raw_spin_lock_irqsave(&busiest->lock, flags);
10639 /* don't kick the active_load_balance_cpu_stop,
10640 * if the curr task on busiest cpu can't be
10641 * moved to this_cpu
10643 if (!cpumask_test_cpu(this_cpu,
10644 tsk_cpus_allowed(busiest->curr))) {
10645 raw_spin_unlock_irqrestore(&busiest->lock,
10647 env.flags |= LBF_ALL_PINNED;
10648 goto out_one_pinned;
10652 * ->active_balance synchronizes accesses to
10653 * ->active_balance_work. Once set, it's cleared
10654 * only after active load balance is finished.
10656 if (!busiest->active_balance &&
10657 !cpu_isolated(cpu_of(busiest))) {
10658 busiest->active_balance = 1;
10659 busiest->push_cpu = this_cpu;
10660 active_balance = 1;
10662 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10664 if (active_balance) {
10665 stop_one_cpu_nowait(cpu_of(busiest),
10666 active_load_balance_cpu_stop, busiest,
10667 &busiest->active_balance_work);
10668 *continue_balancing = 0;
10672 * We've kicked active balancing, reset the failure
10675 sd->nr_balance_failed =
10676 sd->cache_nice_tries +
10677 NEED_ACTIVE_BALANCE_THRESHOLD - 1;
10680 sd->nr_balance_failed = 0;
10682 /* Assumes one 'busiest' cpu that we pulled tasks from */
10683 if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
10684 int check_groups = !!(env.flags &
10685 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
10687 check_for_freq_change(this_rq, false, check_groups);
10688 check_for_freq_change(busiest, false, check_groups);
10690 check_for_freq_change(this_rq, true, false);
10693 if (likely(!active_balance)) {
10694 /* We were unbalanced, so reset the balancing interval */
10695 sd->balance_interval = sd->min_interval;
10698 * If we've begun active balancing, start to back off. This
10699 * case may not be covered by the all_pinned logic if there
10700 * is only 1 task on the busy runqueue (because we don't call detach_tasks).
10703 if (sd->balance_interval < sd->max_interval)
10704 sd->balance_interval *= 2;
10711 * We reach balance although we may have faced some affinity
10712 * constraints. Clear the imbalance flag if it was set.
10715 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10717 if (*group_imbalance)
10718 *group_imbalance = 0;
10723 * We reach balance because all tasks are pinned at this level so
10724 * we can't migrate them. Leave the imbalance flag set so the parent level
10725 * can try to migrate them.
10727 schedstat_inc(sd, lb_balanced[idle]);
10729 sd->nr_balance_failed = 0;
10732 /* tune up the balancing interval */
10733 if (((env.flags & LBF_ALL_PINNED) &&
10734 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10735 (sd->balance_interval < sd->max_interval))
10736 sd->balance_interval *= 2;
10740 trace_sched_load_balance(this_cpu, idle, *continue_balancing,
10741 group ? group->cpumask[0] : 0,
10742 busiest ? busiest->nr_running : 0,
10743 env.imbalance, env.flags, ld_moved,
10744 sd->balance_interval);
10748 static inline unsigned long
10749 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10751 unsigned long interval = sd->balance_interval;
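/* A busy CPU balances less often: stretch the interval by busy_factor. */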
10754 interval *= sd->busy_factor;
10756 /* scale ms to jiffies */
10757 interval = msecs_to_jiffies(interval);
10758 interval = clamp(interval, 1UL, max_load_balance_interval);
10764 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
10766 unsigned long interval, next;
10768 interval = get_sd_balance_interval(sd, cpu_busy);
10769 next = sd->last_balance + interval;
10771 if (time_after(*next_balance, next))
10772 *next_balance = next;
10776 * idle_balance is called by schedule() if this_cpu is about to become
10777 * idle. Attempts to pull tasks from other CPUs.
10779 static int idle_balance(struct rq *this_rq)
10781 unsigned long next_balance = jiffies + HZ;
10782 int this_cpu = this_rq->cpu;
10783 struct sched_domain *sd;
10784 int pulled_task = 0;
10787 if (cpu_isolated(this_cpu))
10790 idle_enter_fair(this_rq);
10793 * We must set idle_stamp _before_ calling idle_balance(), such that we
10794 * measure the duration of idle_balance() as idle time.
10796 this_rq->idle_stamp = rq_clock(this_rq);
10798 if (!energy_aware() &&
10799 (this_rq->avg_idle < sysctl_sched_migration_cost ||
10800 !this_rq->rd->overload)) {
10802 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10804 update_next_balance(sd, 0, &next_balance);
10810 raw_spin_unlock(&this_rq->lock);
10812 update_blocked_averages(this_cpu);
10814 for_each_domain(this_cpu, sd) {
10815 int continue_balancing = 1;
10816 u64 t0, domain_cost;
10818 if (!(sd->flags & SD_LOAD_BALANCE))
10821 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10822 update_next_balance(sd, 0, &next_balance);
10826 if (sd->flags & SD_BALANCE_NEWIDLE) {
10827 t0 = sched_clock_cpu(this_cpu);
10829 pulled_task = load_balance(this_cpu, this_rq,
10830 sd, CPU_NEWLY_IDLE,
10831 &continue_balancing);
10833 domain_cost = sched_clock_cpu(this_cpu) - t0;
10834 if (domain_cost > sd->max_newidle_lb_cost)
10835 sd->max_newidle_lb_cost = domain_cost;
10837 curr_cost += domain_cost;
10840 update_next_balance(sd, 0, &next_balance);
10843 * Stop searching for tasks to pull if there are
10844 * now runnable tasks on the balance rq or if
10845 * continue_balancing has been unset (only possible
10846 * due to active migration).
10848 if (pulled_task || this_rq->nr_running > 0 ||
10849 !continue_balancing)
10854 raw_spin_lock(&this_rq->lock);
10856 if (curr_cost > this_rq->max_idle_balance_cost)
10857 this_rq->max_idle_balance_cost = curr_cost;
10860 * While browsing the domains, we released the rq lock, a task could
10861 * have been enqueued in the meantime. Since we're not going idle,
10862 * pretend we pulled a task.
10864 if (this_rq->cfs.h_nr_running && !pulled_task)
10868 /* Move the next balance forward */
10869 if (time_after(this_rq->next_balance, next_balance))
10870 this_rq->next_balance = next_balance;
10872 /* Is there a task of a high priority class? */
10873 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10877 idle_exit_fair(this_rq);
10878 this_rq->idle_stamp = 0;
10881 return pulled_task;
10885 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
10886 * running tasks off the busiest CPU onto idle CPUs. It requires at
10887 * least 1 task to be running on each physical CPU where possible, and
10888 * avoids physical / logical imbalances.
10890 static int active_load_balance_cpu_stop(void *data)
10892 struct rq *busiest_rq = data;
10893 int busiest_cpu = cpu_of(busiest_rq);
10894 int target_cpu = busiest_rq->push_cpu;
10895 struct rq *target_rq = cpu_rq(target_cpu);
10896 struct sched_domain *sd = NULL;
10897 struct task_struct *p = NULL;
10898 struct task_struct *push_task = NULL;
10899 int push_task_detached = 0;
10900 struct lb_env env = {
10902 .dst_cpu = target_cpu,
10903 .dst_rq = target_rq,
10904 .src_cpu = busiest_rq->cpu,
10905 .src_rq = busiest_rq,
10907 .busiest_nr_running = 0,
10908 .busiest_grp_capacity = 0,
10911 .boost_policy = sched_boost_policy(),
10913 bool moved = false;
10915 raw_spin_lock_irq(&busiest_rq->lock);
10917 /* make sure the requested cpu hasn't gone down in the meantime */
10918 if (unlikely(busiest_cpu != smp_processor_id() ||
10919 !busiest_rq->active_balance))
10922 /* Is there any task to move? */
10923 if (busiest_rq->nr_running <= 1)
10927 * This condition is "impossible", if it occurs
10928 * we need to fix it. Originally reported by
10929 * Bjorn Helgaas on a 128-cpu setup.
10931 BUG_ON(busiest_rq == target_rq);
10933 push_task = busiest_rq->push_task;
10934 target_cpu = busiest_rq->push_cpu;
10936 if (task_on_rq_queued(push_task) &&
10937 push_task->state == TASK_RUNNING &&
10938 task_cpu(push_task) == busiest_cpu &&
10939 cpu_online(target_cpu)) {
10940 detach_task(push_task, &env);
10941 push_task_detached = 1;
10947 /* Search for an sd spanning us and the target CPU. */
10949 for_each_domain(target_cpu, sd) {
10950 if ((sd->flags & SD_LOAD_BALANCE) &&
10951 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10957 schedstat_inc(sd, alb_count);
10958 update_rq_clock(busiest_rq);
10960 p = detach_one_task(&env);
10962 schedstat_inc(sd, alb_pushed);
10965 schedstat_inc(sd, alb_failed);
10970 busiest_rq->active_balance = 0;
10971 push_task = busiest_rq->push_task;
10972 target_cpu = busiest_rq->push_cpu;
10975 busiest_rq->push_task = NULL;
10977 raw_spin_unlock(&busiest_rq->lock);
10980 if (push_task_detached)
10981 attach_one_task(target_rq, push_task);
10982 put_task_struct(push_task);
10983 clear_reserved(target_cpu);
10987 attach_one_task(target_rq, p);
10989 local_irq_enable();
10991 if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
10992 int check_groups = !!(env.flags &
10993 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
10994 check_for_freq_change(busiest_rq, false, check_groups);
10995 check_for_freq_change(target_rq, false, check_groups);
10996 } else if (moved) {
10997 check_for_freq_change(target_rq, true, false);
11003 static inline int on_null_domain(struct rq *rq)
11005 return unlikely(!rcu_dereference_sched(rq->sd));
11008 #ifdef CONFIG_NO_HZ_COMMON
11010 * idle load balancing details
11011 * - When one of the busy CPUs notices that idle rebalancing may be
11012 * needed, it will kick the idle load balancer, which then does idle
11013 * load balancing for all the idle CPUs.
11016 #ifdef CONFIG_SCHED_HMP
11017 static inline int find_new_hmp_ilb(int type)
11019 int call_cpu = raw_smp_processor_id();
11020 struct sched_domain *sd;
11025 /* Pick an idle cpu "closest" to call_cpu */
11026 for_each_domain(call_cpu, sd) {
11027 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
11028 sched_domain_span(sd)) {
11029 if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
11030 cpu_max_power_cost(ilb) <=
11031 cpu_max_power_cost(call_cpu))) {
11033 reset_balance_interval(ilb);
11042 #else /* CONFIG_SCHED_HMP */
11043 static inline int find_new_hmp_ilb(int type)
11047 #endif /* CONFIG_SCHED_HMP */
11049 static inline int find_new_ilb(int type)
11053 #ifdef CONFIG_SCHED_HMP
11054 return find_new_hmp_ilb(type);
11057 ilb = cpumask_first(nohz.idle_cpus_mask);
11059 if (ilb < nr_cpu_ids && idle_cpu(ilb))
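/*
 * Illustrative sketch only (hypothetical userspace analogue, not kernel
 * code): ignoring the HMP special case, picking the ILB target above
 * reduces to "first CPU in nohz.idle_cpus_mask that is still idle". A
 * simplified version over a plain bitmap, where a return value >= nr_cpus
 * means "nobody to kick":
 */
#if 0
static int example_pick_ilb(unsigned long idle_mask, int nr_cpus)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus && cpu < (int)(8 * sizeof(idle_mask)); cpu++) {
		if (idle_mask & (1UL << cpu))
			return cpu;	/* first idle CPU is the kick target */
	}

	return nr_cpus;			/* no idle CPU available */
}
#endif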
11066 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
11067 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
11068 * CPU (if there is one).
11070 static void nohz_balancer_kick(int type)
11074 nohz.next_balance++;
11076 ilb_cpu = find_new_ilb(type);
11078 if (ilb_cpu >= nr_cpu_ids)
11081 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
11084 * Use smp_send_reschedule() instead of resched_cpu().
11085 * This way we generate a sched IPI on the target cpu which
11086 * is idle. And the softirq performing nohz idle load balance
11087 * will be run before returning from the IPI.
11089 smp_send_reschedule(ilb_cpu);
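/*
 * Flow summary (descriptive only): the reschedule IPI brings the chosen
 * idle CPU out of its tickless state; on return from the IPI it raises
 * SCHED_SOFTIRQ, run_rebalance_domains() runs there, sees the
 * NOHZ_BALANCE_KICK bit set, and performs nohz_idle_balance() on behalf
 * of the remaining tick-stopped idle CPUs.
 */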
11093 void nohz_balance_clear_nohz_mask(int cpu)
11095 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
11096 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
11097 atomic_dec(&nohz.nr_cpus);
11101 static inline void nohz_balance_exit_idle(int cpu)
11103 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
11105 * Completely isolated CPUs never set NOHZ_TICK_STOPPED, so we must test it here.
11107 nohz_balance_clear_nohz_mask(cpu);
11108 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11112 static inline void set_cpu_sd_state_busy(void)
11114 struct sched_domain *sd;
11115 int cpu = smp_processor_id();
11118 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11120 if (!sd || !sd->nohz_idle)
11124 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
11129 void set_cpu_sd_state_idle(void)
11131 struct sched_domain *sd;
11132 int cpu = smp_processor_id();
11135 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11137 if (!sd || sd->nohz_idle)
11141 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
11147 * This routine will record that the cpu is going idle with tick stopped.
11148 * This info will be used in performing idle load balancing in the future.
11150 void nohz_balance_enter_idle(int cpu)
11153 * If this cpu is going down, then nothing needs to be done.
11155 if (!cpu_active(cpu))
11158 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
11162 * If we're a completely isolated CPU, we don't play.
11164 if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
11167 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
11168 atomic_inc(&nohz.nr_cpus);
11169 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11172 static int sched_ilb_notifier(struct notifier_block *nfb,
11173 unsigned long action, void *hcpu)
11175 switch (action & ~CPU_TASKS_FROZEN) {
11177 nohz_balance_exit_idle(smp_processor_id());
11180 return NOTIFY_DONE;
11185 static DEFINE_SPINLOCK(balancing);
11188 * Scale the max load_balance interval with the number of CPUs in the system.
11189 * This trades load-balance latency on larger machines for less cross talk.
11191 void update_max_interval(void)
11193 cpumask_t avail_mask;
11194 unsigned int available_cpus;
11196 cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
11197 available_cpus = cpumask_weight(&avail_mask);
11199 max_load_balance_interval = HZ*available_cpus/10;
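/*
 * Worked example (illustrative): with HZ == 250 and 8 online,
 * non-isolated CPUs, the cap is 250 * 8 / 10 = 200 jiffies (about
 * 800ms); with 64 such CPUs it grows to 1600 jiffies (about 6.4s).
 */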
11203 * It checks each scheduling domain to see if it is due to be balanced,
11204 * and initiates a balancing operation if so.
11206 * Balancing parameters are set up in init_sched_domains.
11208 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
11210 int continue_balancing = 1;
11212 unsigned long interval;
11213 struct sched_domain *sd;
11214 /* Earliest time when we have to do rebalance again */
11215 unsigned long next_balance = jiffies + 60*HZ;
11216 int update_next_balance = 0;
11217 int need_serialize, need_decay = 0;
11220 update_blocked_averages(cpu);
11223 for_each_domain(cpu, sd) {
11225 * Decay the newidle max times here because this is a regular
11226 * visit to all the domains. Decay ~1% per second.
11228 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
11229 sd->max_newidle_lb_cost =
11230 (sd->max_newidle_lb_cost * 253) / 256;
11231 sd->next_decay_max_lb_cost = jiffies + HZ;
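/*
 * Illustrative arithmetic: 253/256 is roughly 0.988, so each once-per-
 * second visit shaves about 1.2% off max_newidle_lb_cost; for example a
 * recorded cost of 500000ns decays to about 494140ns after one second
 * and to roughly half its value after about a minute.
 */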
11234 max_cost += sd->max_newidle_lb_cost;
11236 if (!(sd->flags & SD_LOAD_BALANCE))
11240 * Stop the load balance at this level. There is another
11241 * CPU in our sched group which is doing load balancing more actively.
11244 if (!continue_balancing) {
11250 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11252 need_serialize = sd->flags & SD_SERIALIZE;
11253 if (need_serialize) {
11254 if (!spin_trylock(&balancing))
11258 if (time_after_eq(jiffies, sd->last_balance + interval)) {
11259 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
11261 * The LBF_DST_PINNED logic could have changed
11262 * env->dst_cpu, so we can't know our idle
11263 * state even if we migrated tasks. Update it.
11265 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
11267 sd->last_balance = jiffies;
11268 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11270 if (need_serialize)
11271 spin_unlock(&balancing);
11273 if (time_after(next_balance, sd->last_balance + interval)) {
11274 next_balance = sd->last_balance + interval;
11275 update_next_balance = 1;
11280 * Ensure the rq-wide value also decays but keep it at a
11281 * reasonable floor to avoid funnies with rq->avg_idle.
11283 rq->max_idle_balance_cost =
11284 max((u64)sysctl_sched_migration_cost, max_cost);
11289 * next_balance will be updated only when there is a need.
11290 * When the cpu is attached to a null domain, for example, it will not be updated.
11293 if (likely(update_next_balance)) {
11294 rq->next_balance = next_balance;
11296 #ifdef CONFIG_NO_HZ_COMMON
11298 * If this CPU has been elected to perform the nohz idle
11299 * balance, the other idle CPUs have already rebalanced with
11300 * nohz_idle_balance() and nohz.next_balance has been
11301 * updated accordingly. This CPU is now running the idle load
11302 * balance for itself and we need to update the
11303 * nohz.next_balance accordingly.
11305 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
11306 nohz.next_balance = rq->next_balance;
11311 #ifdef CONFIG_NO_HZ_COMMON
11313 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
11314 * rebalancing for all the idle CPUs whose scheduler ticks are stopped.
11316 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
11318 int this_cpu = this_rq->cpu;
11321 /* Earliest time when we have to do rebalance again */
11322 unsigned long next_balance = jiffies + 60*HZ;
11323 int update_next_balance = 0;
11326 if (idle != CPU_IDLE ||
11327 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
11330 cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
11332 for_each_cpu(balance_cpu, &cpus) {
11333 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
11337 * If this cpu gets work to do, stop the load balancing
11338 * work being done for other cpus. Next load
11339 * balancing owner will pick it up.
11341 if (need_resched())
11344 rq = cpu_rq(balance_cpu);
11347 * If the time for the next balance is due, do the balance.
11350 if (time_after_eq(jiffies, rq->next_balance)) {
11351 raw_spin_lock_irq(&rq->lock);
11352 update_rq_clock(rq);
11353 update_idle_cpu_load(rq);
11354 raw_spin_unlock_irq(&rq->lock);
11355 rebalance_domains(rq, CPU_IDLE);
11358 if (time_after(next_balance, rq->next_balance)) {
11359 next_balance = rq->next_balance;
11360 update_next_balance = 1;
11365 * next_balance will be updated only when there is a need.
11366 * When the CPU is attached to a null domain, for example, it will not be updated.
11369 if (likely(update_next_balance))
11370 nohz.next_balance = next_balance;
11372 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
11375 #ifdef CONFIG_SCHED_HMP
11376 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11378 struct sched_domain *sd;
11381 if (rq->nr_running < 2)
11384 if (!sysctl_sched_restrict_cluster_spill ||
11385 sched_boost_policy() == SCHED_BOOST_ON_ALL)
11388 if (cpu_max_power_cost(cpu) == max_power_cost)
11392 sd = rcu_dereference_check_sched_domain(rq->sd);
11398 for_each_cpu(i, sched_domain_span(sd)) {
11399 if (cpu_load(i) < sched_spill_load &&
11400 cpu_rq(i)->nr_running <
11401 sysctl_sched_spill_nr_run) {
11402 /* Change the kick type to limit the balance to CPUs
11403 * that are of equal or lower capacity.
11405 *type = NOHZ_KICK_RESTRICT;
11413 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11419 static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
11421 unsigned long now = jiffies;
11424 * None are in tickless mode and hence there is no need for NOHZ idle load balancing.
11427 if (likely(!atomic_read(&nohz.nr_cpus)))
11430 #ifdef CONFIG_SCHED_HMP
11431 return _nohz_kick_needed_hmp(rq, cpu, type);
11434 if (time_before(now, nohz.next_balance))
11437 if (rq->nr_running >= 2 &&
11438 (!energy_aware() || cpu_overutilized(cpu)))
11441 /* Do idle load balance if there is a misfit task */
11442 if (energy_aware())
11443 return rq->misfit_task;
11445 return (rq->nr_running >= 2);
11449 * Current heuristic for kicking the idle load balancer in the presence
11450 * of an idle cpu in the system.
11451 * - This rq has more than one task.
11452 * - This rq has at least one CFS task and the capacity of the CPU is
11453 * significantly reduced because of RT tasks or IRQs.
11454 * - At the parent of the LLC scheduler domain level, this CPU's scheduler group
11455 * has multiple busy CPUs.
11456 * - For SD_ASYM_PACKING, if the lower-numbered CPUs in the scheduler
11457 * domain span are idle.
11459 static inline bool nohz_kick_needed(struct rq *rq, int *type)
11461 #ifndef CONFIG_SCHED_HMP
11462 struct sched_domain *sd;
11463 struct sched_group_capacity *sgc;
11469 if (unlikely(rq->idle_balance))
11473 * We may be recently in ticked or tickless idle mode. At the first
11474 * busy tick after returning from idle, we will update the busy stats.
11476 set_cpu_sd_state_busy();
11477 nohz_balance_exit_idle(cpu);
11479 if (_nohz_kick_needed(rq, cpu, type))
11482 #ifndef CONFIG_SCHED_HMP
11484 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11486 sgc = sd->groups->sgc;
11487 nr_busy = atomic_read(&sgc->nr_busy_cpus);
11496 sd = rcu_dereference(rq->sd);
11498 if ((rq->cfs.h_nr_running >= 1) &&
11499 check_cpu_capacity(rq, sd)) {
11505 sd = rcu_dereference(per_cpu(sd_asym, cpu));
11506 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
11507 sched_domain_span(sd)) < cpu)) {
11518 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
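/*
 * Simplified sketch of the non-HMP kick heuristic above (illustrative
 * only; the helper name and its parameters are hypothetical stand-ins,
 * and the SD_ASYM_PACKING case is omitted):
 */
#if 0
static bool example_should_kick_ilb(unsigned int nr_running,
				    unsigned int cfs_nr_running,
				    bool capacity_reduced_by_rt_irq,
				    unsigned int nr_busy_in_llc_parent_group)
{
	if (nr_running >= 2)
		return true;	/* more than one runnable task here */
	if (cfs_nr_running >= 1 && capacity_reduced_by_rt_irq)
		return true;	/* a CFS task is being squeezed by RT/IRQ time */
	return nr_busy_in_llc_parent_group > 1;
}
#endif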
11522 * run_rebalance_domains is triggered when needed from the scheduler tick.
11523 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
11525 static void run_rebalance_domains(struct softirq_action *h)
11527 struct rq *this_rq = this_rq();
11528 enum cpu_idle_type idle = this_rq->idle_balance ?
11529 CPU_IDLE : CPU_NOT_IDLE;
11532 * If this cpu has a pending nohz_balance_kick, then do the
11533 * balancing on behalf of the other idle cpus whose ticks are
11534 * stopped. Do nohz_idle_balance *before* rebalance_domains to
11535 * give the idle cpus a chance to load balance. Else we may
11536 * load balance only within the local sched_domain hierarchy
11537 * and abort nohz_idle_balance altogether if we pull some load.
11539 nohz_idle_balance(this_rq, idle);
11540 rebalance_domains(this_rq, idle);
11544 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
11546 void trigger_load_balance(struct rq *rq)
11548 int type = NOHZ_KICK_ANY;
11550 /* Don't need to rebalance while attached to NULL domain or the CPU is isolated.
11553 if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
11556 if (time_after_eq(jiffies, rq->next_balance))
11557 raise_softirq(SCHED_SOFTIRQ);
11558 #ifdef CONFIG_NO_HZ_COMMON
11559 if (nohz_kick_needed(rq, &type))
11560 nohz_balancer_kick(type);
11564 static void rq_online_fair(struct rq *rq)
11568 update_runtime_enabled(rq);
11571 static void rq_offline_fair(struct rq *rq)
11575 /* Ensure any throttled groups are reachable by pick_next_task */
11576 unthrottle_offline_cfs_rqs(rq);
11579 #endif /* CONFIG_SMP */
11582 * scheduler tick hitting a task of our scheduling class:
11584 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
11586 struct cfs_rq *cfs_rq;
11587 struct sched_entity *se = &curr->se;
11589 for_each_sched_entity(se) {
11590 cfs_rq = cfs_rq_of(se);
11591 entity_tick(cfs_rq, se, queued);
11594 if (static_branch_unlikely(&sched_numa_balancing))
11595 task_tick_numa(rq, curr);
11598 if (energy_aware() &&
11599 !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
11600 rq->rd->overutilized = true;
11601 trace_sched_overutilized(true);
11604 rq->misfit_task = !task_fits_max(curr, rq->cpu);
11610 * called on fork with the child task as argument from the parent's context
11611 * - child not yet on the tasklist
11612 * - preemption disabled
11614 static void task_fork_fair(struct task_struct *p)
11616 struct cfs_rq *cfs_rq;
11617 struct sched_entity *se = &p->se, *curr;
11618 struct rq *rq = this_rq();
11620 raw_spin_lock(&rq->lock);
11621 update_rq_clock(rq);
11623 cfs_rq = task_cfs_rq(current);
11624 curr = cfs_rq->curr;
11626 update_curr(cfs_rq);
11627 se->vruntime = curr->vruntime;
11629 place_entity(cfs_rq, se, 1);
11631 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11633 * Upon rescheduling, sched_class::put_prev_task() will place
11634 * 'current' within the tree based on its new key value.
11636 swap(curr->vruntime, se->vruntime);
11640 se->vruntime -= cfs_rq->min_vruntime;
11641 raw_spin_unlock(&rq->lock);
11645 * Priority of the task has changed. Check to see if we preempt
11646 * the current task.
11649 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11651 if (!task_on_rq_queued(p))
11655 * Reschedule if we are currently running on this runqueue and
11656 * our priority decreased, or if we are not currently running on
11657 * this runqueue and our priority is higher than the current's
11659 if (rq->curr == p) {
11660 if (p->prio > oldprio)
11663 check_preempt_curr(rq, p, 0);
11666 static inline bool vruntime_normalized(struct task_struct *p)
11668 struct sched_entity *se = &p->se;
11671 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11672 * the dequeue_entity(.flags=0) will already have normalized the vruntime.
11679 * When !on_rq, vruntime of the task has usually NOT been normalized.
11680 * But there are some cases where it has already been normalized:
11682 * - A forked child that is waiting to be woken up by
11683 * wake_up_new_task().
11684 * - A task that has been woken up by try_to_wake_up() and is
11685 * waiting to actually be woken up by sched_ttwu_pending().
11687 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
11693 #ifdef CONFIG_FAIR_GROUP_SCHED
11695 * Propagate the changes of the sched_entity across the tg tree to make it
11696 * visible to the root
11698 static void propagate_entity_cfs_rq(struct sched_entity *se)
11700 struct cfs_rq *cfs_rq;
11702 /* Start to propagate at parent */
11705 for_each_sched_entity(se) {
11706 cfs_rq = cfs_rq_of(se);
11708 if (cfs_rq_throttled(cfs_rq))
11711 update_load_avg(se, UPDATE_TG);
11715 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11718 static void detach_entity_cfs_rq(struct sched_entity *se)
11720 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11722 /* Catch up with the cfs_rq and remove our load when we leave */
11723 update_load_avg(se, 0);
11724 detach_entity_load_avg(cfs_rq, se);
11725 update_tg_load_avg(cfs_rq, false);
11726 propagate_entity_cfs_rq(se);
11729 static void attach_entity_cfs_rq(struct sched_entity *se)
11731 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11733 #ifdef CONFIG_FAIR_GROUP_SCHED
11735 * Since the real depth could have changed (only the FAIR
11736 * class maintains a depth value), reset the depth properly.
11738 se->depth = se->parent ? se->parent->depth + 1 : 0;
11741 /* Synchronize entity with its cfs_rq */
11742 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11743 attach_entity_load_avg(cfs_rq, se);
11744 update_tg_load_avg(cfs_rq, false);
11745 propagate_entity_cfs_rq(se);
11748 static void detach_task_cfs_rq(struct task_struct *p)
11750 struct sched_entity *se = &p->se;
11751 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11753 if (!vruntime_normalized(p)) {
11755 * Fix up our vruntime so that the current sleep doesn't
11756 * cause 'unlimited' sleep bonus.
11758 place_entity(cfs_rq, se, 0);
11759 se->vruntime -= cfs_rq->min_vruntime;
11762 detach_entity_cfs_rq(se);
11765 static void attach_task_cfs_rq(struct task_struct *p)
11767 struct sched_entity *se = &p->se;
11768 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11770 attach_entity_cfs_rq(se);
11772 if (!vruntime_normalized(p))
11773 se->vruntime += cfs_rq->min_vruntime;
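/*
 * Illustrative arithmetic for the -=/+= min_vruntime pair in
 * detach_task_cfs_rq() and attach_task_cfs_rq(): while a task is off a
 * runqueue its vruntime is kept relative, so it can be re-based on
 * whatever cfs_rq it lands on. Detaching with min_vruntime == 1000us and
 * vruntime == 1010us stores 10us; attaching to a cfs_rq whose
 * min_vruntime is 400us yields 410us, preserving the task's small lead
 * rather than its absolute position.
 */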
11776 static void switched_from_fair(struct rq *rq, struct task_struct *p)
11778 detach_task_cfs_rq(p);
11781 static void switched_to_fair(struct rq *rq, struct task_struct *p)
11783 attach_task_cfs_rq(p);
11785 if (task_on_rq_queued(p)) {
11787 * We were most likely switched from sched_rt, so
11788 * kick off the schedule if running, otherwise just see
11789 * if we can still preempt the current task.
11794 check_preempt_curr(rq, p, 0);
11798 /* Account for a task changing its policy or group.
11800 * This routine is mostly called to set cfs_rq->curr field when a task
11801 * migrates between groups/classes.
11803 static void set_curr_task_fair(struct rq *rq)
11805 struct sched_entity *se = &rq->curr->se;
11807 for_each_sched_entity(se) {
11808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11810 set_next_entity(cfs_rq, se);
11811 /* ensure bandwidth has been allocated on our new cfs_rq */
11812 account_cfs_rq_runtime(cfs_rq, 0);
11816 void init_cfs_rq(struct cfs_rq *cfs_rq)
11818 cfs_rq->tasks_timeline = RB_ROOT;
11819 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
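/*
 * Background note (commonly given rationale, stated here as an
 * assumption): (u64)(-(1LL << 20)) starts min_vruntime about 1ms below
 * the 64-bit wrap point, so vruntime overflows early in a runqueue's
 * life and comparisons that mishandle wrap-around show up quickly.
 */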
11820 #ifndef CONFIG_64BIT
11821 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11824 #ifdef CONFIG_FAIR_GROUP_SCHED
11825 cfs_rq->propagate_avg = 0;
11827 atomic_long_set(&cfs_rq->removed_load_avg, 0);
11828 atomic_long_set(&cfs_rq->removed_util_avg, 0);
11832 #ifdef CONFIG_FAIR_GROUP_SCHED
11833 static void task_set_group_fair(struct task_struct *p)
11835 struct sched_entity *se = &p->se;
11837 set_task_rq(p, task_cpu(p));
11838 se->depth = se->parent ? se->parent->depth + 1 : 0;
11841 static void task_move_group_fair(struct task_struct *p)
11843 detach_task_cfs_rq(p);
11844 set_task_rq(p, task_cpu(p));
11847 /* Tell se's cfs_rq has been changed -- migrated */
11848 p->se.avg.last_update_time = 0;
11850 attach_task_cfs_rq(p);
11853 static void task_change_group_fair(struct task_struct *p, int type)
11856 case TASK_SET_GROUP:
11857 task_set_group_fair(p);
11860 case TASK_MOVE_GROUP:
11861 task_move_group_fair(p);
11866 void free_fair_sched_group(struct task_group *tg)
11870 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11872 for_each_possible_cpu(i) {
11874 kfree(tg->cfs_rq[i]);
11883 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11885 struct sched_entity *se;
11886 struct cfs_rq *cfs_rq;
11890 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
11893 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
11897 tg->shares = NICE_0_LOAD;
11899 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11901 for_each_possible_cpu(i) {
11904 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11905 GFP_KERNEL, cpu_to_node(i));
11909 se = kzalloc_node(sizeof(struct sched_entity),
11910 GFP_KERNEL, cpu_to_node(i));
11914 init_cfs_rq(cfs_rq);
11915 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11916 init_entity_runnable_average(se);
11918 raw_spin_lock_irq(&rq->lock);
11919 post_init_entity_util_avg(se);
11920 raw_spin_unlock_irq(&rq->lock);
11931 void unregister_fair_sched_group(struct task_group *tg)
11933 unsigned long flags;
11937 for_each_possible_cpu(cpu) {
11939 remove_entity_load_avg(tg->se[cpu]);
11942 * Only empty task groups can be destroyed; so we can speculatively
11943 * check on_list without danger of it being re-added.
11945 if (!tg->cfs_rq[cpu]->on_list)
11950 raw_spin_lock_irqsave(&rq->lock, flags);
11951 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11952 raw_spin_unlock_irqrestore(&rq->lock, flags);
11956 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11957 struct sched_entity *se, int cpu,
11958 struct sched_entity *parent)
11960 struct rq *rq = cpu_rq(cpu);
11964 init_cfs_rq_runtime(cfs_rq);
11966 tg->cfs_rq[cpu] = cfs_rq;
11969 /* se could be NULL for root_task_group */
11974 se->cfs_rq = &rq->cfs;
11977 se->cfs_rq = parent->my_q;
11978 se->depth = parent->depth + 1;
11982 /* guarantee group entities always have weight */
11983 update_load_set(&se->load, NICE_0_LOAD);
11984 se->parent = parent;
11987 static DEFINE_MUTEX(shares_mutex);
11989 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11992 unsigned long flags;
11995 * We can't change the weight of the root cgroup.
12000 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
12002 mutex_lock(&shares_mutex);
12003 if (tg->shares == shares)
12006 tg->shares = shares;
12007 for_each_possible_cpu(i) {
12008 struct rq *rq = cpu_rq(i);
12009 struct sched_entity *se;
12012 /* Propagate contribution to hierarchy */
12013 raw_spin_lock_irqsave(&rq->lock, flags);
12015 /* Possible calls to update_curr() need rq clock */
12016 update_rq_clock(rq);
12017 for_each_sched_entity(se) {
12018 update_load_avg(se, UPDATE_TG);
12019 update_cfs_shares(se);
12021 raw_spin_unlock_irqrestore(&rq->lock, flags);
12025 mutex_unlock(&shares_mutex);
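/*
 * Usage note (illustrative, assuming the cgroup-v1 cpu controller
 * interface): this is the backend for writes to a group's cpu.shares
 * file, e.g.
 *
 *	echo 2048 > /sys/fs/cgroup/cpu/mygroup/cpu.shares
 *
 * which roughly doubles that group's weight relative to the 1024
 * default; the loop above then re-propagates the new weight through each
 * per-CPU group entity.
 */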
12028 #else /* CONFIG_FAIR_GROUP_SCHED */
12030 void free_fair_sched_group(struct task_group *tg) { }
12032 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12037 void unregister_fair_sched_group(struct task_group *tg) { }
12039 #endif /* CONFIG_FAIR_GROUP_SCHED */
12042 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
12044 struct sched_entity *se = &task->se;
12045 unsigned int rr_interval = 0;
12048 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise idle runqueue.
12051 if (rq->cfs.load.weight)
12052 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
12054 return rr_interval;
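/*
 * Illustrative userspace usage (not kernel code): for a SCHED_OTHER task
 * the sched_rr_get_interval() syscall ends up reporting the slice
 * computed above rather than a fixed round-robin quantum. Minimal sketch:
 */
#if 0
#include <stdio.h>
#include <sched.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 means "the calling task" */
	if (sched_rr_get_interval(0, &ts) == 0)
		printf("slice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);

	return 0;
}
#endif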
12058 * All the scheduling class methods:
12060 const struct sched_class fair_sched_class = {
12061 .next = &idle_sched_class,
12062 .enqueue_task = enqueue_task_fair,
12063 .dequeue_task = dequeue_task_fair,
12064 .yield_task = yield_task_fair,
12065 .yield_to_task = yield_to_task_fair,
12067 .check_preempt_curr = check_preempt_wakeup,
12069 .pick_next_task = pick_next_task_fair,
12070 .put_prev_task = put_prev_task_fair,
12073 .select_task_rq = select_task_rq_fair,
12074 .migrate_task_rq = migrate_task_rq_fair,
12076 .rq_online = rq_online_fair,
12077 .rq_offline = rq_offline_fair,
12079 .task_waking = task_waking_fair,
12080 .task_dead = task_dead_fair,
12081 .set_cpus_allowed = set_cpus_allowed_common,
12084 .set_curr_task = set_curr_task_fair,
12085 .task_tick = task_tick_fair,
12086 .task_fork = task_fork_fair,
12088 .prio_changed = prio_changed_fair,
12089 .switched_from = switched_from_fair,
12090 .switched_to = switched_to_fair,
12092 .get_rr_interval = get_rr_interval_fair,
12094 .update_curr = update_curr_fair,
12096 #ifdef CONFIG_FAIR_GROUP_SCHED
12097 .task_change_group = task_change_group_fair,
12099 #ifdef CONFIG_SCHED_HMP
12100 .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
12101 .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
12102 .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
12106 #ifdef CONFIG_SCHED_DEBUG
12107 void print_cfs_stats(struct seq_file *m, int cpu)
12109 struct cfs_rq *cfs_rq;
12112 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
12113 print_cfs_rq(m, cpu, cfs_rq);
12117 #ifdef CONFIG_NUMA_BALANCING
12118 void show_numa_stats(struct task_struct *p, struct seq_file *m)
12121 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
12123 for_each_online_node(node) {
12124 if (p->numa_faults) {
12125 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
12126 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
12128 if (p->numa_group) {
12129 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
12130 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
12132 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
12135 #endif /* CONFIG_NUMA_BALANCING */
12136 #endif /* CONFIG_SCHED_DEBUG */
12138 __init void init_sched_fair_class(void)
12141 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
12143 #ifdef CONFIG_NO_HZ_COMMON
12144 nohz.next_balance = jiffies;
12145 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
12146 cpu_notifier(sched_ilb_notifier, 0);