2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 #include <linux/module.h>
36 #include <trace/events/sched.h>
41 * Targeted preemption latency for CPU-bound tasks:
42 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
44 * NOTE: this latency value is not the same as the concept of
45 * 'timeslice length' - timeslices in CFS are of variable length
46 * and have no persistent notion like in traditional, time-slice
47 * based scheduling concepts.
49 * (to see the precise effective timeslice length of your workload,
50 * run vmstat and monitor the context-switches (cs) field)
52 unsigned int sysctl_sched_latency = 6000000ULL;
53 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
55 unsigned int sysctl_sched_sync_hint_enable = 1;
56 unsigned int sysctl_sched_cstate_aware = 1;
59 * The initial- and re-scaling of tunables is configurable
60 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
64 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
65 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
67 enum sched_tunable_scaling sysctl_sched_tunable_scaling
68 = SCHED_TUNABLESCALING_LOG;
71 * Minimal preemption granularity for CPU-bound tasks:
72 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
74 unsigned int sysctl_sched_min_granularity = 750000ULL;
75 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
78 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
80 static unsigned int sched_nr_latency = 8;
83 * After fork, child runs first. If set to 0 (default) then
84 * parent will (try to) run first.
86 unsigned int sysctl_sched_child_runs_first __read_mostly;
89 * SCHED_OTHER wake-up granularity.
90 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
92 * This option delays the preemption effects of decoupled workloads
93 * and reduces their over-scheduling. Synchronous workloads will still
94 * have immediate wakeup/sleep latencies.
96 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
97 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
99 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
102 * The exponential sliding window over which load is averaged for shares distribution.
106 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
108 #ifdef CONFIG_CFS_BANDWIDTH
110 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
111 * each time a cfs_rq requests quota.
113 * Note: in the case that the slice exceeds the runtime remaining (either due
114 * to consumption or the quota being specified to be smaller than the slice)
115 * we will always only issue the remaining available time.
117 * default: 5 msec, units: microseconds
119 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
123 * The margin used when comparing utilization with CPU capacity:
124 * util * margin < capacity * 1024
126 unsigned int capacity_margin = 1280; /* ~20% */
128 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
134 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
140 static inline void update_load_set(struct load_weight *lw, unsigned long w)
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the number of CPUs.
153 * This idea comes from the SD scheduler of Con Kolivas:
155 static unsigned int get_update_sysctl_factor(void)
157 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
160 switch (sysctl_sched_tunable_scaling) {
161 case SCHED_TUNABLESCALING_NONE:
164 case SCHED_TUNABLESCALING_LINEAR:
167 case SCHED_TUNABLESCALING_LOG:
169 factor = 1 + ilog2(cpus);
176 static void update_sysctl(void)
178 unsigned int factor = get_update_sysctl_factor();
180 #define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity);
183 SET_SYSCTL(sched_latency);
184 SET_SYSCTL(sched_wakeup_granularity);
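/*
 * Worked example (a sketch, assuming the default SCHED_TUNABLESCALING_LOG
 * policy and 8 online CPUs): get_update_sysctl_factor() clamps cpus to 8
 * and returns factor = 1 + ilog2(8) = 4, so the effective values become
 * sched_latency = 4 * 6ms = 24ms, sched_min_granularity = 4 * 0.75ms = 3ms
 * and sched_wakeup_granularity = 4 * 1ms = 4ms.
 */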
188 void sched_init_granularity(void)
193 #define WMULT_CONST (~0U)
194 #define WMULT_SHIFT 32
196 static void __update_inv_weight(struct load_weight *lw)
200 if (likely(lw->inv_weight))
203 w = scale_load_down(lw->weight);
205 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
207 else if (unlikely(!w))
208 lw->inv_weight = WMULT_CONST;
210 lw->inv_weight = WMULT_CONST / w;
214 * delta_exec * weight / lw.weight
216 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
218 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
219 * we're guaranteed shift stays positive because inv_weight is guaranteed to
220 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
222 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
223 * weight/lw.weight <= 1, and therefore our shift will also be positive.
225 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
227 u64 fact = scale_load_down(weight);
228 int shift = WMULT_SHIFT;
230 __update_inv_weight(lw);
232 if (unlikely(fact >> 32)) {
239 /* hint to use a 32x32->64 mul */
240 fact = (u64)(u32)fact * lw->inv_weight;
247 return mul_u64_u32_shr(delta_exec, fact, shift);
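/*
 * Worked example of the fixed-point math above (a sketch, not from the
 * original source): say weight = NICE_0_LOAD (1024 after scale_load_down())
 * and lw->weight = 2048 (two nice-0 entities). Then
 * lw->inv_weight = WMULT_CONST / 2048 = 2097151 and
 * fact = 1024 * 2097151, which still fits in 32 bits, so shift stays at
 * WMULT_SHIFT = 32 and delta_exec * fact >> 32 ~= delta_exec / 2, i.e.
 * the entity is credited half of the wall-clock delta, exactly as
 * weight/lw.weight = 1/2 demands.
 */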
251 static int active_load_balance_cpu_stop(void *data);
254 const struct sched_class fair_sched_class;
256 /**************************************************************
257 * CFS operations on generic schedulable entities:
260 #ifdef CONFIG_FAIR_GROUP_SCHED
262 /* cpu runqueue to which this cfs_rq is attached */
263 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
268 /* An entity is a task if it doesn't "own" a runqueue */
269 #define entity_is_task(se) (!se->my_q)
271 static inline struct task_struct *task_of(struct sched_entity *se)
273 #ifdef CONFIG_SCHED_DEBUG
274 WARN_ON_ONCE(!entity_is_task(se));
276 return container_of(se, struct task_struct, se);
279 /* Walk up scheduling entities hierarchy */
280 #define for_each_sched_entity(se) \
281 for (; se; se = se->parent)
283 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
288 /* runqueue on which this entity is (to be) queued */
289 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
294 /* runqueue "owned" by this group */
295 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
300 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
302 if (!cfs_rq->on_list) {
303 struct rq *rq = rq_of(cfs_rq);
304 int cpu = cpu_of(rq);
306 * Ensure we either appear before our parent (if already
307 * enqueued) or force our parent to appear after us when it is
308 * enqueued. The fact that we always enqueue bottom-up
309 * reduces this to two cases and a special case for the root
310 * cfs_rq. Furthermore, it also means that we will always reset
311 * tmp_alone_branch either when the branch is connected
312 * to a tree or when we reach the top of the tree.
314 if (cfs_rq->tg->parent &&
315 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
317 * If parent is already on the list, we add the child
318 * just before. Thanks to circular linked property of
319 * the list, this means to put the child at the tail
320 * of the list that starts by parent.
322 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
323 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
325 * The branch is now connected to its tree so we can
326 * reset tmp_alone_branch to the beginning of the list.
329 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
330 } else if (!cfs_rq->tg->parent) {
332 * cfs rq without parent should be put
333 * at the tail of the list.
335 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
336 &rq->leaf_cfs_rq_list);
338 * We have reached the top of the tree so we can reset
339 * tmp_alone_branch to the beginning of the list.
341 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
344 * The parent has not already been added so we want to
345 * make sure that it will be put after us.
346 * tmp_alone_branch points to the beginning of the branch
347 * where we will add parent.
349 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
350 rq->tmp_alone_branch);
352 * update tmp_alone_branch to point to the new beginning
355 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
362 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
364 if (cfs_rq->on_list) {
365 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
370 /* Iterate through all leaf cfs_rq's on a runqueue */
371 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
372 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
374 /* Do the two (enqueued) entities belong to the same group ? */
375 static inline struct cfs_rq *
376 is_same_group(struct sched_entity *se, struct sched_entity *pse)
378 if (se->cfs_rq == pse->cfs_rq)
384 static inline struct sched_entity *parent_entity(struct sched_entity *se)
390 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
392 int se_depth, pse_depth;
395 * preemption test can be made between sibling entities who are in the
396 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
397 * both tasks until we find their ancestors who are siblings of a common parent.
401 /* First walk up until both entities are at same depth */
402 se_depth = (*se)->depth;
403 pse_depth = (*pse)->depth;
405 while (se_depth > pse_depth) {
407 *se = parent_entity(*se);
410 while (pse_depth > se_depth) {
412 *pse = parent_entity(*pse);
415 while (!is_same_group(*se, *pse)) {
416 *se = parent_entity(*se);
417 *pse = parent_entity(*pse);
421 #else /* !CONFIG_FAIR_GROUP_SCHED */
423 static inline struct task_struct *task_of(struct sched_entity *se)
425 return container_of(se, struct task_struct, se);
428 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
430 return container_of(cfs_rq, struct rq, cfs);
433 #define entity_is_task(se) 1
435 #define for_each_sched_entity(se) \
436 for (; se; se = NULL)
438 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
440 return &task_rq(p)->cfs;
443 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
445 struct task_struct *p = task_of(se);
446 struct rq *rq = task_rq(p);
451 /* runqueue "owned" by this group */
452 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
461 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
465 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
466 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
468 static inline struct sched_entity *parent_entity(struct sched_entity *se)
474 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
478 #endif /* CONFIG_FAIR_GROUP_SCHED */
480 static __always_inline
481 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
483 /**************************************************************
484 * Scheduling class tree data structure manipulation methods:
487 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
489 s64 delta = (s64)(vruntime - max_vruntime);
491 max_vruntime = vruntime;
496 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
498 s64 delta = (s64)(vruntime - min_vruntime);
500 min_vruntime = vruntime;
505 static inline int entity_before(struct sched_entity *a,
506 struct sched_entity *b)
508 return (s64)(a->vruntime - b->vruntime) < 0;
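/*
 * Example of why the signed cast works (illustrative, not from the
 * original source): vruntime is an unsigned 64-bit count that may wrap.
 * If b->vruntime == U64_MAX - 100 and a->vruntime == 50 (i.e. a advanced
 * past the wrap point), then a->vruntime - b->vruntime == 151 in unsigned
 * arithmetic and (s64)151 > 0, so a is correctly ordered after b even
 * though a < b when compared as plain unsigned values.
 */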
511 static void update_min_vruntime(struct cfs_rq *cfs_rq)
513 u64 vruntime = cfs_rq->min_vruntime;
516 vruntime = cfs_rq->curr->vruntime;
518 if (cfs_rq->rb_leftmost) {
519 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
524 vruntime = se->vruntime;
526 vruntime = min_vruntime(vruntime, se->vruntime);
529 /* ensure we never gain time by being placed backwards. */
530 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
533 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
538 * Enqueue an entity into the rb-tree:
540 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
543 struct rb_node *parent = NULL;
544 struct sched_entity *entry;
548 * Find the right place in the rbtree:
552 entry = rb_entry(parent, struct sched_entity, run_node);
554 * We don't care about collisions. Nodes with
555 * the same key stay together.
557 if (entity_before(se, entry)) {
558 link = &parent->rb_left;
560 link = &parent->rb_right;
566 * Maintain a cache of leftmost tree entries (it is frequently used):
570 cfs_rq->rb_leftmost = &se->run_node;
572 rb_link_node(&se->run_node, parent, link);
573 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
576 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
578 if (cfs_rq->rb_leftmost == &se->run_node) {
579 struct rb_node *next_node;
581 next_node = rb_next(&se->run_node);
582 cfs_rq->rb_leftmost = next_node;
585 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
588 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
590 struct rb_node *left = cfs_rq->rb_leftmost;
595 return rb_entry(left, struct sched_entity, run_node);
598 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
600 struct rb_node *next = rb_next(&se->run_node);
605 return rb_entry(next, struct sched_entity, run_node);
608 #ifdef CONFIG_SCHED_DEBUG
609 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
611 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
616 return rb_entry(last, struct sched_entity, run_node);
619 /**************************************************************
620 * Scheduling class statistics methods:
623 int sched_proc_update_handler(struct ctl_table *table, int write,
624 void __user *buffer, size_t *lenp,
627 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
628 unsigned int factor = get_update_sysctl_factor();
633 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
634 sysctl_sched_min_granularity);
636 #define WRT_SYSCTL(name) \
637 (normalized_sysctl_##name = sysctl_##name / (factor))
638 WRT_SYSCTL(sched_min_granularity);
639 WRT_SYSCTL(sched_latency);
640 WRT_SYSCTL(sched_wakeup_granularity);
650 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
652 if (unlikely(se->load.weight != NICE_0_LOAD))
653 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
659 * The idea is to set a period in which each task runs once.
661 * When there are too many tasks (sched_nr_latency) we have to stretch
662 * this period because otherwise the slices get too small.
664 * p = (nr <= nl) ? l : l*nr/nl
666 static u64 __sched_period(unsigned long nr_running)
668 if (unlikely(nr_running > sched_nr_latency))
669 return nr_running * sysctl_sched_min_granularity;
671 return sysctl_sched_latency;
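/*
 * Worked example (a sketch, using the unscaled defaults above): with
 * sched_latency = 6ms, sched_min_granularity = 0.75ms and therefore
 * sched_nr_latency = 8, a runqueue with 5 tasks keeps the 6ms period
 * (each task runs once per 6ms), while 12 tasks stretch the period to
 * 12 * 0.75ms = 9ms so no slice drops below the minimum granularity.
 */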
675 * We calculate the wall-time slice from the period by taking a part
676 * proportional to the weight.
680 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
682 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
684 for_each_sched_entity(se) {
685 struct load_weight *load;
686 struct load_weight lw;
688 cfs_rq = cfs_rq_of(se);
689 load = &cfs_rq->load;
691 if (unlikely(!se->on_rq)) {
694 update_load_add(&lw, se->load.weight);
697 slice = __calc_delta(slice, se->load.weight, load);
703 * We calculate the vruntime slice of a to-be-inserted task.
707 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
709 return calc_delta_fair(sched_slice(cfs_rq, se), se);
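/*
 * Worked example (illustrative, assuming a flat hierarchy, unscaled
 * defaults and a nice-0 weight of 1024): two runnable nice-0 tasks give
 * cfs_rq->load.weight = 2048 and a 6ms period, so each task's slice is
 * 6ms * 1024/2048 = 3ms. For a nice-0 task calc_delta_fair() is the
 * identity, so its vruntime slice (sched_vslice) is also 3ms; a heavier
 * task would get a larger wall-clock slice but the same vruntime progress.
 */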
713 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
714 static unsigned long task_h_load(struct task_struct *p);
717 * We choose a half-life close to 1 scheduling period.
718 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
719 * dependent on this value.
721 #define LOAD_AVG_PERIOD 32
722 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
723 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
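/*
 * Background (hedged, not from the original source): the decay factor y
 * used by the load tracking code is defined by y^LOAD_AVG_PERIOD = 0.5,
 * i.e. y = 2^(-1/32) ~= 0.97857. The running sum 1024 * (1 + y + y^2 + ...)
 * converges towards roughly 1024 / (1 - y) ~= 47.8k; LOAD_AVG_MAX (47742)
 * is the value the integer decay tables actually settle at, and
 * LOAD_AVG_MAX_N (345) is the number of full 1024us periods needed to
 * reach it.
 */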
725 /* Give a new sched_entity initial runnable values so that it appears heavily loaded until its averages stabilize */
726 void init_entity_runnable_average(struct sched_entity *se)
728 struct sched_avg *sa = &se->avg;
730 sa->last_update_time = 0;
732 * sched_avg's period_contrib should be strictly less than 1024, so
733 * we give it 1023 to make sure it is almost a period (1024us), and
734 * will definitely be updated (after enqueue).
736 sa->period_contrib = 1023;
738 * Tasks are initialized with full load to be seen as heavy tasks until
739 * they get a chance to stabilize to their real load level.
740 * Group entities are initialized with zero load to reflect the fact that
741 * nothing has been attached to the task group yet.
743 if (entity_is_task(se))
744 sa->load_avg = scale_load_down(se->load.weight);
745 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
747 * In previous Android versions, we used to have:
748 * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
749 * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
750 * However, that functionality has been moved to enqueue.
751 * It is unclear if we should restore this in enqueue.
754 * At this point, util_avg won't be used in select_task_rq_fair anyway
758 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
761 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
762 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
763 static void attach_entity_cfs_rq(struct sched_entity *se);
764 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
767 * With new tasks being created, their initial util_avgs are extrapolated
768 * based on the cfs_rq's current util_avg:
770 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
772 * However, in many cases, the above util_avg does not give a desired
773 * value. Moreover, the sum of the util_avgs may be divergent, such
774 * as when the series is a harmonic series.
776 * To solve this problem, we also cap the util_avg of successive tasks to
777 * only 1/2 of the left utilization budget:
779 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
781 * where n denotes the nth task.
783 * For example, a simplest series from the beginning would be like:
785 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
786 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
788 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
789 * if util_avg > util_avg_cap.
791 void post_init_entity_util_avg(struct sched_entity *se)
793 struct cfs_rq *cfs_rq = cfs_rq_of(se);
794 struct sched_avg *sa = &se->avg;
795 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
798 if (cfs_rq->avg.util_avg != 0) {
799 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
800 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
802 if (sa->util_avg > cap)
808 * If we wish to restore tuning via setting initial util,
809 * this is where we should do it.
811 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
814 if (entity_is_task(se)) {
815 struct task_struct *p = task_of(se);
816 if (p->sched_class != &fair_sched_class) {
818 * For !fair tasks do:
820 update_cfs_rq_load_avg(now, cfs_rq, false);
821 attach_entity_load_avg(cfs_rq, se);
822 switched_from_fair(rq, p);
824 * such that the next switched_to_fair() has the
827 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
832 attach_entity_cfs_rq(se);
835 #else /* !CONFIG_SMP */
836 void init_entity_runnable_average(struct sched_entity *se)
839 void post_init_entity_util_avg(struct sched_entity *se)
842 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
845 #endif /* CONFIG_SMP */
848 * Update the current task's runtime statistics.
850 static void update_curr(struct cfs_rq *cfs_rq)
852 struct sched_entity *curr = cfs_rq->curr;
853 u64 now = rq_clock_task(rq_of(cfs_rq));
859 delta_exec = now - curr->exec_start;
860 if (unlikely((s64)delta_exec <= 0))
863 curr->exec_start = now;
865 schedstat_set(curr->statistics.exec_max,
866 max(delta_exec, curr->statistics.exec_max));
868 curr->sum_exec_runtime += delta_exec;
869 schedstat_add(cfs_rq, exec_clock, delta_exec);
871 curr->vruntime += calc_delta_fair(delta_exec, curr);
872 update_min_vruntime(cfs_rq);
874 if (entity_is_task(curr)) {
875 struct task_struct *curtask = task_of(curr);
877 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
878 cpuacct_charge(curtask, delta_exec);
879 account_group_exec_runtime(curtask, delta_exec);
882 account_cfs_rq_runtime(cfs_rq, delta_exec);
885 static void update_curr_fair(struct rq *rq)
887 update_curr(cfs_rq_of(&rq->curr->se));
890 #ifdef CONFIG_SCHEDSTATS
892 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
894 u64 wait_start = rq_clock(rq_of(cfs_rq));
896 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
897 likely(wait_start > se->statistics.wait_start))
898 wait_start -= se->statistics.wait_start;
900 se->statistics.wait_start = wait_start;
904 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
906 struct task_struct *p;
907 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
909 if (entity_is_task(se)) {
911 if (task_on_rq_migrating(p)) {
913 * Preserve migrating task's wait time so wait_start
914 * time stamp can be adjusted to accumulate wait time
915 * prior to migration.
917 se->statistics.wait_start = delta;
920 trace_sched_stat_wait(p, delta);
923 se->statistics.wait_max = max(se->statistics.wait_max, delta);
924 se->statistics.wait_count++;
925 se->statistics.wait_sum += delta;
926 se->statistics.wait_start = 0;
930 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
935 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
941 * Task is being enqueued - update stats:
943 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
946 * Are we enqueueing a waiting task? (for current tasks
947 * a dequeue/enqueue event is a NOP)
949 if (se != cfs_rq->curr)
950 update_stats_wait_start(cfs_rq, se);
954 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
957 * Mark the end of the wait period if dequeueing a waiting task:
960 if (se != cfs_rq->curr)
961 update_stats_wait_end(cfs_rq, se);
965 * We are picking a new current task - update its stats:
968 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
971 * We are starting a new run period:
973 se->exec_start = rq_clock_task(rq_of(cfs_rq));
976 /**************************************************
977 * Scheduling class queueing methods:
980 #ifdef CONFIG_NUMA_BALANCING
982 * Approximate time to scan a full NUMA task in ms. The task scan period is
983 * calculated based on the task's virtual memory size and
984 * numa_balancing_scan_size.
986 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
987 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
989 /* Portion of address space to scan in MB */
990 unsigned int sysctl_numa_balancing_scan_size = 256;
992 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
993 unsigned int sysctl_numa_balancing_scan_delay = 1000;
995 static unsigned int task_nr_scan_windows(struct task_struct *p)
997 unsigned long rss = 0;
998 unsigned long nr_scan_pages;
1001 * Calculations based on RSS as non-present and empty pages are skipped
1002 * by the PTE scanner, and NUMA hinting faults should be trapped based on resident pages.
1005 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1006 rss = get_mm_rss(p->mm);
1008 rss = nr_scan_pages;
1010 rss = round_up(rss, nr_scan_pages);
1011 return rss / nr_scan_pages;
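/*
 * Worked example (a sketch, assuming 4KB pages and the default
 * numa_balancing_scan_size of 256MB): nr_scan_pages = 256 << (20 - 12)
 * = 65536 pages. A task with 1GB of RSS (262144 pages) rounds up to
 * 262144 and yields 262144 / 65536 = 4 scan windows, i.e. four scan
 * passes are needed to cover its resident memory.
 */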
1014 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1015 #define MAX_SCAN_WINDOW 2560
1017 static unsigned int task_scan_min(struct task_struct *p)
1019 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1020 unsigned int scan, floor;
1021 unsigned int windows = 1;
1023 if (scan_size < MAX_SCAN_WINDOW)
1024 windows = MAX_SCAN_WINDOW / scan_size;
1025 floor = 1000 / windows;
1027 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1028 return max_t(unsigned int, floor, scan);
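/*
 * Worked example (illustrative, with default tunables): windows =
 * MAX_SCAN_WINDOW / 256 = 10, so floor = 1000 / 10 = 100ms. For the
 * 1GB-RSS task above (4 scan windows), scan = 1000 / 4 = 250ms, so the
 * minimum period is max(100, 250) = 250ms. Larger tasks get shorter
 * periods, but never below the 100ms floor, which caps the scan rate at
 * MAX_SCAN_WINDOW MB/sec.
 */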
1031 static unsigned int task_scan_max(struct task_struct *p)
1033 unsigned int smin = task_scan_min(p);
1036 /* Watch for min being lower than max due to floor calculations */
1037 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1038 return max(smin, smax);
1041 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1043 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1044 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1047 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1049 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1050 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1056 spinlock_t lock; /* nr_tasks, tasks */
1060 struct rcu_head rcu;
1061 nodemask_t active_nodes;
1062 unsigned long total_faults;
1064 * Faults_cpu is used to decide whether memory should move
1065 * towards the CPU. As a consequence, these stats are weighted
1066 * more by CPU use than by memory faults.
1068 unsigned long *faults_cpu;
1069 unsigned long faults[0];
1072 /* Shared or private faults. */
1073 #define NR_NUMA_HINT_FAULT_TYPES 2
1075 /* Memory and CPU locality */
1076 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1078 /* Averaged statistics, and temporary buffers. */
1079 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1081 pid_t task_numa_group_id(struct task_struct *p)
1083 return p->numa_group ? p->numa_group->gid : 0;
1087 * The averaged statistics, shared & private, memory & cpu,
1088 * occupy the first half of the array. The second half of the
1089 * array is for current counters, which are averaged into the
1090 * first set by task_numa_placement.
1092 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1094 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
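/*
 * Layout example (a sketch; assumes NUMA_MEM == 0 in the
 * numa_faults_stats enum): on a 2-node system the numa_faults array has
 * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids = 8 * 2 = 16 entries.
 * task_faults_idx(NUMA_MEM, 1, 1) = 2 * (0 * 2 + 1) + 1 = 3, i.e. the
 * private memory-fault counter for node 1 sits at offset 3; the buffer
 * counters (NUMA_MEMBUF/NUMA_CPUBUF) occupy the second half of the array.
 */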
1097 static inline unsigned long task_faults(struct task_struct *p, int nid)
1099 if (!p->numa_faults)
1102 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1103 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1106 static inline unsigned long group_faults(struct task_struct *p, int nid)
1111 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1112 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1115 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1117 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1118 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1121 /* Handle placement on systems where not all nodes are directly connected. */
1122 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1123 int maxdist, bool task)
1125 unsigned long score = 0;
1129 * All nodes are directly connected, and the same distance
1130 * from each other. No need for fancy placement algorithms.
1132 if (sched_numa_topology_type == NUMA_DIRECT)
1136 * This code is called for each node, introducing N^2 complexity,
1137 * which should be ok given the number of nodes rarely exceeds 8.
1139 for_each_online_node(node) {
1140 unsigned long faults;
1141 int dist = node_distance(nid, node);
1144 * The furthest away nodes in the system are not interesting
1145 * for placement; nid was already counted.
1147 if (dist == sched_max_numa_distance || node == nid)
1151 * On systems with a backplane NUMA topology, compare groups
1152 * of nodes, and move tasks towards the group with the most
1153 * memory accesses. When comparing two nodes at distance
1154 * "hoplimit", only nodes closer by than "hoplimit" are part
1155 * of each group. Skip other nodes.
1157 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1161 /* Add up the faults from nearby nodes. */
1163 faults = task_faults(p, node);
1165 faults = group_faults(p, node);
1168 * On systems with a glueless mesh NUMA topology, there are
1169 * no fixed "groups of nodes". Instead, nodes that are not
1170 * directly connected bounce traffic through intermediate
1171 * nodes; a numa_group can occupy any set of nodes.
1172 * The further away a node is, the less the faults count.
1173 * This seems to result in good task placement.
1175 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1176 faults *= (sched_max_numa_distance - dist);
1177 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1187 * These return the fraction of accesses done by a particular task, or
1188 * task group, on a particular numa node. The group weight is given a
1189 * larger multiplier, in order to group tasks together that are almost
1190 * evenly spread out between numa nodes.
1192 static inline unsigned long task_weight(struct task_struct *p, int nid,
1195 unsigned long faults, total_faults;
1197 if (!p->numa_faults)
1200 total_faults = p->total_numa_faults;
1205 faults = task_faults(p, nid);
1206 faults += score_nearby_nodes(p, nid, dist, true);
1208 return 1000 * faults / total_faults;
1211 static inline unsigned long group_weight(struct task_struct *p, int nid,
1214 unsigned long faults, total_faults;
1219 total_faults = p->numa_group->total_faults;
1224 faults = group_faults(p, nid);
1225 faults += score_nearby_nodes(p, nid, dist, false);
1227 return 1000 * faults / total_faults;
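/*
 * Worked example (illustrative, for a NUMA_DIRECT topology where
 * score_nearby_nodes() contributes nothing): a task with
 * total_numa_faults = 2000, of which 600 were on nid, gets
 * task_weight = 1000 * 600 / 2000 = 300, i.e. the weights are
 * per-mille fractions of the task's (or group's) total fault activity.
 */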
1230 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1231 int src_nid, int dst_cpu)
1233 struct numa_group *ng = p->numa_group;
1234 int dst_nid = cpu_to_node(dst_cpu);
1235 int last_cpupid, this_cpupid;
1237 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1240 * Multi-stage node selection is used in conjunction with a periodic
1241 * migration fault to build a temporal task<->page relation. By using
1242 * a two-stage filter we remove short/unlikely relations.
1244 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1245 * a task's usage of a particular page (n_p) per total usage of this
1246 * page (n_t) (in a given time-span) to a probability.
1248 * Our periodic faults will sample this probability and getting the
1249 * same result twice in a row, given these samples are fully
1250 * independent, is then given by P(n)^2, provided our sample period
1251 * is sufficiently short compared to the usage pattern.
1253 * This quadratic squishes small probabilities, making it less likely we
1254 * act on an unlikely task<->page relation.
1256 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1257 if (!cpupid_pid_unset(last_cpupid) &&
1258 cpupid_to_nid(last_cpupid) != dst_nid)
1261 /* Always allow migrate on private faults */
1262 if (cpupid_match_pid(p, last_cpupid))
1265 /* A shared fault, but p->numa_group has not been set up yet. */
1270 * Do not migrate if the destination is not a node that
1271 * is actively used by this numa group.
1273 if (!node_isset(dst_nid, ng->active_nodes))
1277 * Source is a node that is not actively used by this
1278 * numa group, while the destination is. Migrate.
1280 if (!node_isset(src_nid, ng->active_nodes))
1284 * Both source and destination are nodes in active
1285 * use by this numa group. Maximize memory bandwidth
1286 * by migrating from more heavily used groups, to less
1287 * heavily used ones, spreading the load around.
1288 * Use a 1/4 hysteresis to avoid spurious page movement.
1290 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
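/*
 * Worked example of the 1/4 hysteresis (a sketch): with
 * group_faults(src) = 400, the page only migrates when
 * group_faults(dst) < 400 * 3/4 = 300, so dst = 280 migrates while
 * dst = 320 stays put, avoiding ping-ponging pages between two nodes
 * the group uses almost equally.
 */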
1293 static unsigned long weighted_cpuload(const int cpu);
1294 static unsigned long source_load(int cpu, int type);
1295 static unsigned long target_load(int cpu, int type);
1296 static unsigned long capacity_of(int cpu);
1297 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1299 /* Cached statistics for all CPUs within a node */
1301 unsigned long nr_running;
1304 /* Total compute capacity of CPUs on a node */
1305 unsigned long compute_capacity;
1307 /* Approximate capacity in terms of runnable tasks on a node */
1308 unsigned long task_capacity;
1309 int has_free_capacity;
1313 * XXX borrowed from update_sg_lb_stats
1315 static void update_numa_stats(struct numa_stats *ns, int nid)
1317 int smt, cpu, cpus = 0;
1318 unsigned long capacity;
1320 memset(ns, 0, sizeof(*ns));
1321 for_each_cpu(cpu, cpumask_of_node(nid)) {
1322 struct rq *rq = cpu_rq(cpu);
1324 ns->nr_running += rq->nr_running;
1325 ns->load += weighted_cpuload(cpu);
1326 ns->compute_capacity += capacity_of(cpu);
1332 * If we raced with hotplug and there are no CPUs left in our mask
1333 * the @ns structure is NULL'ed and task_numa_compare() will
1334 * not find this node attractive.
1336 * We'll either bail at !has_free_capacity, or we'll detect a huge
1337 * imbalance and bail there.
1342 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1343 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1344 capacity = cpus / smt; /* cores */
1346 ns->task_capacity = min_t(unsigned, capacity,
1347 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1348 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
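/*
 * Worked example (illustrative): a node with 8 logical CPUs whose SMT
 * siblings each report capacity_of() ~= 589 gives compute_capacity ~=
 * 4712, so smt = DIV_ROUND_UP(1024 * 8, 4712) = 2 and capacity = 8 / 2
 * = 4 cores; task_capacity = min(4, DIV_ROUND_CLOSEST(4712, 1024)) = 4,
 * and the node reports free capacity only while fewer than 4 tasks run.
 */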
1351 struct task_numa_env {
1352 struct task_struct *p;
1354 int src_cpu, src_nid;
1355 int dst_cpu, dst_nid;
1357 struct numa_stats src_stats, dst_stats;
1362 struct task_struct *best_task;
1367 static void task_numa_assign(struct task_numa_env *env,
1368 struct task_struct *p, long imp)
1371 put_task_struct(env->best_task);
1374 env->best_imp = imp;
1375 env->best_cpu = env->dst_cpu;
1378 static bool load_too_imbalanced(long src_load, long dst_load,
1379 struct task_numa_env *env)
1382 long orig_src_load, orig_dst_load;
1383 long src_capacity, dst_capacity;
1386 * The load is corrected for the CPU capacity available on each node.
1389 * ------------ vs ---------
1390 * src_capacity dst_capacity
1392 src_capacity = env->src_stats.compute_capacity;
1393 dst_capacity = env->dst_stats.compute_capacity;
1395 /* We care about the slope of the imbalance, not the direction. */
1396 if (dst_load < src_load)
1397 swap(dst_load, src_load);
1399 /* Is the difference below the threshold? */
1400 imb = dst_load * src_capacity * 100 -
1401 src_load * dst_capacity * env->imbalance_pct;
1406 * The imbalance is above the allowed threshold.
1407 * Compare it with the old imbalance.
1409 orig_src_load = env->src_stats.load;
1410 orig_dst_load = env->dst_stats.load;
1412 if (orig_dst_load < orig_src_load)
1413 swap(orig_dst_load, orig_src_load);
1415 old_imb = orig_dst_load * src_capacity * 100 -
1416 orig_src_load * dst_capacity * env->imbalance_pct;
1418 /* Would this change make things worse? */
1419 return (imb > old_imb);
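/*
 * Worked example (a sketch, assuming equal src/dst compute capacity and
 * the default NUMA imbalance_pct of 112): with src_load = 1000 and
 * dst_load = 1100, imb = 1100 * 100 - 1000 * 112 < 0, so the move is
 * within the 12% allowance. With dst_load = 1200 the imbalance exceeds
 * the threshold and is only tolerated if it is no worse than the
 * pre-existing imbalance (old_imb).
 */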
1423 * This checks if the overall compute and NUMA accesses of the system would
1424 * be improved if the source task were migrated to the target dst_cpu, taking
1425 * into account that it might be best if the task running on the dst_cpu were
1426 * exchanged with the source task.
1428 static void task_numa_compare(struct task_numa_env *env,
1429 long taskimp, long groupimp)
1431 struct rq *src_rq = cpu_rq(env->src_cpu);
1432 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1433 struct task_struct *cur;
1434 long src_load, dst_load;
1436 long imp = env->p->numa_group ? groupimp : taskimp;
1438 int dist = env->dist;
1439 bool assigned = false;
1443 raw_spin_lock_irq(&dst_rq->lock);
1446 * No need to move the exiting task or idle task.
1448 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1452 * The task_struct must be protected here to protect the
1453 * p->numa_faults access in the task_weight since the
1454 * numa_faults could already be freed in the following path:
1455 * finish_task_switch()
1456 * --> put_task_struct()
1457 * --> __put_task_struct()
1458 * --> task_numa_free()
1460 get_task_struct(cur);
1463 raw_spin_unlock_irq(&dst_rq->lock);
1466 * Because we have preemption enabled we can get migrated around and
1467 * end up trying to select ourselves (current == env->p) as a swap candidate.
1473 * "imp" is the fault differential for the source task between the
1474 * source and destination node. Calculate the total differential for
1475 * the source task and potential destination task. The more negative
1476 * the value is, the more remote accesses would be expected to
1477 * be incurred if the tasks were swapped.
1480 /* Skip this swap candidate if cannot move to the source cpu */
1481 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1485 * If dst and source tasks are in the same NUMA group, or not
1486 * in any group then look only at task weights.
1488 if (cur->numa_group == env->p->numa_group) {
1489 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1490 task_weight(cur, env->dst_nid, dist);
1492 * Add some hysteresis to prevent swapping the
1493 * tasks within a group over tiny differences.
1495 if (cur->numa_group)
1499 * Compare the group weights. If a task is all by
1500 * itself (not part of a group), use the task weight
1503 if (cur->numa_group)
1504 imp += group_weight(cur, env->src_nid, dist) -
1505 group_weight(cur, env->dst_nid, dist);
1507 imp += task_weight(cur, env->src_nid, dist) -
1508 task_weight(cur, env->dst_nid, dist);
1512 if (imp <= env->best_imp && moveimp <= env->best_imp)
1516 /* Is there capacity at our destination? */
1517 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1518 !env->dst_stats.has_free_capacity)
1524 /* Balance doesn't matter much if we're running a task per cpu */
1525 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1526 dst_rq->nr_running == 1)
1530 * In the overloaded case, try and keep the load balanced.
1533 load = task_h_load(env->p);
1534 dst_load = env->dst_stats.load + load;
1535 src_load = env->src_stats.load - load;
1537 if (moveimp > imp && moveimp > env->best_imp) {
1539 * If the improvement from just moving env->p direction is
1540 * better than swapping tasks around, check if a move is
1541 * possible. Store a slightly smaller score than moveimp,
1542 * so an actually idle CPU will win.
1544 if (!load_too_imbalanced(src_load, dst_load, env)) {
1546 put_task_struct(cur);
1552 if (imp <= env->best_imp)
1556 load = task_h_load(cur);
1561 if (load_too_imbalanced(src_load, dst_load, env))
1565 * One idle CPU per node is evaluated for a task numa move.
1566 * Call select_idle_sibling to maybe find a better one.
1569 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1574 task_numa_assign(env, cur, imp);
1578 * The dst_rq->curr isn't assigned. The protection for task_struct is
1581 if (cur && !assigned)
1582 put_task_struct(cur);
1585 static void task_numa_find_cpu(struct task_numa_env *env,
1586 long taskimp, long groupimp)
1590 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1591 /* Skip this CPU if the source task cannot migrate */
1592 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1596 task_numa_compare(env, taskimp, groupimp);
1600 /* Only move tasks to a NUMA node less busy than the current node. */
1601 static bool numa_has_capacity(struct task_numa_env *env)
1603 struct numa_stats *src = &env->src_stats;
1604 struct numa_stats *dst = &env->dst_stats;
1606 if (src->has_free_capacity && !dst->has_free_capacity)
1610 * Only consider a task move if the source has a higher load
1611 * than the destination, corrected for CPU capacity on each node.
1613 * src->load dst->load
1614 * --------------------- vs ---------------------
1615 * src->compute_capacity dst->compute_capacity
1617 if (src->load * dst->compute_capacity * env->imbalance_pct >
1619 dst->load * src->compute_capacity * 100)
1625 static int task_numa_migrate(struct task_struct *p)
1627 struct task_numa_env env = {
1630 .src_cpu = task_cpu(p),
1631 .src_nid = task_node(p),
1633 .imbalance_pct = 112,
1639 struct sched_domain *sd;
1640 unsigned long taskweight, groupweight;
1642 long taskimp, groupimp;
1645 * Pick the lowest SD_NUMA domain, as that would have the smallest
1646 * imbalance and would be the first to start moving tasks about.
1648 * And we want to avoid any moving of tasks about, as that would create
1649 * random movement of tasks -- counter the numa conditions we're trying to fix.
1653 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1655 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1659 * Cpusets can break the scheduler domain tree into smaller
1660 * balance domains, some of which do not cross NUMA boundaries.
1661 * Tasks that are "trapped" in such domains cannot be migrated
1662 * elsewhere, so there is no point in (re)trying.
1664 if (unlikely(!sd)) {
1665 p->numa_preferred_nid = task_node(p);
1669 env.dst_nid = p->numa_preferred_nid;
1670 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1671 taskweight = task_weight(p, env.src_nid, dist);
1672 groupweight = group_weight(p, env.src_nid, dist);
1673 update_numa_stats(&env.src_stats, env.src_nid);
1674 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1675 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1676 update_numa_stats(&env.dst_stats, env.dst_nid);
1678 /* Try to find a spot on the preferred nid. */
1679 if (numa_has_capacity(&env))
1680 task_numa_find_cpu(&env, taskimp, groupimp);
1683 * Look at other nodes in these cases:
1684 * - there is no space available on the preferred_nid
1685 * - the task is part of a numa_group that is interleaved across
1686 * multiple NUMA nodes; in order to better consolidate the group,
1687 * we need to check other locations.
1689 if (env.best_cpu == -1 || (p->numa_group &&
1690 nodes_weight(p->numa_group->active_nodes) > 1)) {
1691 for_each_online_node(nid) {
1692 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1695 dist = node_distance(env.src_nid, env.dst_nid);
1696 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1698 taskweight = task_weight(p, env.src_nid, dist);
1699 groupweight = group_weight(p, env.src_nid, dist);
1702 /* Only consider nodes where both task and groups benefit */
1703 taskimp = task_weight(p, nid, dist) - taskweight;
1704 groupimp = group_weight(p, nid, dist) - groupweight;
1705 if (taskimp < 0 && groupimp < 0)
1710 update_numa_stats(&env.dst_stats, env.dst_nid);
1711 if (numa_has_capacity(&env))
1712 task_numa_find_cpu(&env, taskimp, groupimp);
1717 * If the task is part of a workload that spans multiple NUMA nodes,
1718 * and is migrating into one of the workload's active nodes, remember
1719 * this node as the task's preferred numa node, so the workload can settle down.
1721 * A task that migrated to a second choice node will be better off
1722 * trying for a better one later. Do not set the preferred node here.
1724 if (p->numa_group) {
1725 if (env.best_cpu == -1)
1730 if (node_isset(nid, p->numa_group->active_nodes))
1731 sched_setnuma(p, env.dst_nid);
1734 /* No better CPU than the current one was found. */
1735 if (env.best_cpu == -1)
1739 * Reset the scan period if the task is being rescheduled on an
1740 * alternative node to recheck if the tasks is now properly placed.
1742 p->numa_scan_period = task_scan_min(p);
1744 if (env.best_task == NULL) {
1745 ret = migrate_task_to(p, env.best_cpu);
1747 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1751 ret = migrate_swap(p, env.best_task);
1753 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1754 put_task_struct(env.best_task);
1758 /* Attempt to migrate a task to a CPU on the preferred node. */
1759 static void numa_migrate_preferred(struct task_struct *p)
1761 unsigned long interval = HZ;
1763 /* This task has no NUMA fault statistics yet */
1764 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1767 /* Periodically retry migrating the task to the preferred node */
1768 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1769 p->numa_migrate_retry = jiffies + interval;
1771 /* Success if task is already running on preferred CPU */
1772 if (task_node(p) == p->numa_preferred_nid)
1775 /* Otherwise, try migrate to a CPU on the preferred node */
1776 task_numa_migrate(p);
1780 * Find the nodes on which the workload is actively running. We do this by
1781 * tracking the nodes from which NUMA hinting faults are triggered. This can
1782 * be different from the set of nodes where the workload's memory is currently located.
1785 * The bitmask is used to make smarter decisions on when to do NUMA page
1786 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1787 * are added when they cause over 6/16 of the maximum number of faults, but
1788 * only removed when they drop below 3/16.
1790 static void update_numa_active_node_mask(struct numa_group *numa_group)
1792 unsigned long faults, max_faults = 0;
1795 for_each_online_node(nid) {
1796 faults = group_faults_cpu(numa_group, nid);
1797 if (faults > max_faults)
1798 max_faults = faults;
1801 for_each_online_node(nid) {
1802 faults = group_faults_cpu(numa_group, nid);
1803 if (!node_isset(nid, numa_group->active_nodes)) {
1804 if (faults > max_faults * 6 / 16)
1805 node_set(nid, numa_group->active_nodes);
1806 } else if (faults < max_faults * 3 / 16)
1807 node_clear(nid, numa_group->active_nodes);
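/*
 * Worked example of the 6/16 / 3/16 hysteresis (illustrative): if the
 * busiest node recorded max_faults = 1600 CPU faults, a node is added to
 * active_nodes once it accounts for more than 600 faults and only
 * dropped again when it falls below 300, so nodes hovering around the
 * threshold do not flap in and out of the mask.
 */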
1812 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1813 * increments. The more local the fault statistics are, the higher the scan
1814 * period will be for the next scan window. If local/(local+remote) ratio is
1815 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1816 * the scan period will decrease. Aim for 70% local accesses.
1818 #define NUMA_PERIOD_SLOTS 10
1819 #define NUMA_PERIOD_THRESHOLD 7
1822 * Increase the scan period (slow down scanning) if the majority of
1823 * our memory is already on our local node, or if the majority of
1824 * the page accesses are shared with other processes.
1825 * Otherwise, decrease the scan period.
1827 static void update_task_scan_period(struct task_struct *p,
1828 unsigned long shared, unsigned long private)
1830 unsigned int period_slot;
1834 unsigned long remote = p->numa_faults_locality[0];
1835 unsigned long local = p->numa_faults_locality[1];
1838 * If there were no recorded hinting faults then either the task is
1839 * completely idle or all activity is in areas that are not of interest
1840 * to automatic numa balancing. Related to that, if there were failed
1841 * migrations then it implies we are migrating too quickly or the local
1842 * node is overloaded. In either case, scan more slowly.
1844 if (local + shared == 0 || p->numa_faults_locality[2]) {
1845 p->numa_scan_period = min(p->numa_scan_period_max,
1846 p->numa_scan_period << 1);
1848 p->mm->numa_next_scan = jiffies +
1849 msecs_to_jiffies(p->numa_scan_period);
1855 * Prepare to scale scan period relative to the current period.
1856 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1857 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1858 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1860 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1861 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1862 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1863 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1866 diff = slot * period_slot;
1868 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1871 * Scale scan rate increases based on sharing. There is an
1872 * inverse relationship between the degree of sharing and
1873 * the adjustment made to the scanning period. Broadly
1874 * speaking, the intent is that there is little point
1875 * scanning faster if shared accesses dominate, as it may
1876 * simply bounce migrations uselessly.
1878 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1879 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1882 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1883 task_scan_min(p), task_scan_max(p));
1884 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
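/*
 * Worked example (a sketch): with numa_scan_period = 1000ms,
 * period_slot = DIV_ROUND_UP(1000, 10) = 100. If 900 of 1000 hinting
 * faults were local, ratio = 9, slot = 9 - 7 = 2 and diff = 200ms.
 * With private = shared (say 50 each), the sharing ratio is
 * DIV_ROUND_UP(50 * 10, 101) = 5, so diff is halved to 100ms and the
 * next period becomes clamp(1100ms, ...): mostly-local access slows
 * scanning down, while heavy sharing damps the adjustment.
 */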
1888 * Get the fraction of time the task has been running since the last
1889 * NUMA placement cycle. The scheduler keeps similar statistics, but
1890 * decays those on a 32ms period, which is orders of magnitude off
1891 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1892 * stats only if the task is so new there are no NUMA statistics yet.
1894 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1896 u64 runtime, delta, now;
1897 /* Use the start of this time slice to avoid calculations. */
1898 now = p->se.exec_start;
1899 runtime = p->se.sum_exec_runtime;
1901 if (p->last_task_numa_placement) {
1902 delta = runtime - p->last_sum_exec_runtime;
1903 *period = now - p->last_task_numa_placement;
1905 /* Avoid time going backwards, prevent potential divide error: */
1906 if (unlikely((s64)*period < 0))
1909 delta = p->se.avg.load_sum / p->se.load.weight;
1910 *period = LOAD_AVG_MAX;
1913 p->last_sum_exec_runtime = runtime;
1914 p->last_task_numa_placement = now;
1920 * Determine the preferred nid for a task in a numa_group. This needs to
1921 * be done in a way that produces consistent results with group_weight,
1922 * otherwise workloads might not converge.
1924 static int preferred_group_nid(struct task_struct *p, int nid)
1929 /* Direct connections between all NUMA nodes. */
1930 if (sched_numa_topology_type == NUMA_DIRECT)
1934 * On a system with glueless mesh NUMA topology, group_weight
1935 * scores nodes according to the number of NUMA hinting faults on
1936 * both the node itself, and on nearby nodes.
1938 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1939 unsigned long score, max_score = 0;
1940 int node, max_node = nid;
1942 dist = sched_max_numa_distance;
1944 for_each_online_node(node) {
1945 score = group_weight(p, node, dist);
1946 if (score > max_score) {
1955 * Finding the preferred nid in a system with NUMA backplane
1956 * interconnect topology is more involved. The goal is to locate
1957 * tasks from numa_groups near each other in the system, and
1958 * untangle workloads from different sides of the system. This requires
1959 * searching down the hierarchy of node groups, recursively searching
1960 * inside the highest scoring group of nodes. The nodemask tricks
1961 * keep the complexity of the search down.
1963 nodes = node_online_map;
1964 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1965 unsigned long max_faults = 0;
1966 nodemask_t max_group = NODE_MASK_NONE;
1969 /* Are there nodes at this distance from each other? */
1970 if (!find_numa_distance(dist))
1973 for_each_node_mask(a, nodes) {
1974 unsigned long faults = 0;
1975 nodemask_t this_group;
1976 nodes_clear(this_group);
1978 /* Sum group's NUMA faults; includes a==b case. */
1979 for_each_node_mask(b, nodes) {
1980 if (node_distance(a, b) < dist) {
1981 faults += group_faults(p, b);
1982 node_set(b, this_group);
1983 node_clear(b, nodes);
1987 /* Remember the top group. */
1988 if (faults > max_faults) {
1989 max_faults = faults;
1990 max_group = this_group;
1992 * subtle: at the smallest distance there is
1993 * just one node left in each "group", the
1994 * winner is the preferred nid.
1999 /* Next round, evaluate the nodes within max_group. */
2007 static void task_numa_placement(struct task_struct *p)
2009 int seq, nid, max_nid = -1, max_group_nid = -1;
2010 unsigned long max_faults = 0, max_group_faults = 0;
2011 unsigned long fault_types[2] = { 0, 0 };
2012 unsigned long total_faults;
2013 u64 runtime, period;
2014 spinlock_t *group_lock = NULL;
2017 * The p->mm->numa_scan_seq field gets updated without
2018 * exclusive access. Use READ_ONCE() here to ensure
2019 * that the field is read in a single access:
2021 seq = READ_ONCE(p->mm->numa_scan_seq);
2022 if (p->numa_scan_seq == seq)
2024 p->numa_scan_seq = seq;
2025 p->numa_scan_period_max = task_scan_max(p);
2027 total_faults = p->numa_faults_locality[0] +
2028 p->numa_faults_locality[1];
2029 runtime = numa_get_avg_runtime(p, &period);
2031 /* If the task is part of a group prevent parallel updates to group stats */
2032 if (p->numa_group) {
2033 group_lock = &p->numa_group->lock;
2034 spin_lock_irq(group_lock);
2037 /* Find the node with the highest number of faults */
2038 for_each_online_node(nid) {
2039 /* Keep track of the offsets in numa_faults array */
2040 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2041 unsigned long faults = 0, group_faults = 0;
2044 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2045 long diff, f_diff, f_weight;
2047 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2048 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2049 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2050 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2052 /* Decay existing window, copy faults since last scan */
2053 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2054 fault_types[priv] += p->numa_faults[membuf_idx];
2055 p->numa_faults[membuf_idx] = 0;
2058 * Normalize the faults_from, so all tasks in a group
2059 * count according to CPU use, instead of by the raw
2060 * number of faults. Tasks with little runtime have
2061 * little over-all impact on throughput, and thus their
2062 * faults are less important.
2064 f_weight = div64_u64(runtime << 16, period + 1);
2065 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2067 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2068 p->numa_faults[cpubuf_idx] = 0;
2070 p->numa_faults[mem_idx] += diff;
2071 p->numa_faults[cpu_idx] += f_diff;
2072 faults += p->numa_faults[mem_idx];
2073 p->total_numa_faults += diff;
2074 if (p->numa_group) {
2076 * safe because we can only change our own group
2078 * mem_idx represents the offset for a given
2079 * nid and priv in a specific region because it
2080 * is at the beginning of the numa_faults array.
2082 p->numa_group->faults[mem_idx] += diff;
2083 p->numa_group->faults_cpu[mem_idx] += f_diff;
2084 p->numa_group->total_faults += diff;
2085 group_faults += p->numa_group->faults[mem_idx];
2089 if (faults > max_faults) {
2090 max_faults = faults;
2094 if (group_faults > max_group_faults) {
2095 max_group_faults = group_faults;
2096 max_group_nid = nid;
2100 update_task_scan_period(p, fault_types[0], fault_types[1]);
2102 if (p->numa_group) {
2103 update_numa_active_node_mask(p->numa_group);
2104 spin_unlock_irq(group_lock);
2105 max_nid = preferred_group_nid(p, max_group_nid);
2109 /* Set the new preferred node */
2110 if (max_nid != p->numa_preferred_nid)
2111 sched_setnuma(p, max_nid);
2113 if (task_node(p) != p->numa_preferred_nid)
2114 numa_migrate_preferred(p);
2118 static inline int get_numa_group(struct numa_group *grp)
2120 return atomic_inc_not_zero(&grp->refcount);
2123 static inline void put_numa_group(struct numa_group *grp)
2125 if (atomic_dec_and_test(&grp->refcount))
2126 kfree_rcu(grp, rcu);
2129 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2132 struct numa_group *grp, *my_grp;
2133 struct task_struct *tsk;
2135 int cpu = cpupid_to_cpu(cpupid);
2138 if (unlikely(!p->numa_group)) {
2139 unsigned int size = sizeof(struct numa_group) +
2140 4*nr_node_ids*sizeof(unsigned long);
2142 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2146 atomic_set(&grp->refcount, 1);
2147 spin_lock_init(&grp->lock);
2149 /* Second half of the array tracks nids where faults happen */
2150 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2153 node_set(task_node(current), grp->active_nodes);
2155 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2156 grp->faults[i] = p->numa_faults[i];
2158 grp->total_faults = p->total_numa_faults;
2161 rcu_assign_pointer(p->numa_group, grp);
2165 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2167 if (!cpupid_match_pid(tsk, cpupid))
2170 grp = rcu_dereference(tsk->numa_group);
2174 my_grp = p->numa_group;
2179 * Only join the other group if it's bigger; if we're the bigger group,
2180 * the other task will join us.
2182 if (my_grp->nr_tasks > grp->nr_tasks)
2186 * Tie-break on the grp address.
2188 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2191 /* Always join threads in the same process. */
2192 if (tsk->mm == current->mm)
2195 /* Simple filter to avoid false positives due to PID collisions */
2196 if (flags & TNF_SHARED)
2199 /* Update priv based on whether false sharing was detected */
2202 if (join && !get_numa_group(grp))
2210 BUG_ON(irqs_disabled());
2211 double_lock_irq(&my_grp->lock, &grp->lock);
2213 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2214 my_grp->faults[i] -= p->numa_faults[i];
2215 grp->faults[i] += p->numa_faults[i];
2217 my_grp->total_faults -= p->total_numa_faults;
2218 grp->total_faults += p->total_numa_faults;
2223 spin_unlock(&my_grp->lock);
2224 spin_unlock_irq(&grp->lock);
2226 rcu_assign_pointer(p->numa_group, grp);
2228 put_numa_group(my_grp);
2237 * Get rid of NUMA statistics associated with a task (either current or dead).
2238 * If @final is set, the task is dead and has reached refcount zero, so we can
2239 * safely free all relevant data structures. Otherwise, there might be
2240 * concurrent reads from places like load balancing and procfs, and we should
2241 * reset the data back to default state without freeing ->numa_faults.
2243 void task_numa_free(struct task_struct *p, bool final)
2245 struct numa_group *grp = p->numa_group;
2246 unsigned long *numa_faults = p->numa_faults;
2247 unsigned long flags;
2254 spin_lock_irqsave(&grp->lock, flags);
2255 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2256 grp->faults[i] -= p->numa_faults[i];
2257 grp->total_faults -= p->total_numa_faults;
2260 spin_unlock_irqrestore(&grp->lock, flags);
2261 RCU_INIT_POINTER(p->numa_group, NULL);
2262 put_numa_group(grp);
2266 p->numa_faults = NULL;
2269 p->total_numa_faults = 0;
2270 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2276 * Got a PROT_NONE fault for a page on @node.
2278 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2280 struct task_struct *p = current;
2281 bool migrated = flags & TNF_MIGRATED;
2282 int cpu_node = task_node(current);
2283 int local = !!(flags & TNF_FAULT_LOCAL);
2286 if (!static_branch_likely(&sched_numa_balancing))
2289 /* for example, ksmd faulting in a user's mm */
2293 /* Allocate buffer to track faults on a per-node basis */
2294 if (unlikely(!p->numa_faults)) {
2295 int size = sizeof(*p->numa_faults) *
2296 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2298 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2299 if (!p->numa_faults)
2302 p->total_numa_faults = 0;
2303 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2307 * First accesses are treated as private, otherwise consider accesses
2308 * to be private if the accessing pid has not changed
2310 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2313 priv = cpupid_match_pid(p, last_cpupid);
2314 if (!priv && !(flags & TNF_NO_GROUP))
2315 task_numa_group(p, last_cpupid, flags, &priv);
2319 * If a workload spans multiple NUMA nodes, a shared fault that
2320 * occurs wholly within the set of nodes that the workload is
2321 * actively using should be counted as local. This allows the
2322 * scan rate to slow down when a workload has settled down.
2324 if (!priv && !local && p->numa_group &&
2325 node_isset(cpu_node, p->numa_group->active_nodes) &&
2326 node_isset(mem_node, p->numa_group->active_nodes))
2329 task_numa_placement(p);
2332 * Retry task to preferred node migration periodically, in case it
2333 * previously failed, or the scheduler moved us.
2335 if (time_after(jiffies, p->numa_migrate_retry))
2336 numa_migrate_preferred(p);
2339 p->numa_pages_migrated += pages;
2340 if (flags & TNF_MIGRATE_FAIL)
2341 p->numa_faults_locality[2] += pages;
2343 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2344 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2345 p->numa_faults_locality[local] += pages;
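/*
 * Note: the NUMA_MEMBUF/NUMA_CPUBUF counters above only buffer faults
 * between scans; they are folded into the decayed NUMA_MEM/NUMA_CPU
 * statistics (and then cleared) by task_numa_placement(), as shown
 * earlier in this file.
 */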
2348 static void reset_ptenuma_scan(struct task_struct *p)
2351 * We only did a read acquisition of the mmap sem, so
2352 * p->mm->numa_scan_seq is written to without exclusive access
2353 * and the update is not guaranteed to be atomic. That's not
2354 * much of an issue though, since this is just used for
2355 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2356 * expensive, to avoid any form of compiler optimizations:
2358 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2359 p->mm->numa_scan_offset = 0;
2363 * The expensive part of numa migration is done from task_work context.
2364 * Triggered from task_tick_numa().
2366 void task_numa_work(struct callback_head *work)
2368 unsigned long migrate, next_scan, now = jiffies;
2369 struct task_struct *p = current;
2370 struct mm_struct *mm = p->mm;
2371 struct vm_area_struct *vma;
2372 unsigned long start, end;
2373 unsigned long nr_pte_updates = 0;
2374 long pages, virtpages;
2376 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2378 work->next = work; /* protect against double add */
2380 * Who cares about NUMA placement when they're dying.
2382 * NOTE: make sure not to dereference p->mm before this check,
2383 * exit_task_work() happens _after_ exit_mm() so we could be called
2384 * without p->mm even though we still had it when we enqueued this
2387 if (p->flags & PF_EXITING)
2390 if (!mm->numa_next_scan) {
2391 mm->numa_next_scan = now +
2392 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2396 * Enforce maximal scan/migration frequency..
2398 migrate = mm->numa_next_scan;
2399 if (time_before(now, migrate))
2402 if (p->numa_scan_period == 0) {
2403 p->numa_scan_period_max = task_scan_max(p);
2404 p->numa_scan_period = task_scan_min(p);
2407 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2408 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2412 * Delay this task enough that another task of this mm will likely win
2413 * the next time around.
2415 p->node_stamp += 2 * TICK_NSEC;
2417 start = mm->numa_scan_offset;
2418 pages = sysctl_numa_balancing_scan_size;
2419 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2420 virtpages = pages * 8; /* Scan up to this much virtual space */
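/*
 * Example (assuming the default 256MB scan size and 4KB pages):
 * pages = 256 << 8 = 65536, i.e. 256MB worth of PTE updates, and
 * virtpages = 524288, i.e. up to 2GB of virtual address space may be
 * walked in one scan pass before giving up.
 */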
2425 if (!down_read_trylock(&mm->mmap_sem))
2427 vma = find_vma(mm, start);
2429 reset_ptenuma_scan(p);
2433 for (; vma; vma = vma->vm_next) {
2434 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2435 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2440 * Shared library pages mapped by multiple processes are not
2441 * migrated as it is expected they are cache replicated. Avoid
2442 * hinting faults in read-only file-backed mappings or the vdso
2443 * as migrating the pages will be of marginal benefit.
2446 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2450 * Skip inaccessible VMAs to avoid any confusion between
2451 * PROT_NONE and NUMA hinting ptes
2453 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2457 start = max(start, vma->vm_start);
2458 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2459 end = min(end, vma->vm_end);
2460 nr_pte_updates = change_prot_numa(vma, start, end);
2463 * Try to scan sysctl_numa_balancing_scan_size worth of
2464 * hpages that have at least one present PTE that
2465 * is not already pte-numa. If the VMA contains
2466 * areas that are unused or already full of prot_numa
2467 * PTEs, scan up to virtpages, to skip through those
2471 pages -= (end - start) >> PAGE_SHIFT;
2472 virtpages -= (end - start) >> PAGE_SHIFT;
2475 if (pages <= 0 || virtpages <= 0)
2479 } while (end != vma->vm_end);
2484 * It is possible to reach the end of the VMA list but the last few
2485 * VMAs are not guaranteed to be migratable. If they are not, we
2486 * would find the !migratable VMA on the next scan but not reset the
2487 * scanner to the start so check it now.
2490 mm->numa_scan_offset = start;
2492 reset_ptenuma_scan(p);
2493 up_read(&mm->mmap_sem);
2497 * Drive the periodic memory faults..
2499 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2501 struct callback_head *work = &curr->numa_work;
2505 * We don't care about NUMA placement if we don't have memory.
2507 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2511 * Using runtime rather than walltime has the dual advantage that
2512 * we (mostly) drive the selection from busy threads and that the
2513 * task needs to have done some actual work before we bother with
2516 now = curr->se.sum_exec_runtime;
2517 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2519 if (now > curr->node_stamp + period) {
2520 if (!curr->node_stamp)
2521 curr->numa_scan_period = task_scan_min(curr);
2522 curr->node_stamp += period;
2524 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2525 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2526 task_work_add(curr, work, true);
2531 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2535 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2539 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2542 #endif /* CONFIG_NUMA_BALANCING */
2545 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2547 update_load_add(&cfs_rq->load, se->load.weight);
2548 if (!parent_entity(se))
2549 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2551 if (entity_is_task(se)) {
2552 struct rq *rq = rq_of(cfs_rq);
2554 account_numa_enqueue(rq, task_of(se));
2555 list_add(&se->group_node, &rq->cfs_tasks);
2558 cfs_rq->nr_running++;
2562 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2564 update_load_sub(&cfs_rq->load, se->load.weight);
2565 if (!parent_entity(se))
2566 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2567 if (entity_is_task(se)) {
2568 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2569 list_del_init(&se->group_node);
2571 cfs_rq->nr_running--;
2574 #ifdef CONFIG_FAIR_GROUP_SCHED
2576 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2578 long tg_weight, load, shares;
2581 * This really should be: cfs_rq->avg.load_avg, but instead we use
2582 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2583 * the shares for small weight interactive tasks.
2585 load = scale_load_down(cfs_rq->load.weight);
2587 tg_weight = atomic_long_read(&tg->load_avg);
2589 /* Ensure tg_weight >= load */
2590 tg_weight -= cfs_rq->tg_load_avg_contrib;
2593 shares = (tg->shares * load);
2595 shares /= tg_weight;
2597 if (shares < MIN_SHARES)
2598 shares = MIN_SHARES;
2599 if (shares > tg->shares)
2600 shares = tg->shares;
2604 # else /* CONFIG_SMP */
2605 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2609 # endif /* CONFIG_SMP */
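/*
 * Worked example for calc_cfs_shares() above (illustrative values only):
 * with tg->shares = 1024 and this cfs_rq carrying half of the group's
 * total weight (load = 512, tg_weight = 1024), shares = 1024 * 512 / 1024
 * = 512. The result is then clamped to the [MIN_SHARES, tg->shares]
 * range, so a cfs_rq with a negligible share of the group load still
 * receives MIN_SHARES.
 */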
2611 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2612 unsigned long weight)
2615 /* commit outstanding execution time */
2616 if (cfs_rq->curr == se)
2617 update_curr(cfs_rq);
2618 account_entity_dequeue(cfs_rq, se);
2621 update_load_set(&se->load, weight);
2624 account_entity_enqueue(cfs_rq, se);
2627 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2629 static void update_cfs_shares(struct sched_entity *se)
2631 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2632 struct task_group *tg;
2638 if (throttled_hierarchy(cfs_rq))
2644 if (likely(se->load.weight == tg->shares))
2647 shares = calc_cfs_shares(cfs_rq, tg);
2649 reweight_entity(cfs_rq_of(se), se, shares);
2652 #else /* CONFIG_FAIR_GROUP_SCHED */
2653 static inline void update_cfs_shares(struct sched_entity *se)
2656 #endif /* CONFIG_FAIR_GROUP_SCHED */
2659 u32 sched_get_wake_up_idle(struct task_struct *p)
2661 u32 enabled = p->flags & PF_WAKE_UP_IDLE;
2665 EXPORT_SYMBOL(sched_get_wake_up_idle);
2667 int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
2669 int enable = !!wake_up_idle;
2672 p->flags |= PF_WAKE_UP_IDLE;
2674 p->flags &= ~PF_WAKE_UP_IDLE;
2678 EXPORT_SYMBOL(sched_set_wake_up_idle);
2680 static const u32 runnable_avg_yN_inv[] = {
2681 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2682 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2683 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2684 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2685 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2686 0x85aac367, 0x82cd8698,
2690 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2691 * over-estimates when re-combining.
2693 static const u32 runnable_avg_yN_sum[] = {
2694 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2695 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2696 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2701 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2703 static __always_inline u64 decay_load(u64 val, u64 n)
2705 unsigned int local_n;
2709 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2712 /* after bounds checking we can collapse to 32-bit */
2716 * As y^PERIOD = 1/2, we can combine
2717 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2718 * With a look-up table which covers y^n (n<PERIOD)
2720 * To achieve constant time decay_load.
2722 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2723 val >>= local_n / LOAD_AVG_PERIOD;
2724 local_n %= LOAD_AVG_PERIOD;
2727 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
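/*
 * Example: decay_load(val, 40) with LOAD_AVG_PERIOD = 32 halves val once
 * (val >>= 40 / 32) and then multiplies by y^8 ~= 0.84 via the inverse
 * table, giving a combined factor of ~0.42, i.e. 0.5^(40/32) as intended.
 */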
2732 * For updates fully spanning n periods, the contribution to runnable
2733 * average will be: \Sum 1024*y^n
2735 * We can compute this reasonably efficiently by combining:
2736 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2738 static u32 __compute_runnable_contrib(u64 n)
2742 if (likely(n <= LOAD_AVG_PERIOD))
2743 return runnable_avg_yN_sum[n];
2744 else if (unlikely(n >= LOAD_AVG_MAX_N))
2745 return LOAD_AVG_MAX;
2747 /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
2749 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2750 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2752 n -= LOAD_AVG_PERIOD;
2753 } while (n > LOAD_AVG_PERIOD);
2755 contrib = decay_load(contrib, n);
2756 return contrib + runnable_avg_yN_sum[n];
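/*
 * Example: __compute_runnable_contrib(40) takes one pass through the loop
 * (contrib = runnable_avg_yN_sum[32] = 23371, n becomes 8), decays that
 * by y^8 and adds runnable_avg_yN_sum[8] = 7437, approximating
 * \Sum 1024*y^k for 1 <= k <= 40.
 */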
2759 #ifdef CONFIG_SCHED_HMP
2761 /* CPU selection flag */
2762 #define SBC_FLAG_PREV_CPU 0x1
2763 #define SBC_FLAG_BEST_CAP_CPU 0x2
2764 #define SBC_FLAG_CPU_COST 0x4
2765 #define SBC_FLAG_MIN_COST 0x8
2766 #define SBC_FLAG_IDLE_LEAST_LOADED 0x10
2767 #define SBC_FLAG_IDLE_CSTATE 0x20
2768 #define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
2769 #define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
2770 #define SBC_FLAG_CSTATE_LOAD 0x100
2771 #define SBC_FLAG_BEST_SIBLING 0x200
2772 #define SBC_FLAG_WAKER_CPU 0x400
2773 #define SBC_FLAG_PACK_TASK 0x800
2775 /* Cluster selection flag */
2776 #define SBC_FLAG_COLOC_CLUSTER 0x10000
2777 #define SBC_FLAG_WAKER_CLUSTER 0x20000
2778 #define SBC_FLAG_BACKUP_CLUSTER 0x40000
2779 #define SBC_FLAG_BOOST_CLUSTER 0x80000
2781 struct cpu_select_env {
2782 struct task_struct *p;
2783 struct related_thread_group *rtg;
2786 u8 need_waker_cluster:1;
2788 enum sched_boost_policy boost_policy;
2791 DECLARE_BITMAP(candidate_list, NR_CPUS);
2792 DECLARE_BITMAP(backup_list, NR_CPUS);
2796 u32 sbc_best_cluster_flag;
2797 struct cpumask search_cpus;
2800 struct cluster_cpu_stats {
2801 int best_idle_cpu, least_loaded_cpu;
2802 int best_capacity_cpu, best_cpu, best_sibling_cpu;
2803 int min_cost, best_sibling_cpu_cost;
2804 int best_cpu_wakeup_latency;
2805 u64 min_load, best_load, best_sibling_cpu_load;
2806 s64 highest_spare_capacity;
2810 * Should the task be woken to any available idle cpu?
2812 * Waking tasks to an idle cpu has mixed implications for both performance and
2813 * power. In many cases, the scheduler can't correctly estimate the impact of
2814 * using idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an
2815 * external kernel module to pass a strong hint to the scheduler that the task
2816 * in question should be woken to an idle cpu, generally to improve performance.
2818 static inline int wake_to_idle(struct task_struct *p)
2820 return (current->flags & PF_WAKE_UP_IDLE) ||
2821 (p->flags & PF_WAKE_UP_IDLE);
2824 static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
2828 total_load = env->task_load + env->cpu_load;
2830 if (total_load > sched_spill_load ||
2831 (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
2837 static int skip_cpu(int cpu, struct cpu_select_env *env)
2839 int tcpu = task_cpu(env->p);
2845 if (is_reserved(cpu))
2848 switch (env->reason) {
2850 skip = !idle_cpu(cpu);
2852 case IRQLOAD_MIGRATION:
2853 /* Purposely fall through */
2855 skip = (cpu == tcpu);
2863 acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
2870 tcpu = task_cpu(env->p);
2871 switch (env->reason) {
2873 return cluster->capacity > cpu_capacity(tcpu);
2875 case DOWN_MIGRATION:
2876 return cluster->capacity < cpu_capacity(tcpu);
2886 skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
2888 if (!test_bit(cluster->id, env->candidate_list))
2891 if (!acceptable_capacity(cluster, env)) {
2892 __clear_bit(cluster->id, env->candidate_list);
2899 static struct sched_cluster *
2900 select_least_power_cluster(struct cpu_select_env *env)
2902 struct sched_cluster *cluster;
2905 int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
2907 env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
2909 if (task_load_will_fit(env->p, env->task_load,
2910 cpu, env->boost_policy)) {
2911 env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
2913 if (env->boost_policy == SCHED_BOOST_NONE)
2914 return env->rtg->preferred_cluster;
2916 for_each_sched_cluster(cluster) {
2917 if (cluster != env->rtg->preferred_cluster) {
2918 __set_bit(cluster->id,
2920 __clear_bit(cluster->id,
2921 env->candidate_list);
2925 return env->rtg->preferred_cluster;
2929 * Since the task load does not fit on the preferred
2930 * cluster anymore, pretend that the task does not
2931 * have any preferred cluster. This allows the waking
2932 * task to get the appropriate CPU it needs as per the
2933 * non co-location placement policy without having to
2934 * wait until the preferred cluster is updated.
2939 for_each_sched_cluster(cluster) {
2940 if (!skip_cluster(cluster, env)) {
2941 int cpu = cluster_first_cpu(cluster);
2943 env->task_load = scale_load_to_cpu(task_load(env->p),
2945 if (task_load_will_fit(env->p, env->task_load, cpu,
2949 __set_bit(cluster->id, env->backup_list);
2950 __clear_bit(cluster->id, env->candidate_list);
2957 static struct sched_cluster *
2958 next_candidate(const unsigned long *list, int start, int end)
2962 cluster_id = find_next_bit(list, end, start);
2963 if (cluster_id >= end)
2966 return sched_cluster[cluster_id];
2970 update_spare_capacity(struct cluster_cpu_stats *stats,
2971 struct cpu_select_env *env, int cpu, int capacity,
2974 s64 spare_capacity = sched_ravg_window - cpu_load;
2976 if (spare_capacity > 0 &&
2977 (spare_capacity > stats->highest_spare_capacity ||
2978 (spare_capacity == stats->highest_spare_capacity &&
2979 ((!env->need_waker_cluster &&
2980 capacity > cpu_capacity(stats->best_capacity_cpu)) ||
2981 (env->need_waker_cluster &&
2982 cpu_rq(cpu)->nr_running <
2983 cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
2985 * If the sync waker is the only runnable task on its CPU, the
2986 * CPU's cr_avg is 0, so there is a high chance of placing the
2987 * wakee on the waker's CPU, which would likely preempt the waker.
2988 * That can lead to migration of the preempted waker. Place the
2989 * wakee on a truly idle CPU when possible, by checking
2990 * nr_running, to avoid such preemption.
2992 stats->highest_spare_capacity = spare_capacity;
2993 stats->best_capacity_cpu = cpu;
2997 static inline void find_backup_cluster(
2998 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3000 struct sched_cluster *next = NULL;
3002 struct cpumask search_cpus;
3004 while (!bitmap_empty(env->backup_list, num_clusters)) {
3005 next = next_candidate(env->backup_list, 0, num_clusters);
3006 __clear_bit(next->id, env->backup_list);
3008 cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
3009 for_each_cpu(i, &search_cpus) {
3010 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3011 sched_irqload(i), power_cost(i, task_load(env->p) +
3012 cpu_cravg_sync(i, env->sync)), 0);
3014 update_spare_capacity(stats, env, i, next->capacity,
3015 cpu_load_sync(i, env->sync));
3017 env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
3021 struct sched_cluster *
3022 next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
3023 struct cluster_cpu_stats *stats)
3025 struct sched_cluster *next = NULL;
3027 __clear_bit(cluster->id, env->candidate_list);
3029 if (env->rtg && preferred_cluster(cluster, env->p))
3033 if (bitmap_empty(env->candidate_list, num_clusters))
3036 next = next_candidate(env->candidate_list, 0, num_clusters);
3038 if (next->min_power_cost > stats->min_cost) {
3039 clear_bit(next->id, env->candidate_list);
3044 if (skip_cluster(next, env))
3049 env->task_load = scale_load_to_cpu(task_load(env->p),
3050 cluster_first_cpu(next));
3054 #ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
3055 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3056 struct cpu_select_env *env, int cpu_cost)
3059 int prev_cpu = env->prev_cpu;
3061 wakeup_latency = cpu_rq(cpu)->wakeup_latency;
3063 if (env->need_idle) {
3064 stats->min_cost = cpu_cost;
3065 if (idle_cpu(cpu)) {
3066 if (wakeup_latency < stats->best_cpu_wakeup_latency ||
3067 (wakeup_latency == stats->best_cpu_wakeup_latency &&
3069 stats->best_idle_cpu = cpu;
3070 stats->best_cpu_wakeup_latency = wakeup_latency;
3073 if (env->cpu_load < stats->min_load ||
3074 (env->cpu_load == stats->min_load &&
3076 stats->least_loaded_cpu = cpu;
3077 stats->min_load = env->cpu_load;
3084 if (cpu_cost < stats->min_cost) {
3085 stats->min_cost = cpu_cost;
3086 stats->best_cpu_wakeup_latency = wakeup_latency;
3087 stats->best_load = env->cpu_load;
3088 stats->best_cpu = cpu;
3089 env->sbc_best_flag = SBC_FLAG_CPU_COST;
3093 /* CPU cost is the same. Start breaking the tie by C-state */
3095 if (wakeup_latency > stats->best_cpu_wakeup_latency)
3098 if (wakeup_latency < stats->best_cpu_wakeup_latency) {
3099 stats->best_cpu_wakeup_latency = wakeup_latency;
3100 stats->best_load = env->cpu_load;
3101 stats->best_cpu = cpu;
3102 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
3106 /* C-state is the same. Use prev CPU to break the tie */
3107 if (cpu == prev_cpu) {
3108 stats->best_cpu = cpu;
3109 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
3113 if (stats->best_cpu != prev_cpu &&
3114 ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
3115 (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
3116 stats->best_load = env->cpu_load;
3117 stats->best_cpu = cpu;
3118 env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
3121 #else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3122 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3123 struct cpu_select_env *env, int cpu_cost)
3125 int prev_cpu = env->prev_cpu;
3127 if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
3128 if (stats->best_sibling_cpu_cost > cpu_cost ||
3129 (stats->best_sibling_cpu_cost == cpu_cost &&
3130 stats->best_sibling_cpu_load > env->cpu_load)) {
3131 stats->best_sibling_cpu_cost = cpu_cost;
3132 stats->best_sibling_cpu_load = env->cpu_load;
3133 stats->best_sibling_cpu = cpu;
3137 if ((cpu_cost < stats->min_cost) ||
3138 ((stats->best_cpu != prev_cpu &&
3139 stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
3140 if (env->need_idle) {
3141 if (idle_cpu(cpu)) {
3142 stats->min_cost = cpu_cost;
3143 stats->best_idle_cpu = cpu;
3146 stats->min_cost = cpu_cost;
3147 stats->min_load = env->cpu_load;
3148 stats->best_cpu = cpu;
3149 env->sbc_best_flag = SBC_FLAG_MIN_COST;
3153 #endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3155 static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3156 struct cpu_select_env *env)
3161 * We try to find the least loaded *busy* CPU irrespective
3162 * of the power cost.
3165 cpu_cost = cpu_min_power_cost(cpu);
3168 cpu_cost = power_cost(cpu, task_load(env->p) +
3169 cpu_cravg_sync(cpu, env->sync));
3171 if (cpu_cost <= stats->min_cost)
3172 __update_cluster_stats(cpu, stats, env, cpu_cost);
3175 static void find_best_cpu_in_cluster(struct sched_cluster *c,
3176 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3179 struct cpumask search_cpus;
3181 cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
3183 env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
3185 for_each_cpu(i, &search_cpus) {
3186 env->cpu_load = cpu_load_sync(i, env->sync);
3188 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3190 power_cost(i, task_load(env->p) +
3191 cpu_cravg_sync(i, env->sync)), 0);
3193 if (skip_cpu(i, env))
3196 update_spare_capacity(stats, env, i, c->capacity,
3200 * need_idle takes precedence over sched boost, but when both
3201 * are set, the idlest CPU within all the clusters is selected
3202 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the
3203 * big cluster is selected when boost_policy = BOOST_ON_BIG.
3205 if ((!env->need_idle &&
3206 env->boost_policy != SCHED_BOOST_NONE) ||
3207 env->need_waker_cluster ||
3208 sched_cpu_high_irqload(i) ||
3209 spill_threshold_crossed(env, cpu_rq(i)))
3212 update_cluster_stats(i, stats, env);
3216 static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
3218 stats->best_cpu = stats->best_idle_cpu = -1;
3219 stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
3220 stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
3221 stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
3222 stats->highest_spare_capacity = 0;
3223 stats->least_loaded_cpu = -1;
3224 stats->best_cpu_wakeup_latency = INT_MAX;
3225 /* No need to initialize stats->best_load */
3228 static inline bool env_has_special_flags(struct cpu_select_env *env)
3230 if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
3238 bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3241 struct task_struct *task = env->p;
3242 struct sched_cluster *cluster;
3244 if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
3247 prev_cpu = env->prev_cpu;
3248 if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
3251 if (task->ravg.mark_start - task->last_cpu_selected_ts >=
3252 sched_long_cpu_selection_threshold)
3256 * This function should be used by the task wakeup path only, as it
3257 * assumes p->last_switch_out_ts is the last sleep time.
3258 * p->last_switch_out_ts can denote the last preemption time as well as
3261 if (task->ravg.mark_start - task->last_switch_out_ts >=
3262 sched_short_sleep_task_threshold)
3265 env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
3266 cluster = cpu_rq(prev_cpu)->cluster;
3268 if (!task_load_will_fit(task, env->task_load, prev_cpu,
3269 sched_boost_policy())) {
3271 __set_bit(cluster->id, env->backup_list);
3272 __clear_bit(cluster->id, env->candidate_list);
3276 env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
3277 if (sched_cpu_high_irqload(prev_cpu) ||
3278 spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
3279 update_spare_capacity(stats, env, prev_cpu,
3280 cluster->capacity, env->cpu_load);
3281 cpumask_clear_cpu(prev_cpu, &env->search_cpus);
3289 wake_to_waker_cluster(struct cpu_select_env *env)
3292 task_load(current) > sched_big_waker_task_load &&
3293 task_load(env->p) < sched_small_wakee_task_load;
3297 bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
3299 return sysctl_sched_prefer_sync_wakee_to_waker &&
3300 cpu_rq(cpu)->nr_running == 1 &&
3301 cpumask_test_cpu(cpu, &env->search_cpus);
3305 cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
3307 return cpumask_intersects(&env->search_cpus, &cluster->cpus);
3310 /* return cheapest cpu that can fit this task */
3311 static int select_best_cpu(struct task_struct *p, int target, int reason,
3314 struct sched_cluster *cluster, *pref_cluster = NULL;
3315 struct cluster_cpu_stats stats;
3316 struct related_thread_group *grp;
3317 unsigned int sbc_flag = 0;
3318 int cpu = raw_smp_processor_id();
3321 struct cpu_select_env env = {
3324 .need_idle = wake_to_idle(p),
3325 .need_waker_cluster = 0,
3330 .sbc_best_cluster_flag = 0,
3334 env.boost_policy = task_sched_boost(p) ?
3335 sched_boost_policy() : SCHED_BOOST_NONE;
3337 bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
3338 bitmap_zero(env.backup_list, NR_CPUS);
3340 cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
3341 cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
3343 init_cluster_cpu_stats(&stats);
3344 special = env_has_special_flags(&env);
3348 grp = task_related_thread_group(p);
3350 if (grp && grp->preferred_cluster) {
3351 pref_cluster = grp->preferred_cluster;
3352 if (!cluster_allowed(&env, pref_cluster))
3353 clear_bit(pref_cluster->id, env.candidate_list);
3356 } else if (!special) {
3357 cluster = cpu_rq(cpu)->cluster;
3358 if (wake_to_waker_cluster(&env)) {
3359 if (bias_to_waker_cpu(&env, cpu)) {
3361 sbc_flag = SBC_FLAG_WAKER_CLUSTER |
3364 } else if (cluster_allowed(&env, cluster)) {
3365 env.need_waker_cluster = 1;
3366 bitmap_zero(env.candidate_list, NR_CPUS);
3367 __set_bit(cluster->id, env.candidate_list);
3368 env.sbc_best_cluster_flag =
3369 SBC_FLAG_WAKER_CLUSTER;
3371 } else if (bias_to_prev_cpu(&env, &stats)) {
3372 sbc_flag = SBC_FLAG_PREV_CPU;
3377 if (!special && is_short_burst_task(p)) {
3378 env.pack_task = true;
3379 sbc_flag = SBC_FLAG_PACK_TASK;
3382 cluster = select_least_power_cluster(&env);
3388 * 'cluster' now points to the minimum power cluster which can satisfy
3389 * the task's perf goals. Walk down the cluster list starting with that
3390 * cluster. For non-small tasks, skip clusters that don't have
3391 * mostly_idle/idle cpus.
3395 find_best_cpu_in_cluster(cluster, &env, &stats);
3397 } while ((cluster = next_best_cluster(cluster, &env, &stats)));
3399 if (env.need_idle) {
3400 if (stats.best_idle_cpu >= 0) {
3401 target = stats.best_idle_cpu;
3402 sbc_flag |= SBC_FLAG_IDLE_CSTATE;
3403 } else if (stats.least_loaded_cpu >= 0) {
3404 target = stats.least_loaded_cpu;
3405 sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
3407 } else if (stats.best_cpu >= 0) {
3408 if (stats.best_sibling_cpu >= 0 &&
3409 stats.best_cpu != task_cpu(p) &&
3410 stats.min_cost == stats.best_sibling_cpu_cost) {
3411 stats.best_cpu = stats.best_sibling_cpu;
3412 sbc_flag |= SBC_FLAG_BEST_SIBLING;
3414 sbc_flag |= env.sbc_best_flag;
3415 target = stats.best_cpu;
3417 if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
3423 * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
3424 * backup_list = little cluster, candidate_list = none and
3425 * stats->best_capacity_cpu points the best spare capacity
3426 * CPU among the CPUs in the big cluster.
3428 if (env.boost_policy == SCHED_BOOST_ON_BIG &&
3429 stats.best_capacity_cpu >= 0)
3430 sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
3432 find_backup_cluster(&env, &stats);
3434 if (stats.best_capacity_cpu >= 0) {
3435 target = stats.best_capacity_cpu;
3436 sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
3439 p->last_cpu_selected_ts = sched_ktime_clock();
3441 sbc_flag |= env.sbc_best_cluster_flag;
3443 trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
3444 env.reason, env.sync, env.need_idle, sbc_flag, target);
3448 #ifdef CONFIG_CFS_BANDWIDTH
3450 static inline struct task_group *next_task_group(struct task_group *tg)
3452 tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
3454 return (&tg->list == &task_groups) ? NULL : tg;
3457 /* Iterate over all cfs_rq in a cpu */
3458 #define for_each_cfs_rq(cfs_rq, tg, cpu) \
3459 for (tg = container_of(&task_groups, struct task_group, list); \
3460 ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
3462 void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
3464 struct task_group *tg;
3465 struct cfs_rq *cfs_rq;
3469 for_each_cfs_rq(cfs_rq, tg, cpu)
3470 reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
3475 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
3477 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3478 struct task_struct *p, int change_cra);
3479 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3480 struct task_struct *p, int change_cra);
3482 /* Add task's contribution to a cpu's HMP statistics */
3483 void _inc_hmp_sched_stats_fair(struct rq *rq,
3484 struct task_struct *p, int change_cra)
3486 struct cfs_rq *cfs_rq;
3487 struct sched_entity *se = &p->se;
3490 * Although the check below is not strictly required (as
3491 * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg, called
3492 * from inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
3493 * efficiency by short-circuiting the for_each_sched_entity() loop when
3494 * sched_disable_window_stats
3496 if (sched_disable_window_stats)
3499 for_each_sched_entity(se) {
3500 cfs_rq = cfs_rq_of(se);
3501 inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3502 if (cfs_rq_throttled(cfs_rq))
3506 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3508 inc_rq_hmp_stats(rq, p, change_cra);
3511 /* Remove task's contribution from a cpu's HMP statistics */
3513 _dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
3515 struct cfs_rq *cfs_rq;
3516 struct sched_entity *se = &p->se;
3518 /* See comment on efficiency in _inc_hmp_sched_stats_fair */
3519 if (sched_disable_window_stats)
3522 for_each_sched_entity(se) {
3523 cfs_rq = cfs_rq_of(se);
3524 dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3525 if (cfs_rq_throttled(cfs_rq))
3529 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3531 dec_rq_hmp_stats(rq, p, change_cra);
3534 static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3536 _inc_hmp_sched_stats_fair(rq, p, 1);
3539 static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3541 _dec_hmp_sched_stats_fair(rq, p, 1);
3544 static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3545 u32 new_task_load, u32 new_pred_demand)
3547 struct cfs_rq *cfs_rq;
3548 struct sched_entity *se = &p->se;
3549 s64 task_load_delta = (s64)new_task_load - task_load(p);
3550 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3552 for_each_sched_entity(se) {
3553 cfs_rq = cfs_rq_of(se);
3555 fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
3558 fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
3559 if (cfs_rq_throttled(cfs_rq))
3563 /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
3565 fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
3568 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3572 static int task_will_be_throttled(struct task_struct *p);
3574 #else /* CONFIG_CFS_BANDWIDTH */
3576 inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
3579 inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3581 inc_nr_big_task(&rq->hmp_stats, p);
3582 inc_cumulative_runnable_avg(&rq->hmp_stats, p);
3586 dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
3588 dec_nr_big_task(&rq->hmp_stats, p);
3589 dec_cumulative_runnable_avg(&rq->hmp_stats, p);
3592 fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3593 u32 new_task_load, u32 new_pred_demand)
3595 s64 task_load_delta = (s64)new_task_load - task_load(p);
3596 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3598 fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
3600 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3603 static inline int task_will_be_throttled(struct task_struct *p)
3608 void _inc_hmp_sched_stats_fair(struct rq *rq,
3609 struct task_struct *p, int change_cra)
3611 inc_nr_big_task(&rq->hmp_stats, p);
3614 #endif /* CONFIG_CFS_BANDWIDTH */
3617 * Reset balance_interval at all sched_domain levels of given cpu, so that it
3620 static inline void reset_balance_interval(int cpu)
3622 struct sched_domain *sd;
3624 if (cpu >= nr_cpu_ids)
3628 for_each_domain(cpu, sd)
3629 sd->balance_interval = 0;
3634 * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal
3635 * cpu as per its demand or priority).
3637 * Returns the reason why the task needs to be migrated.
3639 static inline int migration_needed(struct task_struct *p, int cpu)
3642 struct related_thread_group *grp;
3644 if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
3647 /* No need to migrate task that is about to be throttled */
3648 if (task_will_be_throttled(p))
3651 if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
3652 cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
3653 return UP_MIGRATION;
3655 if (sched_cpu_high_irqload(cpu))
3656 return IRQLOAD_MIGRATION;
3658 nice = task_nice(p);
3660 grp = task_related_thread_group(p);
3662 * Don't assume higher capacity means higher power. If the task
3663 * is running on the power efficient CPU, avoid migrating it
3664 * to a lower capacity cluster.
3666 if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
3667 upmigrate_discouraged(p)) &&
3668 cpu_capacity(cpu) > min_capacity &&
3669 cpu_max_power_cost(cpu) == max_power_cost) {
3671 return DOWN_MIGRATION;
3674 if (!task_will_fit(p, cpu)) {
3676 return UP_MIGRATION;
3684 kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
3686 unsigned long flags;
3689 /* Invoke active balance to force migrate currently running task */
3690 raw_spin_lock_irqsave(&rq->lock, flags);
3691 if (!rq->active_balance) {
3692 rq->active_balance = 1;
3693 rq->push_cpu = new_cpu;
3698 raw_spin_unlock_irqrestore(&rq->lock, flags);
3703 static DEFINE_RAW_SPINLOCK(migration_lock);
3705 static bool do_migration(int reason, int new_cpu, int cpu)
3707 if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
3708 && same_cluster(new_cpu, cpu))
3711 /* Inter cluster high irqload migrations are OK */
3712 return new_cpu != cpu;
3716 * Check if currently running task should be migrated to a better cpu.
3718 * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
3720 void check_for_migration(struct rq *rq, struct task_struct *p)
3722 int cpu = cpu_of(rq), new_cpu;
3723 int active_balance = 0, reason;
3725 reason = migration_needed(p, cpu);
3729 raw_spin_lock(&migration_lock);
3730 new_cpu = select_best_cpu(p, cpu, reason, 0);
3732 if (do_migration(reason, new_cpu, cpu)) {
3733 active_balance = kick_active_balance(rq, p, new_cpu);
3735 mark_reserved(new_cpu);
3738 raw_spin_unlock(&migration_lock);
3741 stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
3742 &rq->active_balance_work);
3745 #ifdef CONFIG_CFS_BANDWIDTH
3747 static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
3749 cfs_rq->hmp_stats.nr_big_tasks = 0;
3750 cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
3751 cfs_rq->hmp_stats.pred_demands_sum = 0;
3754 static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3755 struct task_struct *p, int change_cra)
3757 inc_nr_big_task(&cfs_rq->hmp_stats, p);
3759 inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3762 static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3763 struct task_struct *p, int change_cra)
3765 dec_nr_big_task(&cfs_rq->hmp_stats, p);
3767 dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3770 static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3771 struct cfs_rq *cfs_rq)
3773 stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
3774 stats->cumulative_runnable_avg +=
3775 cfs_rq->hmp_stats.cumulative_runnable_avg;
3776 stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
3779 static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3780 struct cfs_rq *cfs_rq)
3782 stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
3783 stats->cumulative_runnable_avg -=
3784 cfs_rq->hmp_stats.cumulative_runnable_avg;
3785 stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
3787 BUG_ON(stats->nr_big_tasks < 0 ||
3788 (s64)stats->cumulative_runnable_avg < 0);
3789 BUG_ON((s64)stats->pred_demands_sum < 0);
3792 #else /* CONFIG_CFS_BANDWIDTH */
3794 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3795 struct task_struct *p, int change_cra) { }
3797 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3798 struct task_struct *p, int change_cra) { }
3800 #endif /* CONFIG_CFS_BANDWIDTH */
3802 #else /* CONFIG_SCHED_HMP */
3804 static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
3806 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3807 struct task_struct *p, int change_cra) { }
3809 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3810 struct task_struct *p, int change_cra) { }
3812 #define dec_throttled_cfs_rq_hmp_stats(...)
3813 #define inc_throttled_cfs_rq_hmp_stats(...)
3815 #endif /* CONFIG_SCHED_HMP */
3817 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
3818 #error "load tracking assumes 2^10 as unit"
3821 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
3824 * We can represent the historical contribution to runnable average as the
3825 * coefficients of a geometric series. To do this we sub-divide our runnable
3826 * history into segments of approximately 1ms (1024us); label the segment that
3827 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3829 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3831 * (now) (~1ms ago) (~2ms ago)
3833 * Let u_i denote the fraction of p_i that the entity was runnable.
3835 * We then designate the fractions u_i as our coefficients, yielding the
3836 * following representation of historical load:
3837 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3839 * We choose y based on the width of a reasonable scheduling period, fixing:
3842 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3843 * approximately half as much as the contribution to load within the last ms
3846 * When a period "rolls over" and we have a new u_0`, multiplying the previous
3847 * sum again by y is sufficient to update:
3848 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3849 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
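 *
 * As a quick sanity check on the constants used below: with y^32 = 1/2,
 * y ~= 0.97857, so a fully runnable 1024us period contributes 1024 in the
 * current window, ~1002 one period later and ~512 after 32 periods, which
 * matches the runnable_avg_yN_sum[]/runnable_avg_yN_inv[] tables above.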
3851 static __always_inline int
3852 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
3853 unsigned long weight, int running, struct cfs_rq *cfs_rq)
3855 u64 delta, scaled_delta, periods;
3857 unsigned int delta_w, scaled_delta_w, decayed = 0;
3858 unsigned long scale_freq, scale_cpu;
3860 delta = now - sa->last_update_time;
3862 * This should only happen when time goes backwards, which it
3863 * unfortunately does during sched clock init when we swap over to TSC.
3865 if ((s64)delta < 0) {
3866 sa->last_update_time = now;
3871 * Use 1024ns as the unit of measurement since it's a reasonable
3872 * approximation of 1us and fast to compute.
3877 sa->last_update_time = now;
3879 scale_freq = arch_scale_freq_capacity(NULL, cpu);
3880 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3881 trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
3883 /* delta_w is the amount already accumulated against our next period */
3884 delta_w = sa->period_contrib;
3885 if (delta + delta_w >= 1024) {
3888 /* whatever is left over for the next period starts from scratch; we don't know it yet */
3889 sa->period_contrib = 0;
3892 * Now that we know we're crossing a period boundary, figure
3893 * out how much from delta we need to complete the current
3894 * period and accrue it.
3896 delta_w = 1024 - delta_w;
3897 scaled_delta_w = cap_scale(delta_w, scale_freq);
3899 sa->load_sum += weight * scaled_delta_w;
3901 cfs_rq->runnable_load_sum +=
3902 weight * scaled_delta_w;
3906 sa->util_sum += scaled_delta_w * scale_cpu;
3910 /* Figure out how many additional periods this update spans */
3911 periods = delta / 1024;
3914 sa->load_sum = decay_load(sa->load_sum, periods + 1);
3916 cfs_rq->runnable_load_sum =
3917 decay_load(cfs_rq->runnable_load_sum, periods + 1);
3919 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
3921 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
3922 contrib = __compute_runnable_contrib(periods);
3923 contrib = cap_scale(contrib, scale_freq);
3925 sa->load_sum += weight * contrib;
3927 cfs_rq->runnable_load_sum += weight * contrib;
3930 sa->util_sum += contrib * scale_cpu;
3933 /* Remainder of delta accrued against u_0` */
3934 scaled_delta = cap_scale(delta, scale_freq);
3936 sa->load_sum += weight * scaled_delta;
3938 cfs_rq->runnable_load_sum += weight * scaled_delta;
3942 sa->util_sum += scaled_delta * scale_cpu;
3944 sa->period_contrib += delta;
3947 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
3949 cfs_rq->runnable_load_avg =
3950 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
3952 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
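/*
 * Note (informal): load_sum/util_sum saturate at the geometric series
 * limit LOAD_AVG_MAX (~47742 for 1024us steps with y^32 = 1/2), so the
 * divisions above make load_avg converge towards the entity's weight, and
 * util_avg towards the CPU capacity, when the entity is always running.
 */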
3959 * Signed add and clamp on underflow.
3961 * Explicitly do a load-store to ensure the intermediate value never hits
3962 * memory. This allows lockless observations without ever seeing the negative
3965 #define add_positive(_ptr, _val) do { \
3966 typeof(_ptr) ptr = (_ptr); \
3967 typeof(_val) val = (_val); \
3968 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3972 if (val < 0 && res > var) \
3975 WRITE_ONCE(*ptr, res); \
3978 #ifdef CONFIG_FAIR_GROUP_SCHED
3980 * update_tg_load_avg - update the tg's load avg
3981 * @cfs_rq: the cfs_rq whose avg changed
3982 * @force: update regardless of how small the difference
3984 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3985 * However, because tg->load_avg is a global value there are performance
3988 * In order to avoid having to look at the other cfs_rq's, we use a
3989 * differential update where we store the last value we propagated. This in
3990 * turn allows skipping updates if the differential is 'small'.
3992 * Updating tg's load_avg is necessary before update_cfs_share() (which is
3993 * done) and effective_load() (which is not done because it is too costly).
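 *
 * With the 1/64 threshold used below, the global tg->load_avg is only
 * touched once this cfs_rq's load_avg has drifted by more than roughly
 * 1.5% from the value it last contributed, bounding cross-CPU cacheline
 * traffic on the shared counter.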
3995 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3997 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4000 * No need to update load_avg for root_task_group as it is not used.
4002 if (cfs_rq->tg == &root_task_group)
4005 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4006 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4007 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4012 * Called within set_task_rq() right before setting a task's cpu. The
4013 * caller only guarantees p->pi_lock is held; no other assumptions,
4014 * including the state of rq->lock, should be made.
4016 void set_task_rq_fair(struct sched_entity *se,
4017 struct cfs_rq *prev, struct cfs_rq *next)
4019 if (!sched_feat(ATTACH_AGE_LOAD))
4023 * We are supposed to update the task to "current" time, so that it is up to
4024 * date and ready to go to the new CPU/cfs_rq. But we have difficulty
4025 * getting what the current time is, so simply throw away the out-of-date
4026 * time. This results in the wakee task being less decayed, but giving
4027 * the wakee more load is not a bad thing.
4029 if (se->avg.last_update_time && prev) {
4030 u64 p_last_update_time;
4031 u64 n_last_update_time;
4033 #ifndef CONFIG_64BIT
4034 u64 p_last_update_time_copy;
4035 u64 n_last_update_time_copy;
4038 p_last_update_time_copy = prev->load_last_update_time_copy;
4039 n_last_update_time_copy = next->load_last_update_time_copy;
4043 p_last_update_time = prev->avg.last_update_time;
4044 n_last_update_time = next->avg.last_update_time;
4046 } while (p_last_update_time != p_last_update_time_copy ||
4047 n_last_update_time != n_last_update_time_copy);
4049 p_last_update_time = prev->avg.last_update_time;
4050 n_last_update_time = next->avg.last_update_time;
4052 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
4053 &se->avg, 0, 0, NULL);
4054 se->avg.last_update_time = n_last_update_time;
4058 /* Take into account change of utilization of a child task group */
4060 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
4062 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4063 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
4065 /* Nothing to update */
4069 /* Set new sched_entity's utilization */
4070 se->avg.util_avg = gcfs_rq->avg.util_avg;
4071 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
4073 /* Update parent cfs_rq utilization */
4074 add_positive(&cfs_rq->avg.util_avg, delta);
4075 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
4078 /* Take into account change of load of a child task group */
4080 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
4082 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4083 long delta, load = gcfs_rq->avg.load_avg;
4086 * If the load of group cfs_rq is null, the load of the
4087 * sched_entity will also be null so we can skip the formula
4092 /* Get tg's load and ensure tg_load > 0 */
4093 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
4095 /* Ensure tg_load >= load and is updated with the current load */
4096 tg_load -= gcfs_rq->tg_load_avg_contrib;
4100 * We need to compute a correction term in the case that the
4101 * task group is consuming more CPU than a task of equal
4102 * weight. A task with a weight equal to tg->shares will have
4103 * a load less than or equal to scale_load_down(tg->shares).
4104 * Similarly, the sched_entities that represent the task group
4105 * at parent level, can't have a load higher than
4106 * scale_load_down(tg->shares). And the Sum of sched_entities'
4107 * load must be <= scale_load_down(tg->shares).
4109 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
4110 /* scale gcfs_rq's load into tg's shares */
4111 load *= scale_load_down(gcfs_rq->tg->shares);
4116 delta = load - se->avg.load_avg;
4118 /* Nothing to update */
4122 /* Set new sched_entity's load */
4123 se->avg.load_avg = load;
4124 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
4126 /* Update parent cfs_rq load */
4127 add_positive(&cfs_rq->avg.load_avg, delta);
4128 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
4131 * If the sched_entity is already enqueued, we also have to update the
4132 * runnable load avg.
4135 /* Update parent cfs_rq runnable_load_avg */
4136 add_positive(&cfs_rq->runnable_load_avg, delta);
4137 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
4141 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
4143 cfs_rq->propagate_avg = 1;
4146 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
4148 struct cfs_rq *cfs_rq = group_cfs_rq(se);
4150 if (!cfs_rq->propagate_avg)
4153 cfs_rq->propagate_avg = 0;
4157 /* Update task and its cfs_rq load average */
4158 static inline int propagate_entity_load_avg(struct sched_entity *se)
4160 struct cfs_rq *cfs_rq;
4162 if (entity_is_task(se))
4165 if (!test_and_clear_tg_cfs_propagate(se))
4168 cfs_rq = cfs_rq_of(se);
4170 set_tg_cfs_propagate(cfs_rq);
4172 update_tg_cfs_util(cfs_rq, se);
4173 update_tg_cfs_load(cfs_rq, se);
4178 #else /* CONFIG_FAIR_GROUP_SCHED */
4180 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
4182 static inline int propagate_entity_load_avg(struct sched_entity *se)
4187 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
4189 #endif /* CONFIG_FAIR_GROUP_SCHED */
4191 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
4193 if (&this_rq()->cfs == cfs_rq) {
4195 * There are a few boundary cases this might miss but it should
4196 * get called often enough that it should (hopefully) not be
4197 * a real problem -- added to that it only calls on the local
4198 * CPU, so if we enqueue remotely we'll miss an update, but
4199 * the next tick/schedule should update.
4201 * It will not get called when we go idle, because the idle
4202 * thread is a different class (!fair), nor will the utilization
4203 * number include things like RT tasks.
4205 * As is, the util number is not freq-invariant (we'd have to
4206 * implement arch_scale_freq_capacity() for that).
4210 cpufreq_update_util(rq_of(cfs_rq), 0);
4214 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
4217 * Unsigned subtract and clamp on underflow.
4219 * Explicitly do a load-store to ensure the intermediate value never hits
4220 * memory. This allows lockless observations without ever seeing the negative
4223 #define sub_positive(_ptr, _val) do { \
4224 typeof(_ptr) ptr = (_ptr); \
4225 typeof(*ptr) val = (_val); \
4226 typeof(*ptr) res, var = READ_ONCE(*ptr); \
4230 WRITE_ONCE(*ptr, res); \
4234 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4235 * @now: current time, as per cfs_rq_clock_task()
4236 * @cfs_rq: cfs_rq to update
4237 * @update_freq: should we call cfs_rq_util_change() or will the call do so
4239 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4240 * avg. The immediate corollary is that all (fair) tasks must be attached, see
4241 * post_init_entity_util_avg().
4243 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4245 * Returns true if the load decayed or we removed load.
4247 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4248 * call update_tg_load_avg() when this function returns true.
4251 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4253 struct sched_avg *sa = &cfs_rq->avg;
4254 int decayed, removed = 0, removed_util = 0;
4256 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
4257 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
4258 sub_positive(&sa->load_avg, r);
4259 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
4261 set_tg_cfs_propagate(cfs_rq);
4264 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
4265 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
4266 sub_positive(&sa->util_avg, r);
4267 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
4269 set_tg_cfs_propagate(cfs_rq);
4272 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
4273 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
4275 #ifndef CONFIG_64BIT
4277 cfs_rq->load_last_update_time_copy = sa->last_update_time;
4280 /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
4281 if (cfs_rq == &rq_of(cfs_rq)->cfs)
4282 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
4284 if (update_freq && (decayed || removed_util))
4285 cfs_rq_util_change(cfs_rq);
4287 return decayed || removed;
4291 * Optional action to be done while updating the load average
4293 #define UPDATE_TG 0x1
4294 #define SKIP_AGE_LOAD 0x2
4296 /* Update task and its cfs_rq load average */
4297 static inline void update_load_avg(struct sched_entity *se, int flags)
4299 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4300 u64 now = cfs_rq_clock_task(cfs_rq);
4301 int cpu = cpu_of(rq_of(cfs_rq));
4306 * Track task load average for carrying it to the new CPU after migration, and
4307 * track group sched_entity load average for the task_h_load calc in migration
4309 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
4310 __update_load_avg(now, cpu, &se->avg,
4311 se->on_rq * scale_load_down(se->load.weight),
4312 cfs_rq->curr == se, NULL);
4315 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
4316 decayed |= propagate_entity_load_avg(se);
4318 if (decayed && (flags & UPDATE_TG))
4319 update_tg_load_avg(cfs_rq, 0);
4321 if (entity_is_task(se)) {
4322 #ifdef CONFIG_SCHED_WALT
4323 ptr = (void *)&(task_of(se)->ravg);
4325 trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
4330 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4331 * @cfs_rq: cfs_rq to attach to
4332 * @se: sched_entity to attach
4334 * Must call update_cfs_rq_load_avg() before this, since we rely on
4335 * cfs_rq->avg.last_update_time being current.
4337 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4339 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4340 cfs_rq->avg.load_avg += se->avg.load_avg;
4341 cfs_rq->avg.load_sum += se->avg.load_sum;
4342 cfs_rq->avg.util_avg += se->avg.util_avg;
4343 cfs_rq->avg.util_sum += se->avg.util_sum;
4344 set_tg_cfs_propagate(cfs_rq);
4346 cfs_rq_util_change(cfs_rq);
4350 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4351 * @cfs_rq: cfs_rq to detach from
4352 * @se: sched_entity to detach
4354 * Must call update_cfs_rq_load_avg() before this, since we rely on
4355 * cfs_rq->avg.last_update_time being current.
4357 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4360 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
4361 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
4362 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4363 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4364 set_tg_cfs_propagate(cfs_rq);
4366 cfs_rq_util_change(cfs_rq);
4369 /* Add the load generated by se into cfs_rq's load average */
4371 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4373 struct sched_avg *sa = &se->avg;
4375 cfs_rq->runnable_load_avg += sa->load_avg;
4376 cfs_rq->runnable_load_sum += sa->load_sum;
4378 if (!sa->last_update_time) {
4379 attach_entity_load_avg(cfs_rq, se);
4380 update_tg_load_avg(cfs_rq, 0);
4384 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
4386 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4388 cfs_rq->runnable_load_avg =
4389 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
4390 cfs_rq->runnable_load_sum =
4391 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
4394 #ifndef CONFIG_64BIT
4395 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4397 u64 last_update_time_copy;
4398 u64 last_update_time;
4401 last_update_time_copy = cfs_rq->load_last_update_time_copy;
4403 last_update_time = cfs_rq->avg.last_update_time;
4404 } while (last_update_time != last_update_time_copy);
4406 return last_update_time;
4409 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4411 return cfs_rq->avg.last_update_time;
4416 * Synchronize entity load avg of dequeued entity without locking the rq.
4419 void sync_entity_load_avg(struct sched_entity *se)
4421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4422 u64 last_update_time;
4424 last_update_time = cfs_rq_last_update_time(cfs_rq);
4425 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
4429 * Task first catches up with cfs_rq, and then subtracts
4430 * itself from the cfs_rq (task must be off the queue now).
4432 void remove_entity_load_avg(struct sched_entity *se)
4434 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4437 * tasks cannot exit without having gone through wake_up_new_task() ->
4438 * post_init_entity_util_avg() which will have added things to the
4439 * cfs_rq, so we can remove unconditionally.
4441 * Similarly for groups, they will have passed through
4442 * post_init_entity_util_avg() before unregister_sched_fair_group()
4446 sync_entity_load_avg(se);
4447 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
4448 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
4452 * Update the rq's load with the elapsed running time before entering
4453 * idle. If the last scheduled task is not a CFS task, idle_enter will
4454 * be the only way to update the runnable statistic.
4456 void idle_enter_fair(struct rq *this_rq)
4461 * Update the rq's load with the elapsed idle time before a task is
4462 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
4463 * be the only way to update the runnable statistic.
4465 void idle_exit_fair(struct rq *this_rq)
4469 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
4471 return cfs_rq->runnable_load_avg;
4474 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4476 return cfs_rq->avg.load_avg;
4479 static int idle_balance(struct rq *this_rq);
4481 #else /* CONFIG_SMP */
4484 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4489 #define UPDATE_TG 0x0
4490 #define SKIP_AGE_LOAD 0x0
4492 static inline void update_load_avg(struct sched_entity *se, int not_used1){}
4494 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4496 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4497 static inline void remove_entity_load_avg(struct sched_entity *se) {}
4500 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4502 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4504 static inline int idle_balance(struct rq *rq)
4509 static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4510 struct task_struct *p, int change_cra) { }
4512 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4513 struct task_struct *p, int change_cra) { }
4515 #endif /* CONFIG_SMP */
4517 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
4519 #ifdef CONFIG_SCHEDSTATS
4520 struct task_struct *tsk = NULL;
4522 if (entity_is_task(se))
4525 if (se->statistics.sleep_start) {
4526 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
4531 if (unlikely(delta > se->statistics.sleep_max))
4532 se->statistics.sleep_max = delta;
4534 se->statistics.sleep_start = 0;
4535 se->statistics.sum_sleep_runtime += delta;
4538 account_scheduler_latency(tsk, delta >> 10, 1);
4539 trace_sched_stat_sleep(tsk, delta);
4542 if (se->statistics.block_start) {
4543 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
4548 if (unlikely(delta > se->statistics.block_max))
4549 se->statistics.block_max = delta;
4551 se->statistics.block_start = 0;
4552 se->statistics.sum_sleep_runtime += delta;
4555 if (tsk->in_iowait) {
4556 se->statistics.iowait_sum += delta;
4557 se->statistics.iowait_count++;
4558 trace_sched_stat_iowait(tsk, delta);
4561 trace_sched_stat_blocked(tsk, delta);
4562 trace_sched_blocked_reason(tsk);
4565 * Blocking time is in units of nanosecs, so shift by
4566 * 20 to get a milliseconds-range estimation of the
4567 * amount of time that the task spent sleeping:
4569 if (unlikely(prof_on == SLEEP_PROFILING)) {
4570 profile_hits(SLEEP_PROFILING,
4571 (void *)get_wchan(tsk),
4574 account_scheduler_latency(tsk, delta >> 10, 0);
4580 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4582 #ifdef CONFIG_SCHED_DEBUG
4583 s64 d = se->vruntime - cfs_rq->min_vruntime;
4588 if (d > 3*sysctl_sched_latency)
4589 schedstat_inc(cfs_rq, nr_spread_over);
4594 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4596 u64 vruntime = cfs_rq->min_vruntime;
4599 * The 'current' period is already promised to the current tasks,
4600 * however the extra weight of the new task will slow them down a
4601 * little; place the new task so that it fits in the slot that
4602 * stays open at the end.
4604 if (initial && sched_feat(START_DEBIT))
4605 vruntime += sched_vslice(cfs_rq, se);
4607 /* sleeps up to a single latency don't count. */
4609 unsigned long thresh = sysctl_sched_latency;
4612 * Halve their sleep time's effect, to allow
4613 * for a gentler effect of sleepers:
4615 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4621 /* ensure we never gain time by being placed backwards. */
4622 se->vruntime = max_vruntime(se->vruntime, vruntime);
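/*
 * For illustration: with a 6ms sched latency and GENTLE_FAIR_SLEEPERS set,
 * 'thresh' above is halved to ~3ms, so a waking sleeper is placed at most
 * ~3ms of vruntime behind min_vruntime no matter how long it slept.
 */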
4625 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4628 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4631 * Update the normalized vruntime before updating min_vruntime
4632 * through calling update_curr().
4634 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
4635 se->vruntime += cfs_rq->min_vruntime;
4638 * Update run-time statistics of the 'current'.
4640 update_curr(cfs_rq);
4641 update_load_avg(se, UPDATE_TG);
4642 enqueue_entity_load_avg(cfs_rq, se);
4643 update_cfs_shares(se);
4644 account_entity_enqueue(cfs_rq, se);
4646 if (flags & ENQUEUE_WAKEUP) {
4647 place_entity(cfs_rq, se, 0);
4648 enqueue_sleeper(cfs_rq, se);
4651 update_stats_enqueue(cfs_rq, se);
4652 check_spread(cfs_rq, se);
4653 if (se != cfs_rq->curr)
4654 __enqueue_entity(cfs_rq, se);
4657 if (cfs_rq->nr_running == 1) {
4658 list_add_leaf_cfs_rq(cfs_rq);
4659 check_enqueue_throttle(cfs_rq);
4663 static void __clear_buddies_last(struct sched_entity *se)
4665 for_each_sched_entity(se) {
4666 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4667 if (cfs_rq->last != se)
4670 cfs_rq->last = NULL;
4674 static void __clear_buddies_next(struct sched_entity *se)
4676 for_each_sched_entity(se) {
4677 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4678 if (cfs_rq->next != se)
4681 cfs_rq->next = NULL;
4685 static void __clear_buddies_skip(struct sched_entity *se)
4687 for_each_sched_entity(se) {
4688 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4689 if (cfs_rq->skip != se)
4692 cfs_rq->skip = NULL;
4696 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4698 if (cfs_rq->last == se)
4699 __clear_buddies_last(se);
4701 if (cfs_rq->next == se)
4702 __clear_buddies_next(se);
4704 if (cfs_rq->skip == se)
4705 __clear_buddies_skip(se);
4708 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4711 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4714 * Update run-time statistics of the 'current'.
4716 update_curr(cfs_rq);
4719 * When dequeuing a sched_entity, we must:
4720 * - Update loads to have both entity and cfs_rq synced with now.
4721 * - Subtract its load from the cfs_rq->runnable_avg.
4722 * - Subtract its previous weight from cfs_rq->load.weight.
4723 * - For group entity, update its weight to reflect the new share
4724 * of its group cfs_rq.
4726 update_load_avg(se, UPDATE_TG);
4727 dequeue_entity_load_avg(cfs_rq, se);
4729 update_stats_dequeue(cfs_rq, se);
4730 if (flags & DEQUEUE_SLEEP) {
4731 #ifdef CONFIG_SCHEDSTATS
4732 if (entity_is_task(se)) {
4733 struct task_struct *tsk = task_of(se);
4735 if (tsk->state & TASK_INTERRUPTIBLE)
4736 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
4737 if (tsk->state & TASK_UNINTERRUPTIBLE)
4738 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
4743 clear_buddies(cfs_rq, se);
4745 if (se != cfs_rq->curr)
4746 __dequeue_entity(cfs_rq, se);
4748 account_entity_dequeue(cfs_rq, se);
4751 * Normalize the entity after updating the min_vruntime because the
4752 * update can refer to the ->curr item and we need to reflect this
4753 * movement in our normalized position.
4755 if (!(flags & DEQUEUE_SLEEP))
4756 se->vruntime -= cfs_rq->min_vruntime;
4758 /* return excess runtime on last dequeue */
4759 return_cfs_rq_runtime(cfs_rq);
4761 update_min_vruntime(cfs_rq);
4762 update_cfs_shares(se);
4766 * Preempt the current task with a newly woken task if needed:
4769 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4771 unsigned long ideal_runtime, delta_exec;
4772 struct sched_entity *se;
4775 ideal_runtime = sched_slice(cfs_rq, curr);
4776 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4777 if (delta_exec > ideal_runtime) {
4778 resched_curr(rq_of(cfs_rq));
4780 * The current task ran long enough, ensure it doesn't get
4781 * re-elected due to buddy favours.
4783 clear_buddies(cfs_rq, curr);
4788 * Ensure that a task that missed wakeup preemption by a
4789 * narrow margin doesn't have to wait for a full slice.
4790 * This also mitigates buddy induced latencies under load.
4792 if (delta_exec < sysctl_sched_min_granularity)
4795 se = __pick_first_entity(cfs_rq);
4796 delta = curr->vruntime - se->vruntime;
4801 if (delta > ideal_runtime)
4802 resched_curr(rq_of(cfs_rq));
4806 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4808 /* 'current' is not kept within the tree. */
4811 * Any task has to be enqueued before it gets to execute on
4812 * a CPU. So account for the time it spent waiting on the runqueue.
4815 update_stats_wait_end(cfs_rq, se);
4816 __dequeue_entity(cfs_rq, se);
4817 update_load_avg(se, UPDATE_TG);
4820 update_stats_curr_start(cfs_rq, se);
4822 #ifdef CONFIG_SCHEDSTATS
4824 * Track our maximum slice length, if the CPU's load is at
4825 * least twice that of our own weight (i.e. don't track it
4826 * when there are only lesser-weight tasks around):
4828 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4829 se->statistics.slice_max = max(se->statistics.slice_max,
4830 se->sum_exec_runtime - se->prev_sum_exec_runtime);
4833 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4837 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4840 * Pick the next process, keeping these things in mind, in this order:
4841 * 1) keep things fair between processes/task groups
4842 * 2) pick the "next" process, since someone really wants that to run
4843 * 3) pick the "last" process, for cache locality
4844 * 4) do not run the "skip" process, if something else is available
4846 static struct sched_entity *
4847 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4849 struct sched_entity *left = __pick_first_entity(cfs_rq);
4850 struct sched_entity *se;
4853 * If curr is set we have to see if it's left of the leftmost entity
4854 * still in the tree, provided there was anything in the tree at all.
4856 if (!left || (curr && entity_before(curr, left)))
4859 se = left; /* ideally we run the leftmost entity */
4862 * Avoid running the skip buddy, if running something else can
4863 * be done without getting too unfair.
4865 if (cfs_rq->skip == se) {
4866 struct sched_entity *second;
4869 second = __pick_first_entity(cfs_rq);
4871 second = __pick_next_entity(se);
4872 if (!second || (curr && entity_before(curr, second)))
4876 if (second && wakeup_preempt_entity(second, left) < 1)
4881 * Prefer last buddy, try to return the CPU to a preempted task.
4883 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4887 * Someone really wants this to run. If it's not unfair, run it.
4889 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4892 clear_buddies(cfs_rq, se);
4897 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4899 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4902 * If still on the runqueue then deactivate_task()
4903 * was not called and update_curr() has to be done:
4906 update_curr(cfs_rq);
4908 /* throttle cfs_rqs exceeding runtime */
4909 check_cfs_rq_runtime(cfs_rq);
4911 check_spread(cfs_rq, prev);
4913 update_stats_wait_start(cfs_rq, prev);
4914 /* Put 'current' back into the tree. */
4915 __enqueue_entity(cfs_rq, prev);
4916 /* in !on_rq case, update occurred at dequeue */
4917 update_load_avg(prev, 0);
4919 cfs_rq->curr = NULL;
4923 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4926 * Update run-time statistics of the 'current'.
4928 update_curr(cfs_rq);
4931 * Ensure that runnable average is periodically updated.
4933 update_load_avg(curr, UPDATE_TG);
4934 update_cfs_shares(curr);
4936 #ifdef CONFIG_SCHED_HRTICK
4938 * queued ticks are scheduled to match the slice, so don't bother
4939 * validating it and just reschedule.
4942 resched_curr(rq_of(cfs_rq));
4946 * don't let the period tick interfere with the hrtick preemption
4948 if (!sched_feat(DOUBLE_TICK) &&
4949 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4953 if (cfs_rq->nr_running > 1)
4954 check_preempt_tick(cfs_rq, curr);
4958 /**************************************************
4959 * CFS bandwidth control machinery
4962 #ifdef CONFIG_CFS_BANDWIDTH
4964 #ifdef HAVE_JUMP_LABEL
4965 static struct static_key __cfs_bandwidth_used;
4967 static inline bool cfs_bandwidth_used(void)
4969 return static_key_false(&__cfs_bandwidth_used);
4972 void cfs_bandwidth_usage_inc(void)
4974 static_key_slow_inc(&__cfs_bandwidth_used);
4977 void cfs_bandwidth_usage_dec(void)
4979 static_key_slow_dec(&__cfs_bandwidth_used);
4981 #else /* HAVE_JUMP_LABEL */
4982 static bool cfs_bandwidth_used(void)
4987 void cfs_bandwidth_usage_inc(void) {}
4988 void cfs_bandwidth_usage_dec(void) {}
4989 #endif /* HAVE_JUMP_LABEL */
4992 * default period for cfs group bandwidth.
4993 * default: 0.1s, units: nanoseconds
4995 static inline u64 default_cfs_period(void)
4997 return 100000000ULL;
5000 static inline u64 sched_cfs_bandwidth_slice(void)
5002 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
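/*
 * Rough example, assuming the usual sysctl_sched_cfs_bandwidth_slice default
 * of 5000us: each refill moves 5ms of runtime from the global (tg) pool into
 * the requesting cfs_rq's local pool.
 */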
5006 * Replenish runtime according to assigned quota and update expiration time.
5007 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
5008 * additional synchronization around rq->lock.
5010 * requires cfs_b->lock
5012 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5016 if (cfs_b->quota == RUNTIME_INF)
5019 now = sched_clock_cpu(smp_processor_id());
5020 cfs_b->runtime = cfs_b->quota;
5021 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
5024 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5026 return &tg->cfs_bandwidth;
5029 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
5030 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5032 if (unlikely(cfs_rq->throttle_count))
5033 return cfs_rq->throttled_clock_task;
5035 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
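/*
 * In other words: while throttled the task clock stays pinned at
 * throttled_clock_task, and once running again the accumulated throttled
 * time is subtracted, so entities are neither charged nor decayed for time
 * they were not allowed to run.
 */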
5038 /* returns 0 on failure to allocate runtime */
5039 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5041 struct task_group *tg = cfs_rq->tg;
5042 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
5043 u64 amount = 0, min_amount, expires;
5045 /* note: this is a positive sum as runtime_remaining <= 0 */
5046 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
5048 raw_spin_lock(&cfs_b->lock);
5049 if (cfs_b->quota == RUNTIME_INF)
5050 amount = min_amount;
5052 start_cfs_bandwidth(cfs_b);
5054 if (cfs_b->runtime > 0) {
5055 amount = min(cfs_b->runtime, min_amount);
5056 cfs_b->runtime -= amount;
5060 expires = cfs_b->runtime_expires;
5061 raw_spin_unlock(&cfs_b->lock);
5063 cfs_rq->runtime_remaining += amount;
5065 * we may have advanced our local expiration to account for allowed
5066 * spread between our sched_clock and the one on which runtime was issued.
5069 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
5070 cfs_rq->runtime_expires = expires;
5072 return cfs_rq->runtime_remaining > 0;
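/*
 * Worked example of the slice math above: with the local pool 2ms in debt
 * (runtime_remaining == -2ms) and a 5ms slice, min_amount is 7ms; if the
 * global pool can cover it, runtime_remaining ends up at exactly one full
 * slice (+5ms).
 */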
5076 * Note: This depends on the synchronization provided by sched_clock and the
5077 * fact that rq->clock snapshots this value.
5079 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5081 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5083 /* if the deadline is ahead of our clock, nothing to do */
5084 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
5087 if (cfs_rq->runtime_remaining < 0)
5091 * If the local deadline has passed we have to consider the
5092 * possibility that our sched_clock is 'fast' and the global deadline
5093 * has not truly expired.
5095 * Fortunately we can determine whether this is the case by checking
5096 * whether the global deadline has advanced. It is valid to compare
5097 * cfs_b->runtime_expires without any locks since we only care about
5098 * exact equality, so a partial write will still work.
5101 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
5102 /* extend local deadline, drift is bounded above by 2 ticks */
5103 cfs_rq->runtime_expires += TICK_NSEC;
5105 /* global deadline is ahead, expiration has passed */
5106 cfs_rq->runtime_remaining = 0;
5110 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5112 /* dock delta_exec before expiring quota (as it could span periods) */
5113 cfs_rq->runtime_remaining -= delta_exec;
5114 expire_cfs_rq_runtime(cfs_rq);
5116 if (likely(cfs_rq->runtime_remaining > 0))
5120 * if we're unable to extend our runtime we resched so that the active
5121 * hierarchy can be throttled
5123 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5124 resched_curr(rq_of(cfs_rq));
5127 static __always_inline
5128 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5130 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5133 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5136 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5138 return cfs_bandwidth_used() && cfs_rq->throttled;
5141 #ifdef CONFIG_SCHED_HMP
5143 * Check if task is part of a hierarchy where some cfs_rq does not have any runtime left.
5146 * We can't rely on throttled_hierarchy() to do this test, as
5147 * cfs_rq->throttle_count will not be updated yet when this function is called
5148 * from scheduler_tick()
5150 static int task_will_be_throttled(struct task_struct *p)
5152 struct sched_entity *se = &p->se;
5153 struct cfs_rq *cfs_rq;
5155 if (!cfs_bandwidth_used())
5158 for_each_sched_entity(se) {
5159 cfs_rq = cfs_rq_of(se);
5160 if (!cfs_rq->runtime_enabled)
5162 if (cfs_rq->runtime_remaining <= 0)
5170 /* check whether cfs_rq, or any parent, is throttled */
5171 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5173 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5177 * Ensure that neither of the group entities corresponding to src_cpu or
5178 * dest_cpu are members of a throttled hierarchy when performing group
5179 * load-balance operations.
5181 static inline int throttled_lb_pair(struct task_group *tg,
5182 int src_cpu, int dest_cpu)
5184 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5186 src_cfs_rq = tg->cfs_rq[src_cpu];
5187 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5189 return throttled_hierarchy(src_cfs_rq) ||
5190 throttled_hierarchy(dest_cfs_rq);
5193 /* updated child weight may affect parent so we have to do this bottom up */
5194 static int tg_unthrottle_up(struct task_group *tg, void *data)
5196 struct rq *rq = data;
5197 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5199 cfs_rq->throttle_count--;
5201 if (!cfs_rq->throttle_count) {
5202 /* adjust cfs_rq_clock_task() */
5203 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
5204 cfs_rq->throttled_clock_task;
5211 static int tg_throttle_down(struct task_group *tg, void *data)
5213 struct rq *rq = data;
5214 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5216 /* group is entering throttled state, stop time */
5217 if (!cfs_rq->throttle_count)
5218 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5219 cfs_rq->throttle_count++;
5224 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
5226 struct rq *rq = rq_of(cfs_rq);
5227 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5228 struct sched_entity *se;
5229 long task_delta, dequeue = 1;
5232 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5234 /* freeze hierarchy runnable averages while throttled */
5236 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5239 task_delta = cfs_rq->h_nr_running;
5240 for_each_sched_entity(se) {
5241 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5242 /* throttled entity or throttle-on-deactivate */
5247 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5248 qcfs_rq->h_nr_running -= task_delta;
5249 dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
5251 if (qcfs_rq->load.weight)
5256 sub_nr_running(rq, task_delta);
5257 dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
5260 cfs_rq->throttled = 1;
5261 cfs_rq->throttled_clock = rq_clock(rq);
5262 raw_spin_lock(&cfs_b->lock);
5263 empty = list_empty(&cfs_b->throttled_cfs_rq);
5266 * Add to the _head_ of the list, so that an already-started
5267 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
5268 * not running, add to the tail so that later runqueues don't get starved.
5270 if (cfs_b->distribute_running)
5271 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5273 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5276 * If we're the first throttled task, make sure the bandwidth timer is running.
5280 start_cfs_bandwidth(cfs_b);
5282 raw_spin_unlock(&cfs_b->lock);
5284 /* Log effect on hmp stats after throttling */
5285 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5286 sched_irqload(cpu_of(rq)),
5287 power_cost(cpu_of(rq), 0),
5288 cpu_temp(cpu_of(rq)));
5291 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5293 struct rq *rq = rq_of(cfs_rq);
5294 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5295 struct sched_entity *se;
5298 struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
5300 se = cfs_rq->tg->se[cpu_of(rq)];
5302 cfs_rq->throttled = 0;
5304 update_rq_clock(rq);
5306 raw_spin_lock(&cfs_b->lock);
5307 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5308 list_del_rcu(&cfs_rq->throttled_list);
5309 raw_spin_unlock(&cfs_b->lock);
5311 /* update hierarchical throttle state */
5312 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5314 if (!cfs_rq->load.weight)
5317 task_delta = cfs_rq->h_nr_running;
5318 for_each_sched_entity(se) {
5322 cfs_rq = cfs_rq_of(se);
5324 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
5325 cfs_rq->h_nr_running += task_delta;
5326 inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
5328 if (cfs_rq_throttled(cfs_rq))
5333 add_nr_running(rq, task_delta);
5334 inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
5337 /* determine whether we need to wake up potentially idle cpu */
5338 if (rq->curr == rq->idle && rq->cfs.nr_running)
5341 /* Log effect on hmp stats after un-throttling */
5342 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5343 sched_irqload(cpu_of(rq)),
5344 power_cost(cpu_of(rq), 0),
5345 cpu_temp(cpu_of(rq)));
5348 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
5349 u64 remaining, u64 expires)
5351 struct cfs_rq *cfs_rq;
5353 u64 starting_runtime = remaining;
5356 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5358 struct rq *rq = rq_of(cfs_rq);
5360 raw_spin_lock(&rq->lock);
5361 if (!cfs_rq_throttled(cfs_rq))
5364 runtime = -cfs_rq->runtime_remaining + 1;
5365 if (runtime > remaining)
5366 runtime = remaining;
5367 remaining -= runtime;
5369 cfs_rq->runtime_remaining += runtime;
5370 cfs_rq->runtime_expires = expires;
5372 /* we check whether we're throttled above */
5373 if (cfs_rq->runtime_remaining > 0)
5374 unthrottle_cfs_rq(cfs_rq);
5377 raw_spin_unlock(&rq->lock);
5384 return starting_runtime - remaining;
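/*
 * To summarize the loop above: each throttled cfs_rq on the list is topped
 * up just past zero (runtime = -runtime_remaining + 1) and unthrottled,
 * until the budget passed in as 'remaining' runs out; the return value is
 * how much of that budget was actually handed out.
 */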
5388 * Responsible for refilling a task_group's bandwidth and unthrottling its
5389 * cfs_rqs as appropriate. If there has been no activity within the last
5390 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5391 * used to track this state.
5393 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5395 u64 runtime, runtime_expires;
5398 /* no need to continue the timer with no bandwidth constraint */
5399 if (cfs_b->quota == RUNTIME_INF)
5400 goto out_deactivate;
5402 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5403 cfs_b->nr_periods += overrun;
5406 * idle depends on !throttled (for the case of a large deficit), and if
5407 * we're going inactive then everything else can be deferred
5409 if (cfs_b->idle && !throttled)
5410 goto out_deactivate;
5412 __refill_cfs_bandwidth_runtime(cfs_b);
5415 /* mark as potentially idle for the upcoming period */
5420 /* account preceding periods in which throttling occurred */
5421 cfs_b->nr_throttled += overrun;
5423 runtime_expires = cfs_b->runtime_expires;
5426 * This check is repeated as we are holding onto the new bandwidth while
5427 * we unthrottle. This can potentially race with an unthrottled group
5428 * trying to acquire new bandwidth from the global pool. This can result
5429 * in us over-using our runtime if it is all used during this loop, but
5430 * only by limited amounts in that extreme case.
5432 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
5433 runtime = cfs_b->runtime;
5434 cfs_b->distribute_running = 1;
5435 raw_spin_unlock(&cfs_b->lock);
5436 /* we can't nest cfs_b->lock while distributing bandwidth */
5437 runtime = distribute_cfs_runtime(cfs_b, runtime,
5439 raw_spin_lock(&cfs_b->lock);
5441 cfs_b->distribute_running = 0;
5442 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5444 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5448 * While we are ensured activity in the period following an
5449 * unthrottle, this also covers the case in which the new bandwidth is
5450 * insufficient to cover the existing bandwidth deficit. (Forcing the
5451 * timer to remain active while there are any throttled entities.)
5461 /* a cfs_rq won't donate quota below this amount */
5462 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5463 /* minimum remaining period time to redistribute slack quota */
5464 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5465 /* how long we wait to gather additional slack before distributing */
5466 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
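/*
 * Rough sketch of how these three constants interact (see the slack path
 * below): on its final dequeue a cfs_rq keeps min_cfs_rq_runtime (1ms) and
 * returns the rest to the global pool; if the pool then exceeds a slice and
 * something is throttled, the slack timer is armed for
 * cfs_bandwidth_slack_period (5ms), unless a period refresh is due within
 * min_bandwidth_expiration (2ms) anyway.
 */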
5469 * Are we near the end of the current quota period?
5471 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
5472 * hrtimer base being cleared by hrtimer_start. In the case of
5473 * migrate_hrtimers, base is never cleared, so we are fine.
5475 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5477 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5480 /* if the call-back is running a quota refresh is already occurring */
5481 if (hrtimer_callback_running(refresh_timer))
5484 /* is a quota refresh about to occur? */
5485 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5486 if (remaining < min_expire)
5492 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5494 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5496 /* if there's a quota refresh soon don't bother with slack */
5497 if (runtime_refresh_within(cfs_b, min_left))
5500 hrtimer_start(&cfs_b->slack_timer,
5501 ns_to_ktime(cfs_bandwidth_slack_period),
5505 /* we know any runtime found here is valid as update_curr() precedes return */
5506 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5508 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5509 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5511 if (slack_runtime <= 0)
5514 raw_spin_lock(&cfs_b->lock);
5515 if (cfs_b->quota != RUNTIME_INF &&
5516 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
5517 cfs_b->runtime += slack_runtime;
5519 /* we are under rq->lock, defer unthrottling using a timer */
5520 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5521 !list_empty(&cfs_b->throttled_cfs_rq))
5522 start_cfs_slack_bandwidth(cfs_b);
5524 raw_spin_unlock(&cfs_b->lock);
5526 /* even if it's not valid for return we don't want to try again */
5527 cfs_rq->runtime_remaining -= slack_runtime;
5530 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5532 if (!cfs_bandwidth_used())
5535 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5538 __return_cfs_rq_runtime(cfs_rq);
5542 * This is done with a timer (instead of inline with bandwidth return) since
5543 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5545 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5547 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5550 /* confirm we're still not at a refresh boundary */
5551 raw_spin_lock(&cfs_b->lock);
5552 if (cfs_b->distribute_running) {
5553 raw_spin_unlock(&cfs_b->lock);
5557 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5558 raw_spin_unlock(&cfs_b->lock);
5562 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5563 runtime = cfs_b->runtime;
5565 expires = cfs_b->runtime_expires;
5567 cfs_b->distribute_running = 1;
5569 raw_spin_unlock(&cfs_b->lock);
5574 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
5576 raw_spin_lock(&cfs_b->lock);
5577 if (expires == cfs_b->runtime_expires)
5578 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5579 cfs_b->distribute_running = 0;
5580 raw_spin_unlock(&cfs_b->lock);
5584 * When a group wakes up we want to make sure that its quota is not already
5585 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
5586 * runtime as update_curr() throttling cannot trigger until it's on-rq.
5588 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5590 if (!cfs_bandwidth_used())
5593 /* Synchronize hierarchical throttle counter: */
5594 if (unlikely(!cfs_rq->throttle_uptodate)) {
5595 struct rq *rq = rq_of(cfs_rq);
5596 struct cfs_rq *pcfs_rq;
5597 struct task_group *tg;
5599 cfs_rq->throttle_uptodate = 1;
5601 /* Get closest up-to-date node, because leaves go first: */
5602 for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
5603 pcfs_rq = tg->cfs_rq[cpu_of(rq)];
5604 if (pcfs_rq->throttle_uptodate)
5608 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5609 cfs_rq->throttled_clock_task = rq_clock_task(rq);
5613 /* an active group must be handled by the update_curr()->put() path */
5614 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5617 /* ensure the group is not already throttled */
5618 if (cfs_rq_throttled(cfs_rq))
5621 /* update runtime allocation */
5622 account_cfs_rq_runtime(cfs_rq, 0);
5623 if (cfs_rq->runtime_remaining <= 0)
5624 throttle_cfs_rq(cfs_rq);
5627 /* conditionally throttle active cfs_rq's from put_prev_entity() */
5628 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5630 if (!cfs_bandwidth_used())
5633 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5637 * it's possible for a throttled entity to be forced into a running
5638 * state (e.g. set_curr_task); in this case we're finished.
5640 if (cfs_rq_throttled(cfs_rq))
5643 throttle_cfs_rq(cfs_rq);
5647 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5649 struct cfs_bandwidth *cfs_b =
5650 container_of(timer, struct cfs_bandwidth, slack_timer);
5652 do_sched_cfs_slack_timer(cfs_b);
5654 return HRTIMER_NORESTART;
5657 extern const u64 max_cfs_quota_period;
5659 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5661 struct cfs_bandwidth *cfs_b =
5662 container_of(timer, struct cfs_bandwidth, period_timer);
5667 raw_spin_lock(&cfs_b->lock);
5669 overrun = hrtimer_forward_now(timer, cfs_b->period);
5674 u64 new, old = ktime_to_ns(cfs_b->period);
5676 new = (old * 147) / 128; /* ~115% */
5677 new = min(new, max_cfs_quota_period);
5679 cfs_b->period = ns_to_ktime(new);
5681 /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
5682 cfs_b->quota *= new;
5683 cfs_b->quota = div64_u64(cfs_b->quota, old);
5685 pr_warn_ratelimited(
5686 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
5688 div_u64(new, NSEC_PER_USEC),
5689 div_u64(cfs_b->quota, NSEC_PER_USEC));
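/*
 * For illustration: a 1ms period becomes 1ms * 147/128 ~= 1.15ms and the
 * quota is scaled by the same 147/128 factor, so the permitted bandwidth
 * (quota/period) is unchanged while the timer fires less often.
 */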
5691 /* reset count so we don't come right back in here */
5695 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5698 cfs_b->period_active = 0;
5699 raw_spin_unlock(&cfs_b->lock);
5701 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5704 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5706 raw_spin_lock_init(&cfs_b->lock);
5708 cfs_b->quota = RUNTIME_INF;
5709 cfs_b->period = ns_to_ktime(default_cfs_period());
5711 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5712 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5713 cfs_b->period_timer.function = sched_cfs_period_timer;
5714 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5715 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5716 cfs_b->distribute_running = 0;
5719 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5721 cfs_rq->runtime_enabled = 0;
5722 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5723 init_cfs_rq_hmp_stats(cfs_rq);
5726 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5728 lockdep_assert_held(&cfs_b->lock);
5730 if (!cfs_b->period_active) {
5731 cfs_b->period_active = 1;
5732 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5733 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5737 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5739 /* init_cfs_bandwidth() was not called */
5740 if (!cfs_b->throttled_cfs_rq.next)
5743 hrtimer_cancel(&cfs_b->period_timer);
5744 hrtimer_cancel(&cfs_b->slack_timer);
5747 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5749 struct cfs_rq *cfs_rq;
5751 for_each_leaf_cfs_rq(rq, cfs_rq) {
5752 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
5754 raw_spin_lock(&cfs_b->lock);
5755 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5756 raw_spin_unlock(&cfs_b->lock);
5760 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5762 struct cfs_rq *cfs_rq;
5764 for_each_leaf_cfs_rq(rq, cfs_rq) {
5765 if (!cfs_rq->runtime_enabled)
5769 * clock_task is not advancing so we just need to make sure
5770 * there's some valid quota amount
5772 cfs_rq->runtime_remaining = 1;
5774 * Offline rq is schedulable till cpu is completely disabled
5775 * in take_cpu_down(), so we prevent new cfs throttling here.
5777 cfs_rq->runtime_enabled = 0;
5779 if (cfs_rq_throttled(cfs_rq))
5780 unthrottle_cfs_rq(cfs_rq);
5784 #else /* CONFIG_CFS_BANDWIDTH */
5785 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5787 return rq_clock_task(rq_of(cfs_rq));
5790 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5791 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5792 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5793 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5795 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5800 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5805 static inline int throttled_lb_pair(struct task_group *tg,
5806 int src_cpu, int dest_cpu)
5811 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5813 #ifdef CONFIG_FAIR_GROUP_SCHED
5814 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5817 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5821 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5822 static inline void update_runtime_enabled(struct rq *rq) {}
5823 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5825 #endif /* CONFIG_CFS_BANDWIDTH */
5827 /**************************************************
5828 * CFS operations on tasks:
5831 #ifdef CONFIG_SCHED_HRTICK
5832 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5834 struct sched_entity *se = &p->se;
5835 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5837 WARN_ON(task_rq(p) != rq);
5839 if (rq->cfs.h_nr_running > 1) {
5840 u64 slice = sched_slice(cfs_rq, se);
5841 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5842 s64 delta = slice - ran;
5849 hrtick_start(rq, delta);
5854 * called from enqueue/dequeue and updates the hrtick when the
5855 * current task is from our class.
5857 static void hrtick_update(struct rq *rq)
5859 struct task_struct *curr = rq->curr;
5861 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5864 hrtick_start_fair(rq, curr);
5866 #else /* !CONFIG_SCHED_HRTICK */
5868 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5872 static inline void hrtick_update(struct rq *rq)
5878 static bool __cpu_overutilized(int cpu, int delta);
5879 static bool cpu_overutilized(int cpu);
5880 unsigned long boosted_cpu_util(int cpu);
5882 #define boosted_cpu_util(cpu) cpu_util_freq(cpu)
5886 * The enqueue_task method is called before nr_running is
5887 * increased. Here we update the fair scheduling stats and
5888 * then put the task into the rbtree:
5891 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5893 struct cfs_rq *cfs_rq;
5894 struct sched_entity *se = &p->se;
5896 int task_new = flags & ENQUEUE_WAKEUP_NEW;
5900 * If in_iowait is set, the code below may not trigger any cpufreq
5901 * utilization updates, so do it here explicitly with the IOWAIT flag
5905 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
5907 for_each_sched_entity(se) {
5910 cfs_rq = cfs_rq_of(se);
5911 enqueue_entity(cfs_rq, se, flags);
5914 * end evaluation on encountering a throttled cfs_rq
5916 * note: in the case of encountering a throttled cfs_rq we will
5917 * post the final h_nr_running increment below.
5919 if (cfs_rq_throttled(cfs_rq))
5921 cfs_rq->h_nr_running++;
5922 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5924 flags = ENQUEUE_WAKEUP;
5927 for_each_sched_entity(se) {
5928 cfs_rq = cfs_rq_of(se);
5929 cfs_rq->h_nr_running++;
5930 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
5932 if (cfs_rq_throttled(cfs_rq))
5935 update_load_avg(se, UPDATE_TG);
5936 update_cfs_shares(se);
5940 add_nr_running(rq, 1);
5941 inc_rq_hmp_stats(rq, p, 1);
5947 * Update SchedTune accounting.
5949 * We do it before updating the CPU capacity to ensure the
5950 * boost value of the current task is accounted for in the
5951 * selection of the OPP.
5953 * We do it also in the case where we enqueue a throttled task;
5954 * we could argue that a throttled task should not boost a CPU; however:
5956 * a) properly implementing CPU boosting considering throttled
5957 * tasks would greatly increase the complexity of the solution
5958 * b) it's not easy to quantify the benefits introduced by
5959 * such a more complex solution.
5960 * Thus, for the time being we go for the simple solution and boost
5961 * also for throttled RQs.
5963 schedtune_enqueue_task(p, cpu_of(rq));
5965 if (energy_aware() && !se) {
5966 if (!task_new && !rq->rd->overutilized &&
5967 cpu_overutilized(rq->cpu)) {
5968 rq->rd->overutilized = true;
5969 trace_sched_overutilized(true);
5973 #endif /* CONFIG_SMP */
5977 static void set_next_buddy(struct sched_entity *se);
5980 * The dequeue_task method is called before nr_running is
5981 * decreased. We remove the task from the rbtree and
5982 * update the fair scheduling stats:
5984 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5986 struct cfs_rq *cfs_rq;
5987 struct sched_entity *se = &p->se;
5988 int task_sleep = flags & DEQUEUE_SLEEP;
5990 for_each_sched_entity(se) {
5991 cfs_rq = cfs_rq_of(se);
5992 dequeue_entity(cfs_rq, se, flags);
5995 * end evaluation on encountering a throttled cfs_rq
5997 * note: in the case of encountering a throttled cfs_rq we will
5998 * post the final h_nr_running decrement below.
6000 if (cfs_rq_throttled(cfs_rq))
6002 cfs_rq->h_nr_running--;
6003 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
6005 /* Don't dequeue parent if it has other entities besides us */
6006 if (cfs_rq->load.weight) {
6007 /* Avoid re-evaluating load for this entity: */
6008 se = parent_entity(se);
6010 * Bias pick_next to pick a task from this cfs_rq, as
6011 * p is sleeping when it is within its sched_slice.
6013 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
6017 flags |= DEQUEUE_SLEEP;
6020 for_each_sched_entity(se) {
6021 cfs_rq = cfs_rq_of(se);
6022 cfs_rq->h_nr_running--;
6023 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
6025 if (cfs_rq_throttled(cfs_rq))
6028 update_load_avg(se, UPDATE_TG);
6029 update_cfs_shares(se);
6033 sub_nr_running(rq, 1);
6034 dec_rq_hmp_stats(rq, p, 1);
6040 * Update SchedTune accounting
6042 * We do it before updating the CPU capacity to ensure the
6043 * boost value of the current task is accounted for in the
6044 * selection of the OPP.
6046 schedtune_dequeue_task(p, cpu_of(rq));
6048 #endif /* CONFIG_SMP */
6056 * per rq 'load' array crap; XXX kill this.
6060 * The exact cpuload at various idx values, calculated at every tick would be
6061 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
6063 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
6064 * on nth tick when cpu may be busy, then we have:
6065 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6066 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
6068 * decay_load_missed() below does efficient calculation of
6069 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
6070 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
6072 * The calculation is approximated on a 128 point scale.
6073 * degrade_zero_ticks is the number of ticks after which load at any
6074 * particular idx is approximated to be zero.
6075 * degrade_factor is a precomputed table, a row for each load idx.
6076 * Each column corresponds to degradation factor for a power of two ticks,
6077 * based on 128 point scale.
6079 * row 2, col 3 (=12) says that the degradation at load idx 2 after
6080 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
6082 * With this power of 2 load factors, we can degrade the load n times
6083 * by looking at 1 bits in n and doing as many mult/shift instead of
6084 * n mult/shifts needed by the exact degradation.
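/*
 * Worked example, using the table below: at load idx 2 with 5 missed ticks
 * (binary 101), load is scaled by degrade_factor[2][0] * degrade_factor[2][2]
 * = (96/128) * (40/128) ~= 0.234, close to the exact (3/4)^5 ~= 0.237.
 */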
6086 #define DEGRADE_SHIFT 7
6087 static const unsigned char
6088 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
6089 static const unsigned char
6090 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
6091 {0, 0, 0, 0, 0, 0, 0, 0},
6092 {64, 32, 8, 0, 0, 0, 0, 0},
6093 {96, 72, 40, 12, 1, 0, 0},
6094 {112, 98, 75, 43, 15, 1, 0},
6095 {120, 112, 98, 76, 45, 16, 2} };
6098 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
6099 * case is when the CPU was idle, so we just decay the old load without
6100 * adding any new load.
6102 static unsigned long
6103 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
6107 if (!missed_updates)
6110 if (missed_updates >= degrade_zero_ticks[idx])
6114 return load >> missed_updates;
6116 while (missed_updates) {
6117 if (missed_updates % 2)
6118 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
6120 missed_updates >>= 1;
6127 * Update rq->cpu_load[] statistics. This function is usually called every
6128 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
6129 * every tick. We fix it up based on jiffies.
6131 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
6132 unsigned long pending_updates)
6136 this_rq->nr_load_updates++;
6138 /* Update our load: */
6139 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
6140 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
6141 unsigned long old_load, new_load;
6143 /* scale is effectively 1 << i now, and >> i divides by scale */
6145 old_load = this_rq->cpu_load[i];
6146 old_load = decay_load_missed(old_load, pending_updates - 1, i);
6147 new_load = this_load;
6149 * Round up the averaging division if load is increasing. This
6150 * prevents us from getting stuck on 9 if the load is 10, for example.
6153 if (new_load > old_load)
6154 new_load += scale - 1;
6156 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
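/*
 * Tiny example of that rounding, at idx 1 (scale 2): old_load 9 and
 * new_load 10 would average to (9 + 10) >> 1 = 9 forever; bumping new_load
 * to 11 gives (9 + 11) >> 1 = 10, so the average can actually reach the
 * higher load.
 */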
6159 sched_avg_update(this_rq);
6162 /* Used instead of source_load when we know the type == 0 */
6163 static unsigned long weighted_cpuload(const int cpu)
6165 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
6168 #ifdef CONFIG_NO_HZ_COMMON
6170 * There is no sane way to deal with nohz on smp when using jiffies because the
6171 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
6172 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
6174 * Therefore we cannot use the delta approach from the regular tick since that
6175 * would seriously skew the load calculation. However we'll make do for those
6176 * updates happening while idle (nohz_idle_balance) or coming out of idle
6177 * (tick_nohz_idle_exit).
6179 * This means we might still be one tick off for nohz periods.
6183 * Called from nohz_idle_balance() to update the load ratings before doing the idle balance.
6186 static void update_idle_cpu_load(struct rq *this_rq)
6188 unsigned long curr_jiffies = READ_ONCE(jiffies);
6189 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6190 unsigned long pending_updates;
6193 * bail if there's load or we're actually up-to-date.
6195 if (load || curr_jiffies == this_rq->last_load_update_tick)
6198 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6199 this_rq->last_load_update_tick = curr_jiffies;
6201 __update_cpu_load(this_rq, load, pending_updates);
6205 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
6207 void update_cpu_load_nohz(void)
6209 struct rq *this_rq = this_rq();
6210 unsigned long curr_jiffies = READ_ONCE(jiffies);
6211 unsigned long pending_updates;
6213 if (curr_jiffies == this_rq->last_load_update_tick)
6216 raw_spin_lock(&this_rq->lock);
6217 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
6218 if (pending_updates) {
6219 this_rq->last_load_update_tick = curr_jiffies;
6221 * We were idle, this means load 0, the current load might be
6222 * !0 due to remote wakeups and the sort.
6224 __update_cpu_load(this_rq, 0, pending_updates);
6226 raw_spin_unlock(&this_rq->lock);
6228 #endif /* CONFIG_NO_HZ */
6231 * Called from scheduler_tick()
6233 void update_cpu_load_active(struct rq *this_rq)
6235 unsigned long load = weighted_cpuload(cpu_of(this_rq));
6237 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
6239 this_rq->last_load_update_tick = jiffies;
6240 __update_cpu_load(this_rq, load, 1);
6244 * Return a low guess at the load of a migration-source cpu weighted
6245 * according to the scheduling class and "nice" value.
6247 * We want to under-estimate the load of migration sources, to
6248 * balance conservatively.
6250 static unsigned long source_load(int cpu, int type)
6252 struct rq *rq = cpu_rq(cpu);
6253 unsigned long total = weighted_cpuload(cpu);
6255 if (type == 0 || !sched_feat(LB_BIAS))
6258 return min(rq->cpu_load[type-1], total);
6262 * Return a high guess at the load of a migration-target cpu weighted
6263 * according to the scheduling class and "nice" value.
6265 static unsigned long target_load(int cpu, int type)
6267 struct rq *rq = cpu_rq(cpu);
6268 unsigned long total = weighted_cpuload(cpu);
6270 if (type == 0 || !sched_feat(LB_BIAS))
6273 return max(rq->cpu_load[type-1], total);
6277 static unsigned long cpu_avg_load_per_task(int cpu)
6279 struct rq *rq = cpu_rq(cpu);
6280 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
6281 unsigned long load_avg = weighted_cpuload(cpu);
6284 return load_avg / nr_running;
6289 static void record_wakee(struct task_struct *p)
6292 * Rough decay (wiping) for cost saving, don't worry
6293 * about the boundary; a really active task won't care
6296 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6297 current->wakee_flips >>= 1;
6298 current->wakee_flip_decay_ts = jiffies;
6301 if (current->last_wakee != p) {
6302 current->last_wakee = p;
6303 current->wakee_flips++;
6307 static void task_waking_fair(struct task_struct *p)
6309 struct sched_entity *se = &p->se;
6310 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6313 #ifndef CONFIG_64BIT
6314 u64 min_vruntime_copy;
6317 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6319 min_vruntime = cfs_rq->min_vruntime;
6320 } while (min_vruntime != min_vruntime_copy);
6322 min_vruntime = cfs_rq->min_vruntime;
6325 se->vruntime -= min_vruntime;
6329 #ifdef CONFIG_FAIR_GROUP_SCHED
6331 * effective_load() calculates the load change as seen from the root_task_group
6333 * Adding load to a group doesn't make a group heavier, but can cause movement
6334 * of group shares between cpus. Assuming the shares were perfectly aligned one
6335 * can calculate the shift in shares.
6337 * Calculate the effective load difference if @wl is added (subtracted) to @tg
6338 * on this @cpu and results in a total addition (subtraction) of @wg to the
6339 * total group weight.
6341 * Given a runqueue weight distribution (rw_i) we can compute a shares
6342 * distribution (s_i) using:
6344 * s_i = rw_i / \Sum rw_j (1)
6346 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
6347 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
6348 * shares distribution (s_i):
6350 * rw_i = { 2, 4, 1, 0 }
6351 * s_i = { 2/7, 4/7, 1/7, 0 }
6353 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
6354 * task used to run on and the CPU the waker is running on), we need to
6355 * compute the effect of waking a task on either CPU and, in case of a sync
6356 * wakeup, compute the effect of the current task going to sleep.
6358 * So for a change of @wl to the local @cpu with an overall group weight change
6359 * of @wl we can compute the new shares distribution (s'_i) using:
6361 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
6363 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
6364 * differences in waking a task to CPU 0. The additional task changes the
6365 * weight and shares distributions like:
6367 * rw'_i = { 3, 4, 1, 0 }
6368 * s'_i = { 3/8, 4/8, 1/8, 0 }
6370 * We can then compute the difference in effective weight by using:
6372 * dw_i = S * (s'_i - s_i) (3)
6374 * Where 'S' is the group weight as seen by its parent.
6376 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
6377 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
6378 * 4/7) times the weight of the group.
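/*
 * Checking the arithmetic of that example: 3/8 - 2/7 = (21 - 16)/56 = 5/56
 * and 4/8 - 4/7 = (28 - 32)/56 = -4/56, matching the figures quoted above.
 */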
6380 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6382 struct sched_entity *se = tg->se[cpu];
6384 if (!tg->parent) /* the trivial, non-cgroup case */
6387 for_each_sched_entity(se) {
6388 struct cfs_rq *cfs_rq = se->my_q;
6389 long W, w = cfs_rq_load_avg(cfs_rq);
6394 * W = @wg + \Sum rw_j
6396 W = wg + atomic_long_read(&tg->load_avg);
6398 /* Ensure \Sum rw_j >= rw_i */
6399 W -= cfs_rq->tg_load_avg_contrib;
6408 * wl = S * s'_i; see (2)
6411 wl = (w * (long)tg->shares) / W;
6416 * Per the above, wl is the new se->load.weight value; since
6417 * those are clipped to [MIN_SHARES, ...) do so now. See
6418 * calc_cfs_shares().
6420 if (wl < MIN_SHARES)
6424 * wl = dw_i = S * (s'_i - s_i); see (3)
6426 wl -= se->avg.load_avg;
6429 * Recursively apply this logic to all parent groups to compute
6430 * the final effective load change on the root group. Since
6431 * only the @tg group gets extra weight, all parent groups can
6432 * only redistribute existing shares. @wl is the shift in shares
6433 * resulting from this level per the above.
6442 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
6450 * Returns the current capacity of cpu after applying both
6451 * cpu and freq scaling.
6453 unsigned long capacity_curr_of(int cpu)
6455 return cpu_rq(cpu)->cpu_capacity_orig *
6456 arch_scale_freq_capacity(NULL, cpu)
6457 >> SCHED_CAPACITY_SHIFT;
6461 struct sched_group *sg_top;
6462 struct sched_group *sg_cap;
6470 struct task_struct *task;
6484 static int cpu_util_wake(int cpu, struct task_struct *p);
6487 * __cpu_norm_util() returns the cpu util relative to a specific capacity,
6488 * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
6489 * energy calculations.
6491 * Since util is a scale-invariant utilization defined as:
6493 * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
6495 * the normalized util can be found using the specific capacity.
6497 * capacity = capacity_orig * curr_freq/max_freq
6499 * norm_util = running_time/time ~ util/capacity
6501 static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
6503 if (util >= capacity)
6504 return SCHED_CAPACITY_SCALE;
6506 return (util << SCHED_CAPACITY_SHIFT)/capacity;
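/*
 * Example: util 256 on a capacity of 512 gives (256 << SCHED_CAPACITY_SHIFT)
 * / 512 = 512, i.e. a 50% busy ratio expressed on the 1024-point
 * SCHED_CAPACITY_SCALE.
 */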
6509 static unsigned long group_max_util(struct energy_env *eenv)
6511 unsigned long max_util = 0;
6515 for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
6516 util = cpu_util_wake(cpu, eenv->task);
6519 * If we are looking at the target CPU specified by the eenv,
6520 * then we should add the (estimated) utilization of the task
6521 * assuming we will wake it up on that CPU.
6523 if (unlikely(cpu == eenv->trg_cpu))
6524 util += eenv->util_delta;
6526 max_util = max(max_util, util);
6533 * group_norm_util() returns the approximated group util relative to its
6534 * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
6535 * in energy calculations.
6537 * Since task executions may or may not overlap in time in the group the true
6538 * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
6539 * when iterating over all CPUs in the group.
6540 * The latter estimate is used as it leads to a more pessimistic energy
6541 * estimate (more busy).
6544 long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
6546 unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
6547 unsigned long util, util_sum = 0;
6550 for_each_cpu(cpu, sched_group_cpus(sg)) {
6551 util = cpu_util_wake(cpu, eenv->task);
6554 * If we are looking at the target CPU specified by the eenv,
6555 * then we should add the (estimated) utilization of the task
6556 * assuming we will wake it up on that CPU.
6558 if (unlikely(cpu == eenv->trg_cpu))
6559 util += eenv->util_delta;
6561 util_sum += __cpu_norm_util(util, capacity);
6564 return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
6567 static int find_new_capacity(struct energy_env *eenv,
6568 const struct sched_group_energy * const sge)
6570 int idx, max_idx = sge->nr_cap_states - 1;
6571 unsigned long util = group_max_util(eenv);
6573 /* default is max_cap if we don't find a match */
6574 eenv->cap_idx = max_idx;
6576 for (idx = 0; idx < sge->nr_cap_states; idx++) {
6577 if (sge->cap_states[idx].cap >= util) {
6578 eenv->cap_idx = idx;
6583 return eenv->cap_idx;
6586 static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
6588 int i, state = INT_MAX;
6589 int src_in_grp, dst_in_grp;
6592 /* Find the shallowest idle state in the sched group. */
6593 for_each_cpu(i, sched_group_cpus(sg))
6594 state = min(state, idle_get_state_idx(cpu_rq(i)));
6596 /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
6599 src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
6600 dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
6601 if (src_in_grp == dst_in_grp) {
6602 /* both CPUs under consideration are in the same group or not in
6603 * either group, so migration should leave the idle state the same.
6609 * Try to estimate if a deeper idle state is
6610 * achievable when we move the task.
6612 for_each_cpu(i, sched_group_cpus(sg)) {
6613 grp_util += cpu_util_wake(i, eenv->task);
6614 if (unlikely(i == eenv->trg_cpu))
6615 grp_util += eenv->util_delta;
6619 ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
6620 /* after moving, this group is at most partly
6621 * occupied, so it should have some idle time.
6623 int max_idle_state_idx = sg->sge->nr_idle_states - 2;
6624 int new_state = grp_util * max_idle_state_idx;
6626 /* group will have no util, use lowest state */
6627 new_state = max_idle_state_idx + 1;
6629 /* for partially idle, linearly map util to idle
6630 * states, excluding the lowest one. This does not
6631 * correspond to the state we expect to enter in
6632 * reality, but gives an indication of what might happen.
6634 new_state = min(max_idle_state_idx, (int)
6635 (new_state / sg->sgc->max_capacity));
6636 new_state = max_idle_state_idx - new_state;
6640 /* After moving, the group will be fully occupied
6641 * so assume it will not be idle at all.
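/*
 * A rough numeric walk-through of the linear mapping above, with assumed
 * values (nr_idle_states = 3, hence max_idle_state_idx = 1, and
 * sgc->max_capacity = 1024):
 *
 *   grp_util = 0     ->  state = max_idle_state_idx + 1 = 2 (deepest listed)
 *   grp_util = 512   ->  512 * 1 / 1024 = 0   ->  state = 1 - 0 = 1
 *   grp_util = 1024  ->  min(1, 1024 / 1024)  ->  state = 1 - 1 = 0
 *
 * i.e. the more utilization the group is left with after the move, the
 * shallower the idle state we guess it can reach.
 */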
6650 * sched_group_energy(): Computes the absolute energy consumption of cpus
6651 * belonging to the sched_group including shared resources shared only by
6652 * members of the group. Iterates over all cpus in the hierarchy below the
6653 * sched_group starting from the bottom working its way up before going to
6654 * the next cpu until all cpus are covered at all levels. The current
6655 * implementation is likely to gather the same util statistics multiple times.
6656 * This can probably be done in a faster but more complex way.
6657 * Note: sched_group_energy() may fail when racing with sched_domain updates.
6659 static int sched_group_energy(struct energy_env *eenv)
6661 struct cpumask visit_cpus;
6662 u64 total_energy = 0;
6665 WARN_ON(!eenv->sg_top->sge);
6667 cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
6668 /* If a cpu is hotplugged in while we are in this function,
6669 * it does not appear in the existing visit_cpus mask
6670 * which came from the sched_group pointer of the
6671 * sched_domain pointed at by sd_ea for either the prev
6672 * or next cpu and was dereferenced in __energy_diff.
6673 * Since we will dereference sd_scs later as we iterate
6674 * through the CPUs we expect to visit, new CPUs can
6675 * be present which are not in the visit_cpus mask.
6676 * Guard this with cpu_count.
6678 cpu_count = cpumask_weight(&visit_cpus);
6680 while (!cpumask_empty(&visit_cpus)) {
6681 struct sched_group *sg_shared_cap = NULL;
6682 int cpu = cpumask_first(&visit_cpus);
6683 struct sched_domain *sd;
6686 * Is the group utilization affected by cpus outside this
6688 * This sd may have groups with cpus which were not present
6689 * when we took visit_cpus.
6691 sd = rcu_dereference(per_cpu(sd_scs, cpu));
6693 if (sd && sd->parent)
6694 sg_shared_cap = sd->parent->groups;
6696 for_each_domain(cpu, sd) {
6697 struct sched_group *sg = sd->groups;
6699 /* Has this sched_domain already been visited? */
6700 if (sd->child && group_first_cpu(sg) != cpu)
6704 unsigned long group_util;
6705 int sg_busy_energy, sg_idle_energy;
6706 int cap_idx, idle_idx;
6708 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
6709 eenv->sg_cap = sg_shared_cap;
6713 cap_idx = find_new_capacity(eenv, sg->sge);
6715 if (sg->group_weight == 1) {
6716 /* Remove capacity of src CPU (before task move) */
6717 if (eenv->trg_cpu == eenv->src_cpu &&
6718 cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
6719 eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
6720 eenv->cap.delta -= eenv->cap.before;
6722 /* Add capacity of dst CPU (after task move) */
6723 if (eenv->trg_cpu == eenv->dst_cpu &&
6724 cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
6725 eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
6726 eenv->cap.delta += eenv->cap.after;
6730 idle_idx = group_idle_state(eenv, sg);
6731 group_util = group_norm_util(eenv, sg);
6733 sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
6734 sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
6735 * sg->sge->idle_states[idle_idx].power);
6737 total_energy += sg_busy_energy + sg_idle_energy;
6741 * cpu_count here is the number of
6742 * cpus we expect to visit in this
6743 * calculation. If we race against
6744 * hotplug, we can have extra cpus
6745 * added to the groups we are
6746 * iterating which do not appear in
6747 * the visit_cpus mask. In that case
6748 * we are not able to calculate energy
6749 * without restarting so we will bail
6750 * out and use prev_cpu this time.
6754 cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
6758 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
6761 } while (sg = sg->next, sg != sd->groups);
6765 * If we raced with hotplug and got an sd NULL-pointer,
6766 * returning a wrong energy estimation is better than
6767 * entering an infinite loop.
6768 * Specifically: If a cpu is unplugged after we took
6769 * the visit_cpus mask, it no longer has an sd_scs
6770 * pointer, so when we dereference it, we get NULL.
6772 if (cpumask_test_cpu(cpu, &visit_cpus))
6775 cpumask_clear_cpu(cpu, &visit_cpus);
6779 eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
6783 static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
6785 return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
6788 static inline unsigned long task_util(struct task_struct *p);
6791 * energy_diff(): Estimate the energy impact of changing the utilization
6792 * distribution. eenv specifies the change: utilisation amount, source, and
6793 * destination cpu. Source or destination cpu may be -1 in which case the
6794 * utilization is removed from or added to the system (e.g. task wake-up). If
6795 * both are specified, the utilization is migrated.
6797 static inline int __energy_diff(struct energy_env *eenv)
6799 struct sched_domain *sd;
6800 struct sched_group *sg;
6801 int sd_cpu = -1, energy_before = 0, energy_after = 0;
6804 struct energy_env eenv_before = {
6805 .util_delta = task_util(eenv->task),
6806 .src_cpu = eenv->src_cpu,
6807 .dst_cpu = eenv->dst_cpu,
6808 .trg_cpu = eenv->src_cpu,
6809 .nrg = { 0, 0, 0, 0},
6814 if (eenv->src_cpu == eenv->dst_cpu)
6817 sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
6818 sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
6821 return 0; /* Error */
6826 if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
6827 eenv_before.sg_top = eenv->sg_top = sg;
6829 if (sched_group_energy(&eenv_before))
6830 return 0; /* Invalid result abort */
6831 energy_before += eenv_before.energy;
6833 /* Keep track of SRC cpu (before) capacity */
6834 eenv->cap.before = eenv_before.cap.before;
6835 eenv->cap.delta = eenv_before.cap.delta;
6837 if (sched_group_energy(eenv))
6838 return 0; /* Invalid result abort */
6839 energy_after += eenv->energy;
6841 } while (sg = sg->next, sg != sd->groups);
6843 eenv->nrg.before = energy_before;
6844 eenv->nrg.after = energy_after;
6845 eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
6847 #ifndef CONFIG_SCHED_TUNE
6848 trace_sched_energy_diff(eenv->task,
6849 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6850 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6851 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6852 eenv->nrg.delta, eenv->payoff);
6855 * Dead-zone margin preventing too many migrations.
6858 margin = eenv->nrg.before >> 6; /* ~1.56% */
6860 diff = eenv->nrg.after - eenv->nrg.before;
6862 eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
6864 return eenv->nrg.diff;
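/*
 * Example of the dead-zone above (numbers assumed): with nrg.before = 6400
 * the margin is 6400 >> 6 = 100 (~1.56%). An estimated delta of, say, 60
 * energy units falls inside the margin, so nrg.diff is reported as 0 and
 * the candidate move is treated as energy-neutral instead of justifying a
 * migration.
 */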
6867 #ifdef CONFIG_SCHED_TUNE
6869 struct target_nrg schedtune_target_nrg;
6871 #ifdef CONFIG_CGROUP_SCHEDTUNE
6872 extern bool schedtune_initialized;
6873 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6876 * System energy normalization
6877 * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
6878 * corresponding to the specified energy variation.
6881 normalize_energy(int energy_diff)
6885 #ifdef CONFIG_CGROUP_SCHEDTUNE
6886 /* during early setup, we don't know the extents */
6887 if (unlikely(!schedtune_initialized))
6888 return energy_diff < 0 ? -1 : 1 ;
6889 #endif /* CONFIG_CGROUP_SCHEDTUNE */
6891 #ifdef CONFIG_SCHED_DEBUG
6895 /* Check for boundaries */
6896 max_delta = schedtune_target_nrg.max_power;
6897 max_delta -= schedtune_target_nrg.min_power;
6898 WARN_ON(abs(energy_diff) >= max_delta);
6902 /* Do scaling using positive numbers to increase the range */
6903 normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
6905 /* Scale by energy magnitude */
6906 normalized_nrg <<= SCHED_CAPACITY_SHIFT;
6908 /* Normalize on max energy for target platform */
6909 normalized_nrg = reciprocal_divide(
6910 normalized_nrg, schedtune_target_nrg.rdiv);
6912 return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
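/*
 * Worked example (values assumed; rdiv is taken to encode division by the
 * platform's max_power - min_power range): for energy_diff = 50 and a
 * range of 800,
 *
 *   normalized = (50 << SCHED_CAPACITY_SHIFT) / 800 = 51200 / 800 = 64
 *
 * with the sign of the original diff put back on the result.
 */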
6916 energy_diff(struct energy_env *eenv)
6918 int boost = schedtune_task_boost(eenv->task);
6921 /* Compute "absolute" energy diff */
6922 __energy_diff(eenv);
6924 /* Return energy diff when boost margin is 0 */
6926 trace_sched_energy_diff(eenv->task,
6927 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6928 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6929 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6930 0, -eenv->nrg.diff);
6931 return eenv->nrg.diff;
6934 /* Compute normalized energy diff */
6935 nrg_delta = normalize_energy(eenv->nrg.diff);
6936 eenv->nrg.delta = nrg_delta;
6938 eenv->payoff = schedtune_accept_deltas(
6943 trace_sched_energy_diff(eenv->task,
6944 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
6945 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
6946 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
6947 eenv->nrg.delta, eenv->payoff);
6950 * When SchedTune is enabled, the energy_diff() function will return
6951 * the computed energy payoff value. Since the energy_diff() return
6952 * value is expected to be negative by its callers, this evaluation
6953 * function returns a negative value each time the evaluation returns a
6954 * positive payoff, which is the condition for the acceptance of
6955 * a scheduling decision.
6957 return -eenv->payoff;
6959 #else /* CONFIG_SCHED_TUNE */
6960 #define energy_diff(eenv) __energy_diff(eenv)
6964 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
6965 * A waker of many should wake a different task than the one last awakened
6966 * at a frequency roughly N times higher than one of its wakees. In order
6967 * to determine whether we should let the load spread vs consolidating to
6968 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
6969 * partner, and a factor of llc_size higher frequency in the other. With
6970 * both conditions met, we can be relatively sure that the relationship is
6971 * non-monogamous, with partner count exceeding socket size. Waker/wakee
6972 * being client/server, worker/dispatcher, interrupt source or whatever is
6973 * irrelevant; the spread criterion is simply that the apparent partner count exceeds the socket size.
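/*
 * Worked example of the flip heuristic (numbers assumed, llc_size = 4): a
 * dispatcher with wakee_flips = 40 waking a worker with wakee_flips = 6
 * satisfies both slave >= llc_size (6 >= 4) and master >= slave * llc_size
 * (40 >= 24), so the wakeup is treated as "wide" and we prefer spreading
 * over pulling the wakee onto the waker's LLC. A 1:1 pair with low flip
 * counts fails the first test and remains eligible for affine wakeups.
 */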
6975 static int wake_wide(struct task_struct *p, int sibling_count_hint)
6977 unsigned int master = current->wakee_flips;
6978 unsigned int slave = p->wakee_flips;
6979 int llc_size = this_cpu_read(sd_llc_size);
6981 if (sibling_count_hint >= llc_size)
6985 swap(master, slave);
6986 if (slave < llc_size || master < slave * llc_size)
6991 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6992 int prev_cpu, int sync)
6994 s64 this_load, load;
6995 s64 this_eff_load, prev_eff_load;
6997 struct task_group *tg;
6998 unsigned long weight;
7002 this_cpu = smp_processor_id();
7003 load = source_load(prev_cpu, idx);
7004 this_load = target_load(this_cpu, idx);
7007 * If sync wakeup then subtract the (maximum possible)
7008 * effect of the currently running task from the load
7009 * of the current CPU:
7012 tg = task_group(current);
7013 weight = current->se.avg.load_avg;
7015 this_load += effective_load(tg, this_cpu, -weight, -weight);
7016 load += effective_load(tg, prev_cpu, 0, -weight);
7020 weight = p->se.avg.load_avg;
7023 * In low-load situations, where prev_cpu is idle and this_cpu is idle
7024 * due to the sync cause above having dropped this_load to 0, we'll
7025 * always have an imbalance, but there's really nothing you can do
7026 * about that, so that's good too.
7028 * Otherwise check if either cpus are near enough in load to allow this
7029 * task to be woken on this_cpu.
7031 this_eff_load = 100;
7032 this_eff_load *= capacity_of(prev_cpu);
7034 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
7035 prev_eff_load *= capacity_of(this_cpu);
7037 if (this_load > 0) {
7038 this_eff_load *= this_load +
7039 effective_load(tg, this_cpu, weight, weight);
7041 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
7044 balanced = this_eff_load <= prev_eff_load;
7046 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
7051 schedstat_inc(sd, ttwu_move_affine);
7052 schedstat_inc(p, se.statistics.nr_wakeups_affine);
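/*
 * Sketch of the comparison above with assumed numbers (equal CPU
 * capacities of 1024, imbalance_pct = 125, effective_load() group terms
 * ignored):
 *
 *   this_eff_load = 100 * 1024 * this_load
 *   prev_eff_load = (100 + 25/2) * 1024 * prev_load = 112 * 1024 * prev_load
 *
 * so this_load = 220 vs prev_load = 200 gives 22000 <= 22400, i.e.
 * "balanced", and the affine wakeup onto this_cpu is allowed.
 */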
7057 static inline unsigned long task_util(struct task_struct *p)
7059 return p->se.avg.util_avg;
7062 static inline unsigned long boosted_task_util(struct task_struct *task);
7064 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
7066 unsigned long capacity = capacity_of(cpu);
7068 util += boosted_task_util(p);
7070 return (capacity * 1024) > (util * capacity_margin);
7073 static inline bool task_fits_max(struct task_struct *p, int cpu)
7075 unsigned long capacity = capacity_of(cpu);
7076 unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7078 if (capacity == max_capacity)
7081 if (capacity * capacity_margin > max_capacity * 1024)
7084 return __task_fits(p, cpu, 0);
7087 static bool __cpu_overutilized(int cpu, int delta)
7089 return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
7092 static bool cpu_overutilized(int cpu)
7094 return __cpu_overutilized(cpu, 0);
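/*
 * Worked example of the margin checks above, assuming capacity_margin =
 * 1280 (i.e. requiring ~25% headroom; the actual value is defined
 * elsewhere in this file):
 *
 *   capacity = 512, util = 400:  512 * 1024 = 524288 > 400 * 1280 = 512000  -> fits
 *   capacity = 512, util = 420:  524288 < 420 * 1280 = 537600               -> overutilized
 */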
7097 #ifdef CONFIG_SCHED_TUNE
7099 struct reciprocal_value schedtune_spc_rdiv;
7102 schedtune_margin(unsigned long signal, long boost)
7104 long long margin = 0;
7107 * Signal proportional compensation (SPC)
7109 * The Boost (B) value is used to compute a Margin (M) which is
7110 * proportional to the complement of the original Signal (S):
7111 * M = B * (SCHED_CAPACITY_SCALE - S)
7112 * The obtained M could be used by the caller to "boost" S.
7115 margin = SCHED_CAPACITY_SCALE - signal;
7118 margin = -signal * boost;
7120 margin = reciprocal_divide(margin, schedtune_spc_rdiv);
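/*
 * Worked example for the boost >= 0 case above (values assumed; boost is
 * a percentage and schedtune_spc_rdiv is taken to encode division by 100):
 *
 *   S = 256, B = 50:  M = 50 * (1024 - 256) / 100 = 384
 *
 * so a boosted task with util 256 is treated as if it had util
 * 256 + 384 = 640, nudging placement and OPP selection towards higher
 * capacity.
 */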
7128 schedtune_cpu_margin(unsigned long util, int cpu)
7130 int boost = schedtune_cpu_boost(cpu);
7135 return schedtune_margin(util, boost);
7139 schedtune_task_margin(struct task_struct *task)
7141 int boost = schedtune_task_boost(task);
7148 util = task_util(task);
7149 margin = schedtune_margin(util, boost);
7154 #else /* CONFIG_SCHED_TUNE */
7157 schedtune_cpu_margin(unsigned long util, int cpu)
7163 schedtune_task_margin(struct task_struct *task)
7168 #endif /* CONFIG_SCHED_TUNE */
7171 boosted_cpu_util(int cpu)
7173 unsigned long util = cpu_util_freq(cpu);
7174 long margin = schedtune_cpu_margin(util, cpu);
7176 trace_sched_boost_cpu(cpu, util, margin);
7178 return util + margin;
7181 static inline unsigned long
7182 boosted_task_util(struct task_struct *task)
7184 unsigned long util = task_util(task);
7185 long margin = schedtune_task_margin(task);
7187 trace_sched_boost_task(task, util, margin);
7189 return util + margin;
7192 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
7194 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
7198 * find_idlest_group finds and returns the least busy CPU group within the
7201 * Assumes p is allowed on at least one CPU in sd.
7203 static struct sched_group *
7204 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
7205 int this_cpu, int sd_flag)
7207 struct sched_group *idlest = NULL, *group = sd->groups;
7208 struct sched_group *most_spare_sg = NULL;
7209 unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
7210 unsigned long most_spare = 0, this_spare = 0;
7211 int load_idx = sd->forkexec_idx;
7212 int imbalance = 100 + (sd->imbalance_pct-100)/2;
7214 if (sd_flag & SD_BALANCE_WAKE)
7215 load_idx = sd->wake_idx;
7218 unsigned long load, avg_load, spare_cap, max_spare_cap;
7222 /* Skip over this group if it has no CPUs allowed */
7223 if (!cpumask_intersects(sched_group_cpus(group),
7224 tsk_cpus_allowed(p)))
7227 local_group = cpumask_test_cpu(this_cpu,
7228 sched_group_cpus(group));
7231 * Tally up the load of all CPUs in the group and find
7232 * the group containing the CPU with most spare capacity.
7237 for_each_cpu(i, sched_group_cpus(group)) {
7238 /* Bias balancing toward cpus of our domain */
7240 load = source_load(i, load_idx);
7242 load = target_load(i, load_idx);
7246 spare_cap = capacity_spare_wake(i, p);
7248 if (spare_cap > max_spare_cap)
7249 max_spare_cap = spare_cap;
7252 /* Adjust by relative CPU capacity of the group */
7253 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
7256 this_load = avg_load;
7257 this_spare = max_spare_cap;
7259 if (avg_load < min_load) {
7260 min_load = avg_load;
7264 if (most_spare < max_spare_cap) {
7265 most_spare = max_spare_cap;
7266 most_spare_sg = group;
7269 } while (group = group->next, group != sd->groups);
7272 * The cross-over point between using spare capacity or least load
7273 * is too conservative for high utilization tasks on partially
7274 * utilized systems if we require spare_capacity > task_util(p),
7275 * so we allow for some task stuffing by using
7276 * spare_capacity > task_util(p)/2.
7278 * Spare capacity can't be used for fork because the utilization has
7279 * not been set yet, we must first select a rq to compute the initial
7282 if (sd_flag & SD_BALANCE_FORK)
7285 if (this_spare > task_util(p) / 2 &&
7286 imbalance*this_spare > 100*most_spare)
7288 else if (most_spare > task_util(p) / 2)
7289 return most_spare_sg;
7292 if (!idlest || 100*this_load < imbalance*min_load)
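/*
 * Worked example of the spare-capacity crossover above (numbers assumed,
 * imbalance = 112): with task_util(p) = 200, this_spare = 150 and
 * most_spare = 160, the local group wins because 150 > 200/2 and
 * 112 * 150 = 16800 > 100 * 160 = 16000; only a clearly larger remote
 * spare capacity (or a much lower min_load) overrides the local choice.
 */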
7298 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
7301 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7303 unsigned long load, min_load = ULONG_MAX;
7304 unsigned int min_exit_latency = UINT_MAX;
7305 u64 latest_idle_timestamp = 0;
7306 int least_loaded_cpu = this_cpu;
7307 int shallowest_idle_cpu = -1;
7310 /* Check if we have any choice: */
7311 if (group->group_weight == 1)
7312 return cpumask_first(sched_group_cpus(group));
7314 /* Traverse only the allowed CPUs */
7315 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
7317 struct rq *rq = cpu_rq(i);
7318 struct cpuidle_state *idle = idle_get_state(rq);
7319 if (idle && idle->exit_latency < min_exit_latency) {
7321 * We give priority to a CPU whose idle state
7322 * has the smallest exit latency irrespective
7323 * of any idle timestamp.
7325 min_exit_latency = idle->exit_latency;
7326 latest_idle_timestamp = rq->idle_stamp;
7327 shallowest_idle_cpu = i;
7328 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7329 rq->idle_stamp > latest_idle_timestamp) {
7331 * If equal or no active idle state, then
7332 * the most recently idled CPU might have
7335 latest_idle_timestamp = rq->idle_stamp;
7336 shallowest_idle_cpu = i;
7338 } else if (shallowest_idle_cpu == -1) {
7339 load = weighted_cpuload(i);
7340 if (load < min_load || (load == min_load && i == this_cpu)) {
7342 least_loaded_cpu = i;
7347 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7350 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
7351 int cpu, int prev_cpu, int sd_flag)
7354 int wu = sd_flag & SD_BALANCE_WAKE;
7358 schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
7359 schedstat_inc(this_rq(), eas_stats.cas_attempts);
7362 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
7366 struct sched_group *group;
7367 struct sched_domain *tmp;
7371 schedstat_inc(sd, eas_stats.cas_attempts);
7373 if (!(sd->flags & sd_flag)) {
7378 group = find_idlest_group(sd, p, cpu, sd_flag);
7384 new_cpu = find_idlest_group_cpu(group, p, cpu);
7385 if (new_cpu == cpu) {
7386 /* Now try balancing at a lower domain level of cpu */
7391 /* Now try balancing at a lower domain level of new_cpu */
7392 cpu = cas_cpu = new_cpu;
7393 weight = sd->span_weight;
7395 for_each_domain(cpu, tmp) {
7396 if (weight <= tmp->span_weight)
7398 if (tmp->flags & sd_flag)
7401 /* while loop will break here if sd == NULL */
7404 if (wu && (cas_cpu >= 0)) {
7405 schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
7406 schedstat_inc(this_rq(), eas_stats.cas_count);
7413 * Try and locate an idle CPU in the sched_domain.
7415 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7417 struct sched_domain *sd;
7418 struct sched_group *sg;
7419 int best_idle_cpu = -1;
7420 int best_idle_cstate = INT_MAX;
7421 unsigned long best_idle_capacity = ULONG_MAX;
7423 schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
7424 schedstat_inc(this_rq(), eas_stats.sis_attempts);
7426 if (!sysctl_sched_cstate_aware) {
7427 if (idle_cpu(target)) {
7428 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
7429 schedstat_inc(this_rq(), eas_stats.sis_idle);
7434 * If the previous cpu is cache affine and idle, don't be stupid.
7436 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
7437 schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
7438 schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
7443 if (!(current->flags & PF_WAKE_UP_IDLE) &&
7444 !(p->flags & PF_WAKE_UP_IDLE))
7448 * Otherwise, iterate the domains and find an eligible idle cpu.
7450 sd = rcu_dereference(per_cpu(sd_llc, target));
7451 for_each_lower_domain(sd) {
7455 if (!cpumask_intersects(sched_group_cpus(sg),
7456 tsk_cpus_allowed(p)))
7459 if (sysctl_sched_cstate_aware) {
7460 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7461 int idle_idx = idle_get_state_idx(cpu_rq(i));
7462 unsigned long new_usage = boosted_task_util(p);
7463 unsigned long capacity_orig = capacity_orig_of(i);
7465 if (new_usage > capacity_orig || !idle_cpu(i))
7468 if (i == target && new_usage <= capacity_curr_of(target)) {
7469 schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
7470 schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
7471 schedstat_inc(sd, eas_stats.sis_suff_cap);
7475 if (idle_idx < best_idle_cstate &&
7476 capacity_orig <= best_idle_capacity) {
7478 best_idle_cstate = idle_idx;
7479 best_idle_capacity = capacity_orig;
7483 for_each_cpu(i, sched_group_cpus(sg)) {
7484 if (i == target || !idle_cpu(i))
7488 target = cpumask_first_and(sched_group_cpus(sg),
7489 tsk_cpus_allowed(p));
7490 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
7491 schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
7492 schedstat_inc(sd, eas_stats.sis_idle_cpu);
7497 } while (sg != sd->groups);
7500 if (best_idle_cpu >= 0)
7501 target = best_idle_cpu;
7504 schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
7505 schedstat_inc(this_rq(), eas_stats.sis_count);
7511 * cpu_util_wake: Compute cpu utilization with any contributions from
7512 * the waking task p removed. check_for_migration() looks for a better CPU for
7513 * rq->curr. For that case we should return cpu util with contributions from
7514 * currently running task p removed.
7516 static int cpu_util_wake(int cpu, struct task_struct *p)
7518 unsigned long util, capacity;
7520 #ifdef CONFIG_SCHED_WALT
7522 * WALT does not decay idle tasks in the same manner
7523 * as PELT, so it makes little sense to subtract task
7524 * utilization from cpu utilization. Instead just use
7525 * cpu_util for this case.
7527 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7528 p->state == TASK_WAKING)
7529 return cpu_util(cpu);
7531 /* Task has no contribution or is new */
7532 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
7533 return cpu_util(cpu);
7535 capacity = capacity_orig_of(cpu);
7536 util = max_t(long, cpu_util(cpu) - task_util(p), 0);
7538 return (util >= capacity) ? capacity : util;
7541 static int start_cpu(bool boosted)
7543 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7545 return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
7548 static inline int find_best_target(struct task_struct *p, int *backup_cpu,
7549 bool boosted, bool prefer_idle)
7551 unsigned long best_idle_min_cap_orig = ULONG_MAX;
7552 unsigned long min_util = boosted_task_util(p);
7553 unsigned long target_capacity = ULONG_MAX;
7554 unsigned long min_wake_util = ULONG_MAX;
7555 unsigned long target_max_spare_cap = 0;
7556 unsigned long best_active_util = ULONG_MAX;
7557 int best_idle_cstate = INT_MAX;
7558 struct sched_domain *sd;
7559 struct sched_group *sg;
7560 int best_active_cpu = -1;
7561 int best_idle_cpu = -1;
7562 int target_cpu = -1;
7567 schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
7568 schedstat_inc(this_rq(), eas_stats.fbt_attempts);
7570 /* Find start CPU based on boost value */
7571 cpu = start_cpu(boosted);
7573 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
7574 schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
7578 /* Find SD for the start CPU */
7579 sd = rcu_dereference(per_cpu(sd_ea, cpu));
7581 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
7582 schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
7586 /* Scan CPUs in all SDs */
7589 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
7590 unsigned long capacity_curr = capacity_curr_of(i);
7591 unsigned long capacity_orig = capacity_orig_of(i);
7592 unsigned long wake_util, new_util;
7597 if (walt_cpu_high_irqload(i))
7601 * p's blocked utilization is still accounted for on prev_cpu
7602 * so prev_cpu will receive a negative bias due to the double
7603 * accounting. However, the blocked utilization may be zero.
7605 wake_util = cpu_util_wake(i, p);
7606 new_util = wake_util + task_util(p);
7609 * Ensure minimum capacity to grant the required boost.
7610 * The target CPU can be already at a capacity level higher
7611 * than the one required to boost the task.
7613 new_util = max(min_util, new_util);
7614 if (new_util > capacity_orig)
7618 * Case A) Latency sensitive tasks
7620 * Unconditionally favoring tasks that prefer idle CPU to
7624 * - an idle CPU, whatever its idle_state is, since
7625 * the first CPUs we explore are more likely to be
7626 * reserved for latency sensitive tasks.
7627 * - a non idle CPU where the task fits in its current
7628 * capacity and has the maximum spare capacity.
7629 * - a non idle CPU with lower contention from other
7630 * tasks and running at the lowest possible OPP.
7632 * The last two goals try to favor a non idle CPU
7633 * where the task can run as if it is "almost alone".
7634 * A maximum spare capacity CPU is favoured since
7635 * the task already fits into that CPU's capacity
7636 * without waiting for an OPP chance.
7638 * The following code path is the only one in the CPUs
7639 * exploration loop which is always used by
7640 * prefer_idle tasks. It exits the loop with either a
7641 * best_active_cpu or a target_cpu which should
7642 * represent an optimal choice for latency sensitive
7648 * Case A.1: IDLE CPU
7649 * Return the first IDLE CPU we find.
7652 schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
7653 schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
7655 trace_sched_find_best_target(p,
7656 prefer_idle, min_util,
7658 best_active_cpu, i);
7664 * Case A.2: Target ACTIVE CPU
7665 * Favor CPUs with max spare capacity.
7667 if ((capacity_curr > new_util) &&
7668 (capacity_orig - new_util > target_max_spare_cap)) {
7669 target_max_spare_cap = capacity_orig - new_util;
7673 if (target_cpu != -1)
7678 * Case A.3: Backup ACTIVE CPU
7680 * - lower utilization due to other tasks
7681 * - lower utilization with the task in
7683 if (wake_util > min_wake_util)
7685 if (new_util > best_active_util)
7687 min_wake_util = wake_util;
7688 best_active_util = new_util;
7689 best_active_cpu = i;
7696 * For non latency sensitive tasks, skip CPUs that
7697 * will be overutilized by moving the task there.
7699 * The goal here is to remain in EAS mode as long as
7700 * possible at least for !prefer_idle tasks.
7702 if ((new_util * capacity_margin) >
7703 (capacity_orig * SCHED_CAPACITY_SCALE))
7707 * Case B) Non latency sensitive tasks on IDLE CPUs.
7709 * Find an optimal backup IDLE CPU for non latency
7713 * - minimizing the capacity_orig,
7714 * i.e. preferring LITTLE CPUs
7715 * - favoring shallowest idle states
7716 * i.e. avoid waking up deep-idle CPUs
7718 * The following code path is used by non latency
7719 * sensitive tasks if IDLE CPUs are available. If at
7720 * least one such CPU is available, it sets the
7721 * best_idle_cpu to the most suitable idle CPU to be
7724 * If idle CPUs are available, favour these CPUs to
7725 * improve performance by spreading tasks.
7726 * Indeed, the energy_diff() computed by the caller
7727 * will take care to ensure the minimization of energy
7728 * consumption without affecting performance.
7731 int idle_idx = idle_get_state_idx(cpu_rq(i));
7733 /* Select idle CPU with lower cap_orig */
7734 if (capacity_orig > best_idle_min_cap_orig)
7738 * Skip CPUs in deeper idle state, but only
7739 * if they are also less energy efficient.
7740 * IOW, prefer a deep IDLE LITTLE CPU vs a
7741 * shallow idle big CPU.
7743 if (sysctl_sched_cstate_aware &&
7744 best_idle_cstate <= idle_idx)
7747 /* Keep track of best idle CPU */
7748 best_idle_min_cap_orig = capacity_orig;
7749 best_idle_cstate = idle_idx;
7755 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
7757 * Pack tasks in the most energy efficient capacities.
7759 * This task packing strategy prefers more energy
7760 * efficient CPUs (i.e. pack on smaller maximum
7761 * capacity CPUs) while also trying to spread tasks to
7762 * run them all at the lower OPP.
7764 * This assumes for example that it's more energy
7765 * efficient to run two tasks on two CPUs at a lower
7766 * OPP than packing both on a single CPU but running
7767 * that CPU at a higher OPP.
7769 * Thus, this case keeps track of the CPU with the
7770 * smallest maximum capacity and highest spare maximum
7774 /* Favor CPUs with smaller capacity */
7775 if (capacity_orig > target_capacity)
7778 /* Favor CPUs with maximum spare capacity */
7779 if ((capacity_orig - new_util) < target_max_spare_cap)
7782 target_max_spare_cap = capacity_orig - new_util;
7783 target_capacity = capacity_orig;
7787 } while (sg = sg->next, sg != sd->groups);
7790 * For non latency sensitive tasks, cases B and C in the previous loop,
7791 * we pick the best IDLE CPU only if we were not able to find a target
7794 * Policies priorities:
7796 * - prefer_idle tasks:
7798 * a) IDLE CPU available, we return immediately
7799 * b) ACTIVE CPU where task fits and has the largest maximum spare
7800 * capacity (i.e. target_cpu)
7801 * c) ACTIVE CPU with less contention due to other tasks
7802 * (i.e. best_active_cpu)
7804 * - NON prefer_idle tasks:
7806 * a) ACTIVE CPU: target_cpu
7807 * b) IDLE CPU: best_idle_cpu
7809 if (target_cpu == -1)
7810 target_cpu = prefer_idle
7814 *backup_cpu = prefer_idle
7818 trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
7819 best_idle_cpu, best_active_cpu,
7822 schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
7823 schedstat_inc(this_rq(), eas_stats.fbt_count);
7829 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
7830 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
7832 * In that case WAKE_AFFINE doesn't make sense and we'll let
7833 * BALANCE_WAKE sort things out.
7835 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
7837 long min_cap, max_cap;
7839 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
7840 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7842 /* Minimum capacity is close to max, no need to abort wake_affine */
7843 if (max_cap - min_cap < max_cap >> 3)
7846 /* Bring task utilization in sync with prev_cpu */
7847 sync_entity_load_avg(&p->se);
7849 return min_cap * 1024 < task_util(p) * capacity_margin;
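/*
 * Worked example (values assumed, capacity_margin = 1280): on an
 * asymmetric system with min_cap = 512 and max_cap = 1024 the gap of 512
 * exceeds max_cap >> 3 = 128, so the utilization check applies. A task
 * with util 350 still passes (350 * 1280 = 448000 < 512 * 1024 = 524288)
 * and wake_affine is kept; util 450 fails (576000 > 524288) and we fall
 * back to the capacity-aware slow path.
 */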
7852 static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
7854 struct sched_domain *sd;
7855 int target_cpu = prev_cpu, tmp_target, tmp_backup;
7856 bool boosted, prefer_idle;
7858 schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
7859 schedstat_inc(this_rq(), eas_stats.secb_attempts);
7861 if (sysctl_sched_sync_hint_enable && sync) {
7862 int cpu = smp_processor_id();
7864 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
7865 schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
7866 schedstat_inc(this_rq(), eas_stats.secb_sync);
7872 #ifdef CONFIG_CGROUP_SCHEDTUNE
7873 boosted = schedtune_task_boost(p) > 0;
7874 prefer_idle = schedtune_prefer_idle(p) > 0;
7876 boosted = get_sysctl_sched_cfs_boost() > 0;
7880 sync_entity_load_avg(&p->se);
7882 sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
7883 /* Find a cpu with sufficient capacity */
7884 tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
7888 if (tmp_target >= 0) {
7889 target_cpu = tmp_target;
7890 if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
7891 schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
7892 schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
7897 if (target_cpu != prev_cpu) {
7899 struct energy_env eenv = {
7900 .util_delta = task_util(p),
7901 .src_cpu = prev_cpu,
7902 .dst_cpu = target_cpu,
7904 .trg_cpu = target_cpu,
7908 #ifdef CONFIG_SCHED_WALT
7909 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7910 p->state == TASK_WAKING)
7911 delta = task_util(p);
7913 /* Not enough spare capacity on previous cpu */
7914 if (__cpu_overutilized(prev_cpu, delta)) {
7915 schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
7916 schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
7920 if (energy_diff(&eenv) >= 0) {
7921 /* No energy saving for target_cpu, try backup */
7922 target_cpu = tmp_backup;
7923 eenv.dst_cpu = target_cpu;
7924 eenv.trg_cpu = target_cpu;
7925 if (tmp_backup < 0 ||
7926 tmp_backup == prev_cpu ||
7927 energy_diff(&eenv) >= 0) {
7928 schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
7929 schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
7930 target_cpu = prev_cpu;
7935 schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
7936 schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
7940 schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
7941 schedstat_inc(this_rq(), eas_stats.secb_count);
7950 * select_task_rq_fair: Select target runqueue for the waking task in domains
7951 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
7952 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
7954 * Balances load by selecting the idlest cpu in the idlest group, or under
7955 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
7957 * Returns the target cpu number.
7959 * preempt must be disabled.
7962 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7963 int sibling_count_hint)
7965 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
7966 int cpu = smp_processor_id();
7967 int new_cpu = prev_cpu;
7968 int want_affine = 0;
7969 int sync = wake_flags & WF_SYNC;
7971 #ifdef CONFIG_SCHED_HMP
7972 return select_best_cpu(p, prev_cpu, 0, sync);
7975 if (sd_flag & SD_BALANCE_WAKE) {
7977 want_affine = !wake_wide(p, sibling_count_hint) &&
7978 !wake_cap(p, cpu, prev_cpu) &&
7979 cpumask_test_cpu(cpu, &p->cpus_allowed);
7982 if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
7983 return select_energy_cpu_brute(p, prev_cpu, sync);
7986 for_each_domain(cpu, tmp) {
7987 if (!(tmp->flags & SD_LOAD_BALANCE))
7991 * If both cpu and prev_cpu are part of this domain,
7992 * cpu is a valid SD_WAKE_AFFINE target.
7994 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
7995 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8000 if (tmp->flags & sd_flag)
8002 else if (!want_affine)
8007 sd = NULL; /* Prefer wake_affine over balance flags */
8008 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
8012 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
8014 * We're going to need the task's util for capacity_spare_wake
8015 * in find_idlest_group. Sync it up to prev_cpu's
8018 sync_entity_load_avg(&p->se);
8022 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
8023 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8026 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
8034 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
8035 * cfs_rq_of(p) references at time of call are still valid and identify the
8036 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
8037 * other assumptions, including the state of rq->lock, should be made.
8039 static void migrate_task_rq_fair(struct task_struct *p)
8042 * We are supposed to update the task to "current" time, then it's up to date
8043 * and ready to go to a new CPU/cfs_rq. But we have difficulty in getting
8044 * what the current time is, so simply throw away the out-of-date time. This
8045 * will result in the wakee task being less decayed, but giving the wakee more
8046 * load does not sound bad.
8048 remove_entity_load_avg(&p->se);
8050 /* Tell new CPU we are migrated */
8051 p->se.avg.last_update_time = 0;
8053 /* We have migrated, no longer consider this task hot */
8054 p->se.exec_start = 0;
8057 static void task_dead_fair(struct task_struct *p)
8059 remove_entity_load_avg(&p->se);
8062 #define task_fits_max(p, cpu) true
8063 #endif /* CONFIG_SMP */
8065 static unsigned long
8066 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
8068 unsigned long gran = sysctl_sched_wakeup_granularity;
8071 * Since it's curr that is running now, convert the gran from real-time
8072 * to virtual-time in its units.
8074 * By using 'se' instead of 'curr' we penalize light tasks, so
8075 * they get preempted easier. That is, if 'se' < 'curr' then
8076 * the resulting gran will be larger, therefore penalizing the
8077 * lighter, if otoh 'se' > 'curr' then the resulting gran will
8078 * be smaller, again penalizing the lighter task.
8080 * This is especially important for buddies when the leftmost
8081 * task is higher priority than the buddy.
8083 return calc_delta_fair(gran, se);
8087 * Should 'se' preempt 'curr'.
8101 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
8103 s64 gran, vdiff = curr->vruntime - se->vruntime;
8108 gran = wakeup_gran(curr, se);
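/*
 * Worked example (values assumed): for a nice-0 wakee calc_delta_fair()
 * leaves the granularity unchanged, so with a 1ms wakeup granularity a
 * vruntime lead of vdiff = 1.5ms lets the wakee preempt (vdiff > gran),
 * while vdiff = 0.5ms does not, avoiding over-scheduling. A lighter
 * (higher nice) wakee gets a larger scaled gran and therefore needs a
 * bigger vruntime lead before it may preempt.
 */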
8115 static void set_last_buddy(struct sched_entity *se)
8117 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8120 for_each_sched_entity(se)
8121 cfs_rq_of(se)->last = se;
8124 static void set_next_buddy(struct sched_entity *se)
8126 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
8129 for_each_sched_entity(se)
8130 cfs_rq_of(se)->next = se;
8133 static void set_skip_buddy(struct sched_entity *se)
8135 for_each_sched_entity(se)
8136 cfs_rq_of(se)->skip = se;
8140 * Preempt the current task with a newly woken task if needed:
8142 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
8144 struct task_struct *curr = rq->curr;
8145 struct sched_entity *se = &curr->se, *pse = &p->se;
8146 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8147 int scale = cfs_rq->nr_running >= sched_nr_latency;
8148 int next_buddy_marked = 0;
8150 if (unlikely(se == pse))
8154 * This is possible from callers such as attach_tasks(), in which we
8155 * unconditionally check_preempt_curr() after an enqueue (which may have
8156 * led to a throttle). This both saves work and prevents false
8157 * next-buddy nomination below.
8159 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8162 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
8163 set_next_buddy(pse);
8164 next_buddy_marked = 1;
8168 * We can come here with TIF_NEED_RESCHED already set from new task
8171 * Note: this also catches the edge-case of curr being in a throttled
8172 * group (e.g. via set_curr_task), since update_curr() (in the
8173 * enqueue of curr) will have resulted in resched being set. This
8174 * prevents us from potentially nominating it as a false LAST_BUDDY
8177 if (test_tsk_need_resched(curr))
8180 /* Idle tasks are by definition preempted by non-idle tasks. */
8181 if (unlikely(curr->policy == SCHED_IDLE) &&
8182 likely(p->policy != SCHED_IDLE))
8186 * Batch and idle tasks do not preempt non-idle tasks (their preemption
8187 * is driven by the tick):
8189 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8192 find_matching_se(&se, &pse);
8193 update_curr(cfs_rq_of(se));
8195 if (wakeup_preempt_entity(se, pse) == 1) {
8197 * Bias pick_next to pick the sched entity that is
8198 * triggering this preemption.
8200 if (!next_buddy_marked)
8201 set_next_buddy(pse);
8210 * Only set the backward buddy when the current task is still
8211 * on the rq. This can happen when a wakeup gets interleaved
8212 * with schedule on the ->pre_schedule() or idle_balance()
8213 * point, either of which can drop the rq lock.
8215 * Also, during early boot the idle thread is in the fair class,
8216 * for obvious reasons it's a bad idea to schedule back to it.
8218 if (unlikely(!se->on_rq || curr == rq->idle))
8221 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
8225 static struct task_struct *
8226 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
8228 struct cfs_rq *cfs_rq = &rq->cfs;
8229 struct sched_entity *se;
8230 struct task_struct *p;
8234 #ifdef CONFIG_FAIR_GROUP_SCHED
8235 if (!cfs_rq->nr_running)
8238 if (prev->sched_class != &fair_sched_class)
8242 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8243 * likely that a next task is from the same cgroup as the current.
8245 * Therefore attempt to avoid putting and setting the entire cgroup
8246 * hierarchy, only change the part that actually changes.
8250 struct sched_entity *curr = cfs_rq->curr;
8253 * Since we got here without doing put_prev_entity() we also
8254 * have to consider cfs_rq->curr. If it is still a runnable
8255 * entity, update_curr() will update its vruntime, otherwise
8256 * forget we've ever seen it.
8260 update_curr(cfs_rq);
8265 * This call to check_cfs_rq_runtime() will do the
8266 * throttle and dequeue its entity in the parent(s).
8267 * Therefore the 'simple' nr_running test will indeed
8270 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8274 se = pick_next_entity(cfs_rq, curr);
8275 cfs_rq = group_cfs_rq(se);
8281 * Since we haven't yet done put_prev_entity and if the selected task
8282 * is a different task than we started out with, try and touch the
8283 * least amount of cfs_rqs.
8286 struct sched_entity *pse = &prev->se;
8288 while (!(cfs_rq = is_same_group(se, pse))) {
8289 int se_depth = se->depth;
8290 int pse_depth = pse->depth;
8292 if (se_depth <= pse_depth) {
8293 put_prev_entity(cfs_rq_of(pse), pse);
8294 pse = parent_entity(pse);
8296 if (se_depth >= pse_depth) {
8297 set_next_entity(cfs_rq_of(se), se);
8298 se = parent_entity(se);
8302 put_prev_entity(cfs_rq, pse);
8303 set_next_entity(cfs_rq, se);
8306 if (hrtick_enabled(rq))
8307 hrtick_start_fair(rq, p);
8309 rq->misfit_task = !task_fits_max(p, rq->cpu);
8316 if (!cfs_rq->nr_running)
8319 put_prev_task(rq, prev);
8322 se = pick_next_entity(cfs_rq, NULL);
8323 set_next_entity(cfs_rq, se);
8324 cfs_rq = group_cfs_rq(se);
8329 if (hrtick_enabled(rq))
8330 hrtick_start_fair(rq, p);
8332 rq->misfit_task = !task_fits_max(p, rq->cpu);
8337 rq->misfit_task = 0;
8339 * This is OK, because current is on_cpu, which avoids it being picked
8340 * for load-balance and preemption/IRQs are still disabled avoiding
8341 * further scheduler activity on it and we're being very careful to
8342 * re-start the picking loop.
8344 lockdep_unpin_lock(&rq->lock);
8345 new_tasks = idle_balance(rq);
8346 lockdep_pin_lock(&rq->lock);
8348 * Because idle_balance() releases (and re-acquires) rq->lock, it is
8349 * possible for any higher priority task to appear. In that case we
8350 * must re-start the pick_next_entity() loop.
8362 * Account for a descheduled task:
8364 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8366 struct sched_entity *se = &prev->se;
8367 struct cfs_rq *cfs_rq;
8369 for_each_sched_entity(se) {
8370 cfs_rq = cfs_rq_of(se);
8371 put_prev_entity(cfs_rq, se);
8376 * sched_yield() is very simple
8378 * The magic of dealing with the ->skip buddy is in pick_next_entity.
8380 static void yield_task_fair(struct rq *rq)
8382 struct task_struct *curr = rq->curr;
8383 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8384 struct sched_entity *se = &curr->se;
8387 * Are we the only task in the tree?
8389 if (unlikely(rq->nr_running == 1))
8392 clear_buddies(cfs_rq, se);
8394 if (curr->policy != SCHED_BATCH) {
8395 update_rq_clock(rq);
8397 * Update run-time statistics of the 'current'.
8399 update_curr(cfs_rq);
8401 * Tell update_rq_clock() that we've just updated,
8402 * so we don't do microscopic update in schedule()
8403 * and double the fastpath cost.
8405 rq_clock_skip_update(rq, true);
8411 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
8413 struct sched_entity *se = &p->se;
8415 /* throttled hierarchies are not runnable */
8416 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8419 /* Tell the scheduler that we'd really like pse to run next. */
8422 yield_task_fair(rq);
8428 /**************************************************
8429 * Fair scheduling class load-balancing methods.
8433 * The purpose of load-balancing is to achieve the same basic fairness the
8434 * per-cpu scheduler provides, namely provide a proportional amount of compute
8435 * time to each task. This is expressed in the following equation:
8437 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8439 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
8440 * W_i,0 is defined as:
8442 * W_i,0 = \Sum_j w_i,j (2)
8444 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
8445 * is derived from the nice value as per prio_to_weight[].
8447 * The weight average is an exponential decay average of the instantaneous
8450 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8452 * C_i is the compute capacity of cpu i, typically it is the
8453 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8454 * can also include other factors [XXX].
8456 * To achieve this balance we define a measure of imbalance which follows
8457 * directly from (1):
8459 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
8461 * We then move tasks around to minimize the imbalance. In the continuous
8462 * function space it is obvious this converges, in the discrete case we get
8463 * a few fun cases generally called infeasible weight scenarios.
8466 * - infeasible weights;
8467 * - local vs global optima in the discrete case. ]
8472 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8473 * for all i,j solution, we create a tree of cpus that follows the hardware
8474 * topology where each level pairs two lower groups (or better). This results
8475 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
8476 * tree to only the first of the previous level and we decrease the frequency
8477 * of load-balance at each level inv. proportional to the number of cpus in
8483 *   \Sum_{i=0}^{log_2 n} { 1/2^i * n/2^i * 2^i } = O(n)               (5)
8485 *                            |       |       `- size of each group
8486 *                            |       `- number of cpus doing load-balance
8488 *                            `- frequency of balancing at that level; the sum runs over all levels
8490 * Coupled with a limit on how many tasks we can migrate every balance pass,
8491 * this makes (5) the runtime complexity of the balancer.
8493 * An important property here is that each CPU is still (indirectly) connected
8494 * to every other cpu in at most O(log n) steps:
8496 * The adjacency matrix of the resulting graph is given by:
8499 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
8502 * And you'll find that:
8504 * A^(log_2 n)_i,j != 0 for all i,j (7)
8506 * Showing there's indeed a path between every cpu in at most O(log n) steps.
8507 * The task movement gives a factor of O(m), giving a convergence complexity
8510 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
8515 * In order to avoid CPUs going idle while there's still work to do, new idle
8516 * balancing is more aggressive and has the newly idle cpu iterate up the domain
8517 * tree itself instead of relying on other CPUs to bring it work.
8519 * This adds some complexity to both (5) and (8) but it reduces the total idle
8527 * Cgroups make a horror show out of (2), instead of a simple sum we get:
8530 *   W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k)                        (9)
8535 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
8537 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
8539 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
8542 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8543 * rewrite all of this once again.]
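/*
 * A quick numeric check of (5) above for n = 8 cpus: each level-i term is
 * (1/2^i) * (n/2^i) * 2^i = n/2^i, so the sum is 8 + 4 + 2 + 1 = 15 < 2n,
 * confirming the O(n) cost per full balance pass claimed above.
 */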
8546 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8548 enum fbq_type { regular, remote, all };
8557 #define LBF_ALL_PINNED 0x01
8558 #define LBF_NEED_BREAK 0x02
8559 #define LBF_DST_PINNED 0x04
8560 #define LBF_SOME_PINNED 0x08
8561 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
8562 #define LBF_IGNORE_BIG_TASKS 0x100
8563 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
8564 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
8567 struct sched_domain *sd;
8575 struct cpumask *dst_grpmask;
8577 enum cpu_idle_type idle;
8579 unsigned int src_grp_nr_running;
8580 /* The set of CPUs under consideration for load-balancing */
8581 struct cpumask *cpus;
8582 unsigned int busiest_grp_capacity;
8583 unsigned int busiest_nr_running;
8588 unsigned int loop_break;
8589 unsigned int loop_max;
8591 enum fbq_type fbq_type;
8592 enum group_type busiest_group_type;
8593 struct list_head tasks;
8594 enum sched_boost_policy boost_policy;
8598 * Is this task likely cache-hot:
8600 static int task_hot(struct task_struct *p, struct lb_env *env)
8604 lockdep_assert_held(&env->src_rq->lock);
8606 if (p->sched_class != &fair_sched_class)
8609 if (unlikely(p->policy == SCHED_IDLE))
8613 * Buddy candidates are cache hot:
8615 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8616 (&p->se == cfs_rq_of(&p->se)->next ||
8617 &p->se == cfs_rq_of(&p->se)->last))
8620 if (sysctl_sched_migration_cost == -1)
8622 if (sysctl_sched_migration_cost == 0)
8625 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
8627 return delta < (s64)sysctl_sched_migration_cost;
8630 #ifdef CONFIG_NUMA_BALANCING
8632 * Returns 1, if task migration degrades locality
8633 * Returns 0, if task migration improves locality i.e migration preferred.
8634 * Returns 0, if task migration improves locality, i.e. migration is preferred.
8636 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8638 struct numa_group *numa_group = rcu_dereference(p->numa_group);
8639 unsigned long src_faults, dst_faults;
8640 int src_nid, dst_nid;
8642 if (!static_branch_likely(&sched_numa_balancing))
8645 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8648 src_nid = cpu_to_node(env->src_cpu);
8649 dst_nid = cpu_to_node(env->dst_cpu);
8651 if (src_nid == dst_nid)
8654 /* Migrating away from the preferred node is always bad. */
8655 if (src_nid == p->numa_preferred_nid) {
8656 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8662 /* Encourage migration to the preferred node. */
8663 if (dst_nid == p->numa_preferred_nid)
8667 src_faults = group_faults(p, src_nid);
8668 dst_faults = group_faults(p, dst_nid);
8670 src_faults = task_faults(p, src_nid);
8671 dst_faults = task_faults(p, dst_nid);
8674 return dst_faults < src_faults;
8678 static inline int migrate_degrades_locality(struct task_struct *p,
8686 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8689 int can_migrate_task(struct task_struct *p, struct lb_env *env)
8692 int twf, group_cpus;
8694 lockdep_assert_held(&env->src_rq->lock);
8697 * We do not migrate tasks that are:
8698 * 1) throttled_lb_pair, or
8699 * 2) cannot be migrated to this CPU due to cpus_allowed, or
8700 * 3) running (obviously), or
8701 * 4) are cache-hot on their current CPU.
8703 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
8706 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
8709 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
8711 env->flags |= LBF_SOME_PINNED;
8714 * Remember if this task can be migrated to any other cpu in
8715 * our sched_group. We may want to revisit it if we couldn't
8716 * meet load balance goals by pulling other tasks on src_cpu.
8718 * Also avoid computing new_dst_cpu if we have already computed
8719 * one in current iteration.
8721 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
8724 /* Prevent re-selecting dst_cpu via env's cpus */
8725 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8726 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
8727 env->flags |= LBF_DST_PINNED;
8728 env->new_dst_cpu = cpu;
8736 /* Record that we found at least one task that could run on dst_cpu */
8737 env->flags &= ~LBF_ALL_PINNED;
8739 if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
8740 if (nr_big_tasks(env->src_rq) && !is_big_task(p))
8743 if (env->boost_policy == SCHED_BOOST_ON_BIG &&
8744 !task_sched_boost(p))
8748 twf = task_will_fit(p, env->dst_cpu);
8751 * Attempt to not pull tasks that don't fit. We may get lucky and find
8752 * one that actually fits.
8754 if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
8757 if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
8758 !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
8762 * Group imbalance can sometimes cause work to be pulled across groups
8763 * even though the group could have managed the imbalance on its own.
8764 * Prevent inter-cluster migrations for big tasks when the number of
8765 * tasks is lower than the capacity of the group.
8767 group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
8768 SCHED_CAPACITY_SCALE);
8769 if (!twf && env->busiest_nr_running <= group_cpus)
8772 if (task_running(env->src_rq, p)) {
8773 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
8778 * Aggressive migration if:
8779 * 1) IDLE or NEWLY_IDLE balance.
8780 * 2) destination numa is preferred
8781 * 3) task is cache cold, or
8782 * 4) too many balance attempts have failed.
8784 tsk_cache_hot = migrate_degrades_locality(p, env);
8785 if (tsk_cache_hot == -1)
8786 tsk_cache_hot = task_hot(p, env);
8788 if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
8789 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
8790 if (tsk_cache_hot == 1) {
8791 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
8792 schedstat_inc(p, se.statistics.nr_forced_migrations);
8797 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
8802 * detach_task() -- detach the task for the migration specified in env
8804 static void detach_task(struct task_struct *p, struct lb_env *env)
8806 lockdep_assert_held(&env->src_rq->lock);
8808 p->on_rq = TASK_ON_RQ_MIGRATING;
8809 deactivate_task(env->src_rq, p, 0);
8810 double_lock_balance(env->src_rq, env->dst_rq);
8811 set_task_cpu(p, env->dst_cpu);
8812 if (task_in_related_thread_group(p))
8813 env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
8814 double_unlock_balance(env->src_rq, env->dst_rq);
8818 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
8819 * part of active balancing operations within "domain".
8821 * Returns a task if successful and NULL otherwise.
8823 static struct task_struct *detach_one_task(struct lb_env *env)
8825 struct task_struct *p, *n;
8827 lockdep_assert_held(&env->src_rq->lock);
8829 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
8830 if (!can_migrate_task(p, env))
8833 detach_task(p, env);
8836 * Right now, this is only the second place where
8837 * lb_gained[env->idle] is updated (other is detach_tasks)
8838 * so we can safely collect stats here rather than
8839 * inside detach_tasks().
8841 schedstat_inc(env->sd, lb_gained[env->idle]);
8848 static const unsigned int sched_nr_migrate_break = 32;
8851 * detach_tasks() -- tries to detach up to imbalance weighted load from
8852 * busiest_rq, as part of a balancing operation within domain "sd".
8854 * Returns number of detached tasks if successful and 0 otherwise.
8856 static int detach_tasks(struct lb_env *env)
8858 struct list_head *tasks = &env->src_rq->cfs_tasks;
8859 struct task_struct *p;
8862 int orig_loop = env->loop;
8864 lockdep_assert_held(&env->src_rq->lock);
8866 if (env->imbalance <= 0)
8869 if (!same_cluster(env->dst_cpu, env->src_cpu))
8870 env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
8872 if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
8873 env->flags |= LBF_IGNORE_BIG_TASKS;
8876 while (!list_empty(tasks)) {
8878 * We don't want to steal all, otherwise we may be treated likewise,
8879 * which could at worst lead to a livelock crash.
8881 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
8884 p = list_first_entry(tasks, struct task_struct, se.group_node);
8887 /* We've more or less seen every task there is, call it quits */
8888 if (env->loop > env->loop_max)
8891 /* take a breather every nr_migrate tasks */
8892 if (env->loop > env->loop_break) {
8893 env->loop_break += sched_nr_migrate_break;
8894 env->flags |= LBF_NEED_BREAK;
8898 if (!can_migrate_task(p, env))
8901 load = task_h_load(p);
8903 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8906 if ((load / 2) > env->imbalance)
8909 detach_task(p, env);
8910 list_add(&p->se.group_node, &env->tasks);
8913 env->imbalance -= load;
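/*
 * Worked example with made-up numbers: with env->imbalance = 1000, a
 * task whose task_h_load() is 300 is detached and the remaining
 * imbalance drops to 700, whereas a task with load 2500 is skipped
 * above because 2500 / 2 exceeds the outstanding imbalance.
 */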
8915 #ifdef CONFIG_PREEMPT
8917 * NEWIDLE balancing is a source of latency, so preemptible
8918 * kernels will stop after the first task is detached to minimize
8919 * the critical section.
8921 if (env->idle == CPU_NEWLY_IDLE)
8926 * We only want to steal up to the prescribed amount of weighted load.
8929 if (env->imbalance <= 0)
8934 list_move_tail(&p->se.group_node, tasks);
8937 if (env->flags & (LBF_IGNORE_BIG_TASKS |
8938 LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
8939 tasks = &env->src_rq->cfs_tasks;
8940 env->flags &= ~(LBF_IGNORE_BIG_TASKS |
8941 LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
8942 env->loop = orig_loop;
8947 * Right now, this is one of only two places we collect this stat
8948 * so we can safely collect detach_one_task() stats here rather
8949 * than inside detach_one_task().
8951 schedstat_add(env->sd, lb_gained[env->idle], detached);
8957 * attach_task() -- attach the task detached by detach_task() to its new rq.
8959 static void attach_task(struct rq *rq, struct task_struct *p)
8961 lockdep_assert_held(&rq->lock);
8963 BUG_ON(task_rq(p) != rq);
8964 activate_task(rq, p, 0);
8965 p->on_rq = TASK_ON_RQ_QUEUED;
8966 check_preempt_curr(rq, p, 0);
8970 * attach_one_task() -- attaches the task returned from detach_one_task() to its new rq.
8973 static void attach_one_task(struct rq *rq, struct task_struct *p)
8975 raw_spin_lock(&rq->lock);
8977 raw_spin_unlock(&rq->lock);
8981 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their new rq.
8984 static void attach_tasks(struct lb_env *env)
8986 struct list_head *tasks = &env->tasks;
8987 struct task_struct *p;
8989 raw_spin_lock(&env->dst_rq->lock);
8991 while (!list_empty(tasks)) {
8992 p = list_first_entry(tasks, struct task_struct, se.group_node);
8993 list_del_init(&p->se.group_node);
8995 attach_task(env->dst_rq, p);
8998 raw_spin_unlock(&env->dst_rq->lock);
9001 #ifdef CONFIG_FAIR_GROUP_SCHED
9002 static void update_blocked_averages(int cpu)
9004 struct rq *rq = cpu_rq(cpu);
9005 struct cfs_rq *cfs_rq;
9006 unsigned long flags;
9008 raw_spin_lock_irqsave(&rq->lock, flags);
9009 update_rq_clock(rq);
9012 * Iterates the task_group tree in a bottom up fashion, see
9013 * list_add_leaf_cfs_rq() for details.
9015 for_each_leaf_cfs_rq(rq, cfs_rq) {
9016 /* throttled entities do not contribute to load */
9017 if (throttled_hierarchy(cfs_rq))
9020 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
9022 update_tg_load_avg(cfs_rq, 0);
9024 /* Propagate pending load changes to the parent */
9025 if (cfs_rq->tg->se[cpu])
9026 update_load_avg(cfs_rq->tg->se[cpu], 0);
9028 raw_spin_unlock_irqrestore(&rq->lock, flags);
9032 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9033 * This needs to be done in a top-down fashion because the load of a child
9034 * group is a fraction of its parent's load.
9036 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9038 struct rq *rq = rq_of(cfs_rq);
9039 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9040 unsigned long now = jiffies;
9043 if (cfs_rq->last_h_load_update == now)
9046 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9047 for_each_sched_entity(se) {
9048 cfs_rq = cfs_rq_of(se);
9049 WRITE_ONCE(cfs_rq->h_load_next, se);
9050 if (cfs_rq->last_h_load_update == now)
9055 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9056 cfs_rq->last_h_load_update = now;
9059 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9060 load = cfs_rq->h_load;
9061 load = div64_ul(load * se->avg.load_avg,
9062 cfs_rq_load_avg(cfs_rq) + 1);
9063 cfs_rq = group_cfs_rq(se);
9064 cfs_rq->h_load = load;
9065 cfs_rq->last_h_load_update = now;
9069 static unsigned long task_h_load(struct task_struct *p)
9071 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9073 update_cfs_rq_h_load(cfs_rq);
9074 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9075 cfs_rq_load_avg(cfs_rq) + 1);
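/*
 * Worked example (illustrative numbers only): a task with
 * se.avg.load_avg = 300 on a group cfs_rq whose h_load is 512 and
 * whose load_avg is 1024 yields task_h_load() = 300 * 512 / (1024 + 1)
 * ~= 149, i.e. the task's load is scaled by the share of rq load its
 * group hierarchy actually receives.
 */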
9078 static inline void update_blocked_averages(int cpu)
9080 struct rq *rq = cpu_rq(cpu);
9081 struct cfs_rq *cfs_rq = &rq->cfs;
9082 unsigned long flags;
9084 raw_spin_lock_irqsave(&rq->lock, flags);
9085 update_rq_clock(rq);
9086 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
9087 raw_spin_unlock_irqrestore(&rq->lock, flags);
9090 static unsigned long task_h_load(struct task_struct *p)
9092 return p->se.avg.load_avg;
9096 /********** Helpers for find_busiest_group ************************/
9099 * sg_lb_stats - stats of a sched_group required for load_balancing
9101 struct sg_lb_stats {
9102 unsigned long avg_load; /* Avg load across the CPUs of the group */
9103 unsigned long group_load; /* Total load over the CPUs of the group */
9104 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
9105 unsigned long load_per_task;
9106 unsigned long group_capacity;
9107 unsigned long group_util; /* Total utilization of the group */
9108 unsigned int sum_nr_running; /* Nr tasks running in the group */
9109 #ifdef CONFIG_SCHED_HMP
9110 unsigned long sum_nr_big_tasks;
9111 u64 group_cpu_load; /* Scaled load of all CPUs of the group */
9113 unsigned int idle_cpus;
9114 unsigned int group_weight;
9115 enum group_type group_type;
9116 int group_no_capacity;
9117 int group_misfit_task; /* A cpu has a task too big for its capacity */
9118 #ifdef CONFIG_NUMA_BALANCING
9119 unsigned int nr_numa_running;
9120 unsigned int nr_preferred_running;
9125 * sd_lb_stats - Structure to store the statistics of a sched_domain
9126 * during load balancing.
9128 struct sd_lb_stats {
9129 struct sched_group *busiest; /* Busiest group in this sd */
9130 struct sched_group *local; /* Local group in this sd */
9131 unsigned long total_load; /* Total load of all groups in sd */
9132 unsigned long total_capacity; /* Total capacity of all groups in sd */
9133 unsigned long avg_load; /* Average load across all groups in sd */
9135 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
9136 struct sg_lb_stats local_stat; /* Statistics of the local group */
9139 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9142 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9143 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9144 * We must however clear busiest_stat::avg_load because
9145 * update_sd_pick_busiest() reads this before assignment.
9147 *sds = (struct sd_lb_stats){
9151 .total_capacity = 0UL,
9154 .sum_nr_running = 0,
9155 .group_type = group_other,
9156 #ifdef CONFIG_SCHED_HMP
9157 .sum_nr_big_tasks = 0UL,
9158 .group_cpu_load = 0ULL,
9164 #ifdef CONFIG_SCHED_HMP
9167 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9169 int local_cpu, busiest_cpu;
9170 int local_capacity, busiest_capacity;
9171 int local_pwr_cost, busiest_pwr_cost;
9173 int boost = sched_boost();
9175 if (!sysctl_sched_restrict_cluster_spill ||
9176 boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
9179 local_cpu = group_first_cpu(sds->local);
9180 busiest_cpu = group_first_cpu(sds->busiest);
9182 local_capacity = cpu_max_possible_capacity(local_cpu);
9183 busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
9185 local_pwr_cost = cpu_max_power_cost(local_cpu);
9186 busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
9188 if (local_pwr_cost <= busiest_pwr_cost)
9191 if (local_capacity > busiest_capacity &&
9192 sds->busiest_stat.sum_nr_big_tasks)
9195 nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
9196 if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
9197 (sds->busiest_stat.sum_nr_running <
9198 nr_cpus * sysctl_sched_spill_nr_run))
9204 #else /* CONFIG_SCHED_HMP */
9207 bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
9212 #endif /* CONFIG_SCHED_HMP */
9215 * get_sd_load_idx - Obtain the load index for a given sched domain.
9216 * @sd: The sched_domain whose load_idx is to be obtained.
9217 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
9219 * Return: The load index.
9221 static inline int get_sd_load_idx(struct sched_domain *sd,
9222 enum cpu_idle_type idle)
9228 load_idx = sd->busy_idx;
9231 case CPU_NEWLY_IDLE:
9232 load_idx = sd->newidle_idx;
9235 load_idx = sd->idle_idx;
9242 static unsigned long scale_rt_capacity(int cpu)
9244 struct rq *rq = cpu_rq(cpu);
9245 u64 total, used, age_stamp, avg;
9249 * Since we're reading these variables without serialization make sure
9250 * we read them once before doing sanity checks on them.
9252 age_stamp = READ_ONCE(rq->age_stamp);
9253 avg = READ_ONCE(rq->rt_avg);
9254 delta = __rq_clock_broken(rq) - age_stamp;
9256 if (unlikely(delta < 0))
9259 total = sched_avg_period() + delta;
9261 used = div_u64(avg, total);
9264 * deadline bandwidth is defined at system level so we must
9265 * weight this bandwidth with the max capacity of the system.
9266 * As a reminder, avg_bw is 20 bits wide and
9267 * scale_cpu_capacity is 10 bits wide.
9269 used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
9271 if (likely(used < SCHED_CAPACITY_SCALE))
9272 return SCHED_CAPACITY_SCALE - used;
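/*
 * Rough worked example (illustrative only): if RT (and, depending on
 * config, IRQ) activity consumed about a quarter of the averaging
 * window at full frequency, used comes to roughly 256 out of
 * SCHED_CAPACITY_SCALE (1024), leaving about 768 capacity units for
 * CFS tasks.
 */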
9277 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
9279 raw_spin_lock_init(&mcc->lock);
9284 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9286 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
9287 struct sched_group *sdg = sd->groups;
9288 struct max_cpu_capacity *mcc;
9289 unsigned long max_capacity;
9291 unsigned long flags;
9293 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9295 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
9297 raw_spin_lock_irqsave(&mcc->lock, flags);
9298 max_capacity = mcc->val;
9299 max_cap_cpu = mcc->cpu;
9301 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
9302 (max_capacity < capacity)) {
9303 mcc->val = capacity;
9305 #ifdef CONFIG_SCHED_DEBUG
9306 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9307 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
9312 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9314 skip_unlock: __attribute__ ((unused));
9315 capacity *= scale_rt_capacity(cpu);
9316 capacity >>= SCHED_CAPACITY_SHIFT;
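/*
 * Continuing the scale_rt_capacity() example above with assumed
 * numbers: an original capacity of 1024 scaled by a result of 768
 * gives (1024 * 768) >> SCHED_CAPACITY_SHIFT = 768, so the capacity
 * published for CFS reflects time lost to other classes.
 */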
9321 cpu_rq(cpu)->cpu_capacity = capacity;
9322 sdg->sgc->capacity = capacity;
9323 sdg->sgc->max_capacity = capacity;
9324 sdg->sgc->min_capacity = capacity;
9327 void update_group_capacity(struct sched_domain *sd, int cpu)
9329 struct sched_domain *child = sd->child;
9330 struct sched_group *group, *sdg = sd->groups;
9331 unsigned long capacity, max_capacity, min_capacity;
9332 unsigned long interval;
9334 interval = msecs_to_jiffies(sd->balance_interval);
9335 interval = clamp(interval, 1UL, max_load_balance_interval);
9336 sdg->sgc->next_update = jiffies + interval;
9339 update_cpu_capacity(sd, cpu);
9345 min_capacity = ULONG_MAX;
9347 if (child->flags & SD_OVERLAP) {
9349 * SD_OVERLAP domains cannot assume that child groups
9350 * span the current group.
9353 for_each_cpu(cpu, sched_group_cpus(sdg)) {
9354 struct sched_group_capacity *sgc;
9355 struct rq *rq = cpu_rq(cpu);
9357 if (cpumask_test_cpu(cpu, cpu_isolated_mask))
9360 * build_sched_domains() -> init_sched_groups_capacity()
9361 * gets here before we've attached the domains to the runqueues.
9364 * Use capacity_of(), which is set irrespective of domains
9365 * in update_cpu_capacity().
9367 * This avoids capacity from being 0 and
9368 * causing divide-by-zero issues on boot.
9370 if (unlikely(!rq->sd)) {
9371 capacity += capacity_of(cpu);
9373 sgc = rq->sd->groups->sgc;
9374 capacity += sgc->capacity;
9377 max_capacity = max(capacity, max_capacity);
9378 min_capacity = min(capacity, min_capacity);
9382 * !SD_OVERLAP domains can assume that child groups
9383 * span the current group.
9386 group = child->groups;
9388 struct sched_group_capacity *sgc = group->sgc;
9390 cpumask_t *cpus = sched_group_cpus(group);
9392 /* Revisit this later. This won't work for MT domain */
9393 if (!cpu_isolated(cpumask_first(cpus))) {
9394 capacity += sgc->capacity;
9395 max_capacity = max(sgc->max_capacity, max_capacity);
9396 min_capacity = min(sgc->min_capacity, min_capacity);
9398 group = group->next;
9399 } while (group != child->groups);
9402 sdg->sgc->capacity = capacity;
9403 sdg->sgc->max_capacity = max_capacity;
9404 sdg->sgc->min_capacity = min_capacity;
9408 * Check whether the capacity of the rq has been noticeably reduced by side
9409 * activity. The imbalance_pct is used for the threshold.
9410 * Return true if the capacity is reduced.
9413 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9415 return ((rq->cpu_capacity * sd->imbalance_pct) <
9416 (rq->cpu_capacity_orig * 100));
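/*
 * For instance, with imbalance_pct = 117 and cpu_capacity_orig = 1024
 * this reports reduced capacity once cpu_capacity falls below roughly
 * 1024 * 100 / 117 ~= 875, i.e. when more than about 15% of the cpu
 * is consumed by RT/IRQ pressure.
 */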
9420 * Group imbalance indicates (and tries to solve) the problem where balancing
9421 * groups is inadequate due to tsk_cpus_allowed() constraints.
9423 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
9424 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
9427 * { 0 1 2 3 } { 4 5 6 7 }
9430 * If we were to balance group-wise we'd place two tasks in the first group and
9431 * two tasks in the second group. Clearly this is undesired as it will overload
9432 * cpu 3 and leave one of the cpus in the second group unused.
9434 * The current solution to this issue is detecting the skew in the first group
9435 * by noticing the lower domain failed to reach balance and had difficulty
9436 * moving tasks due to affinity constraints.
9438 * When this is so detected; this group becomes a candidate for busiest; see
9439 * update_sd_pick_busiest(). And calculate_imbalance() and
9440 * find_busiest_group() avoid some of the usual balance conditions to allow it
9441 * to create an effective group imbalance.
9443 * This is a somewhat tricky proposition since the next run might not find the
9444 * group imbalance and decide the groups need to be balanced again. A most
9445 * subtle and fragile situation.
9448 static inline int sg_imbalanced(struct sched_group *group)
9450 return group->sgc->imbalance;
9454 * group_has_capacity returns true if the group has spare capacity that could
9455 * be used by some tasks.
9456 * We consider that a group has spare capacity if the number of tasks is
9457 * smaller than the number of CPUs or if the utilization is lower than the
9458 * available capacity for CFS tasks.
9459 * For the latter, we use a threshold to stabilize the state, to take into
9460 * account the variance of the tasks' load and to return true if the available
9461 * capacity is meaningful for the load balancer.
9462 * As an example, an available capacity of 1% can appear but it doesn't bring
9463 * any benefit to the load balancer.
9466 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
9468 if (sgs->sum_nr_running < sgs->group_weight)
9471 if ((sgs->group_capacity * 100) >
9472 (sgs->group_util * env->sd->imbalance_pct))
9479 * group_is_overloaded returns true if the group has more tasks than it can handle.
9481 * group_is_overloaded is not equal to !group_has_capacity because a group
9482 * with the exact right number of tasks has no more spare capacity but is not
9483 * overloaded, so both group_has_capacity and group_is_overloaded return false.
9487 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
9489 if (sgs->sum_nr_running <= sgs->group_weight)
9492 if ((sgs->group_capacity * 100) <
9493 (sgs->group_util * env->sd->imbalance_pct))
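/*
 * Illustrative check (assumed numbers): with imbalance_pct = 125,
 * group_capacity = 1024 and group_util = 900, 1024 * 100 < 900 * 125
 * holds, so the group is treated as overloaded even though raw
 * utilization is still below raw capacity; the margin filters noise.
 */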
9501 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
9502 * per-cpu capacity than sched_group ref.
9505 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
9507 return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
9508 ref->sgc->max_capacity;
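/*
 * Assuming the common EAS defaults of capacity_margin = 1280 and
 * SCHED_LOAD_SCALE = 1024 (neither value is shown in this excerpt),
 * this reduces to sg->max_capacity + 256 < ref->max_capacity: a 512
 * capacity little cluster is "smaller" than a 1024 capacity big
 * cluster, while two near-equal clusters are not.
 */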
9512 group_type group_classify(struct sched_group *group,
9513 struct sg_lb_stats *sgs, struct lb_env *env)
9515 if (sgs->group_no_capacity)
9516 return group_overloaded;
9518 if (sg_imbalanced(group))
9519 return group_imbalanced;
9521 if (sgs->group_misfit_task)
9522 return group_misfit_task;
9527 #ifdef CONFIG_NO_HZ_COMMON
9529 * idle load balancing data
9530 * - used by the nohz balance, but we want it available here
9531 * so that we can see which CPUs have no tick.
9534 cpumask_var_t idle_cpus_mask;
9536 unsigned long next_balance; /* in jiffy units */
9537 } nohz ____cacheline_aligned;
9539 static inline void update_cpu_stats_if_tickless(struct rq *rq)
9541 /* only called from update_sg_lb_stats when irqs are disabled */
9542 if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
9543 /* rate limit updates to once-per-jiffie at most */
9544 if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
9547 raw_spin_lock(&rq->lock);
9548 update_rq_clock(rq);
9549 update_idle_cpu_load(rq);
9550 update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
9551 raw_spin_unlock(&rq->lock);
9556 static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
9560 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9561 * @env: The load balancing environment.
9562 * @group: sched_group whose statistics are to be updated.
9563 * @load_idx: Load index of sched_domain of this_cpu for load calc.
9564 * @local_group: Does group contain this_cpu.
9565 * @sgs: variable to hold the statistics for this group.
9566 * @overload: Indicate more than one runnable task for any CPU.
9567 * @overutilized: Indicate overutilization for any CPU.
9569 static inline void update_sg_lb_stats(struct lb_env *env,
9570 struct sched_group *group, int load_idx,
9571 int local_group, struct sg_lb_stats *sgs,
9572 bool *overload, bool *overutilized)
9577 memset(sgs, 0, sizeof(*sgs));
9579 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
9580 struct rq *rq = cpu_rq(i);
9582 trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
9587 if (cpu_isolated(i))
9590 /* if we are entering idle and there are CPUs with
9591 * their tick stopped, do an update for them
9593 if (env->idle == CPU_NEWLY_IDLE)
9594 update_cpu_stats_if_tickless(rq);
9596 /* Bias balancing toward cpus of our domain */
9598 load = target_load(i, load_idx);
9600 load = source_load(i, load_idx);
9602 sgs->group_load += load;
9603 sgs->group_util += cpu_util(i);
9604 sgs->sum_nr_running += rq->cfs.h_nr_running;
9606 nr_running = rq->nr_running;
9610 #ifdef CONFIG_SCHED_HMP
9611 sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
9612 sgs->group_cpu_load += cpu_load(i);
9615 #ifdef CONFIG_NUMA_BALANCING
9616 sgs->nr_numa_running += rq->nr_numa_running;
9617 sgs->nr_preferred_running += rq->nr_preferred_running;
9619 sgs->sum_weighted_load += weighted_cpuload(i);
9621 * No need to call idle_cpu() if nr_running is not 0
9623 if (!nr_running && idle_cpu(i))
9626 if (energy_aware() && cpu_overutilized(i)) {
9627 *overutilized = true;
9628 if (!sgs->group_misfit_task && rq->misfit_task)
9629 sgs->group_misfit_task = capacity_of(i);
9633 /* Isolated CPU has no weight */
9634 if (!group->group_weight) {
9635 sgs->group_capacity = 0;
9637 sgs->group_no_capacity = 1;
9638 sgs->group_type = group_other;
9639 sgs->group_weight = group->group_weight;
9641 /* Adjust by relative CPU capacity of the group */
9642 sgs->group_capacity = group->sgc->capacity;
9643 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
9644 sgs->group_capacity;
9646 sgs->group_weight = group->group_weight;
9648 sgs->group_no_capacity = group_is_overloaded(env, sgs);
9649 sgs->group_type = group_classify(group, sgs, env);
9652 if (sgs->sum_nr_running)
9653 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
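/*
 * Worked example with invented numbers: group_load = 2000 on
 * group_capacity = 1536 gives avg_load = 2000 * 1024 / 1536 ~= 1333,
 * and four runnable tasks with sum_weighted_load = 2000 give
 * load_per_task = 500.
 */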
9656 #ifdef CONFIG_SCHED_HMP
9657 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9658 struct sd_lb_stats *sds,
9659 struct sched_group *sg,
9660 struct sg_lb_stats *sgs)
9662 if (env->idle != CPU_NOT_IDLE &&
9663 cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
9664 if (sgs->sum_nr_big_tasks >
9665 sds->busiest_stat.sum_nr_big_tasks) {
9666 env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
9674 static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
9675 struct sd_lb_stats *sds,
9676 struct sched_group *sg,
9677 struct sg_lb_stats *sgs)
9684 * update_sd_pick_busiest - return 1 on busiest group
9685 * @env: The load balancing environment.
9686 * @sds: sched_domain statistics
9687 * @sg: sched_group candidate to be checked for being the busiest
9688 * @sgs: sched_group statistics
9690 * Determine if @sg is a busier group than the previously selected busiest group.
9693 * Return: %true if @sg is a busier group than the previously selected
9694 * busiest group. %false otherwise.
9696 static bool update_sd_pick_busiest(struct lb_env *env,
9697 struct sd_lb_stats *sds,
9698 struct sched_group *sg,
9699 struct sg_lb_stats *sgs)
9701 struct sg_lb_stats *busiest = &sds->busiest_stat;
9703 if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
9706 if (sgs->group_type > busiest->group_type)
9709 if (sgs->group_type < busiest->group_type)
9712 if (energy_aware()) {
9714 * Candidate sg doesn't face any serious load-balance problems
9715 * so don't pick it if the local sg is already filled up.
9717 if (sgs->group_type == group_other &&
9718 !group_has_capacity(env, &sds->local_stat))
9721 if (sgs->avg_load <= busiest->avg_load)
9724 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9728 * Candidate sg has no more than one task per CPU and
9729 * has higher per-CPU capacity. Migrating tasks to less
9730 * capable CPUs may harm throughput. Maximize throughput,
9731 * power/energy consequences are not considered.
9733 if (sgs->sum_nr_running <= sgs->group_weight &&
9734 group_smaller_cpu_capacity(sds->local, sg))
9739 /* This is the busiest node in its class. */
9740 if (!(env->sd->flags & SD_ASYM_PACKING))
9744 * ASYM_PACKING needs to move all the work to the lowest
9745 * numbered CPUs in the group, therefore mark all groups
9746 * higher than ourself as busy.
9748 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
9752 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
9759 #ifdef CONFIG_NUMA_BALANCING
9760 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9762 if (sgs->sum_nr_running > sgs->nr_numa_running)
9764 if (sgs->sum_nr_running > sgs->nr_preferred_running)
9769 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9771 if (rq->nr_running > rq->nr_numa_running)
9773 if (rq->nr_running > rq->nr_preferred_running)
9778 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9783 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9787 #endif /* CONFIG_NUMA_BALANCING */
9789 #define lb_sd_parent(sd) \
9790 (sd->parent && sd->parent->groups != sd->parent->groups->next)
9793 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
9794 * @env: The load balancing environment.
9795 * @sds: variable to hold the statistics for this sched_domain.
9797 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9799 struct sched_domain *child = env->sd->child;
9800 struct sched_group *sg = env->sd->groups;
9801 struct sg_lb_stats tmp_sgs;
9802 int load_idx, prefer_sibling = 0;
9803 bool overload = false, overutilized = false;
9805 if (child && child->flags & SD_PREFER_SIBLING)
9808 load_idx = get_sd_load_idx(env->sd, env->idle);
9811 struct sg_lb_stats *sgs = &tmp_sgs;
9814 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
9817 sgs = &sds->local_stat;
9819 if (env->idle != CPU_NEWLY_IDLE ||
9820 time_after_eq(jiffies, sg->sgc->next_update))
9821 update_group_capacity(env->sd, env->dst_cpu);
9824 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
9825 &overload, &overutilized);
9831 * In case the child domain prefers tasks go to siblings
9832 * first, lower the sg capacity so that we'll try
9833 * and move all the excess tasks away. We lower the capacity
9834 * of a group only if the local group has the capacity to fit
9835 * these excess tasks. The extra check prevents the case where
9836 * you always pull from the heaviest group when it is already
9837 * under-utilized (possible with a large weight task outweighing
9838 * the tasks on the system).
9840 if (prefer_sibling && sds->local &&
9841 group_has_capacity(env, &sds->local_stat) &&
9842 (sgs->sum_nr_running > 1)) {
9843 sgs->group_no_capacity = 1;
9844 sgs->group_type = group_classify(sg, sgs, env);
9848 * Ignore task groups with misfit tasks if local group has no
9849 * capacity or if per-cpu capacity isn't higher.
9851 if (energy_aware() &&
9852 sgs->group_type == group_misfit_task &&
9853 (!group_has_capacity(env, &sds->local_stat) ||
9854 !group_smaller_cpu_capacity(sg, sds->local)))
9855 sgs->group_type = group_other;
9857 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9859 sds->busiest_stat = *sgs;
9860 env->busiest_nr_running = sgs->sum_nr_running;
9861 env->busiest_grp_capacity = sgs->group_capacity;
9865 /* Now, start updating sd_lb_stats */
9866 sds->total_load += sgs->group_load;
9867 sds->total_capacity += sgs->group_capacity;
9870 } while (sg != env->sd->groups);
9872 if (env->sd->flags & SD_NUMA)
9873 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9875 env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9877 if (!lb_sd_parent(env->sd)) {
9878 /* update overload indicator if we are at root domain */
9879 if (env->dst_rq->rd->overload != overload)
9880 env->dst_rq->rd->overload = overload;
9882 /* Update over-utilization (tipping point, U >= 0) indicator */
9883 if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
9884 env->dst_rq->rd->overutilized = overutilized;
9885 trace_sched_overutilized(overutilized);
9888 if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
9889 env->dst_rq->rd->overutilized = true;
9890 trace_sched_overutilized(true);
9897 * check_asym_packing - Check to see if the group is packed into the
9900 * This is primarily intended to be used at the sibling level. Some
9901 * cores like POWER7 prefer to use lower numbered SMT threads. In the
9902 * case of POWER7, it can move to lower SMT modes only when higher
9903 * threads are idle. When in lower SMT modes, the threads will
9904 * perform better since they share fewer core resources. Hence when we
9905 * have idle threads, we want them to be the higher ones.
9907 * This packing function is run on idle threads. It checks to see if
9908 * the busiest CPU in this domain (core in the P7 case) has a higher
9909 * CPU number than the packing function is being run on. Here we are
9910 * assuming a lower CPU number will be equivalent to a lower SMT thread number.
9913 * Return: 1 when packing is required and a task should be moved to
9914 * this CPU. The amount of the imbalance is returned in *imbalance.
9916 * @env: The load balancing environment.
9917 * @sds: Statistics of the sched_domain which is to be packed
9919 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9923 if (!(env->sd->flags & SD_ASYM_PACKING))
9929 busiest_cpu = group_first_cpu(sds->busiest);
9930 if (env->dst_cpu > busiest_cpu)
9933 env->imbalance = DIV_ROUND_CLOSEST(
9934 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9935 SCHED_CAPACITY_SCALE);
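/*
 * Illustrative numbers: a busiest group with avg_load = 800 and
 * group_capacity = 2048 yields an asym-packing imbalance of
 * DIV_ROUND_CLOSEST(800 * 2048, 1024) = 1600, i.e. its entire
 * capacity-weighted load is asked to move toward the lower-numbered
 * dst_cpu.
 */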
9941 * fix_small_imbalance - Calculate the minor imbalance that exists
9942 * amongst the groups of a sched_domain, during load balancing.
9944 * @env: The load balancing environment.
9945 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9948 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9950 unsigned long tmp, capa_now = 0, capa_move = 0;
9951 unsigned int imbn = 2;
9952 unsigned long scaled_busy_load_per_task;
9953 struct sg_lb_stats *local, *busiest;
9955 local = &sds->local_stat;
9956 busiest = &sds->busiest_stat;
9958 if (!local->sum_nr_running)
9959 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9960 else if (busiest->load_per_task > local->load_per_task)
9963 scaled_busy_load_per_task =
9964 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9965 busiest->group_capacity;
9967 if (busiest->avg_load + scaled_busy_load_per_task >=
9968 local->avg_load + (scaled_busy_load_per_task * imbn)) {
9969 env->imbalance = busiest->load_per_task;
9974 * OK, we don't have enough imbalance to justify moving tasks,
9975 * however we may be able to increase total CPU capacity used by moving them.
9979 capa_now += busiest->group_capacity *
9980 min(busiest->load_per_task, busiest->avg_load);
9981 capa_now += local->group_capacity *
9982 min(local->load_per_task, local->avg_load);
9983 capa_now /= SCHED_CAPACITY_SCALE;
9985 /* Amount of load we'd subtract */
9986 if (busiest->avg_load > scaled_busy_load_per_task) {
9987 capa_move += busiest->group_capacity *
9988 min(busiest->load_per_task,
9989 busiest->avg_load - scaled_busy_load_per_task);
9992 /* Amount of load we'd add */
9993 if (busiest->avg_load * busiest->group_capacity <
9994 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9995 tmp = (busiest->avg_load * busiest->group_capacity) /
9996 local->group_capacity;
9998 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9999 local->group_capacity;
10001 capa_move += local->group_capacity *
10002 min(local->load_per_task, local->avg_load + tmp);
10003 capa_move /= SCHED_CAPACITY_SCALE;
10005 /* Move if we gain throughput */
10006 if (capa_move > capa_now)
10007 env->imbalance = busiest->load_per_task;
10011 * calculate_imbalance - Calculate the amount of imbalance present within the
10012 * groups of a given sched_domain during load balance.
10013 * @env: load balance environment
10014 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
10016 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
10018 unsigned long max_pull, load_above_capacity = ~0UL;
10019 struct sg_lb_stats *local, *busiest;
10021 local = &sds->local_stat;
10022 busiest = &sds->busiest_stat;
10024 if (busiest->group_type == group_imbalanced) {
10026 * In the group_imb case we cannot rely on group-wide averages
10027 * to ensure cpu-load equilibrium, look at wider averages. XXX
10029 busiest->load_per_task =
10030 min(busiest->load_per_task, sds->avg_load);
10034 * In the presence of smp nice balancing, certain scenarios can have
10035 * max load less than avg load (as we skip the groups at or below
10036 * its cpu_capacity while calculating max_load).
10038 if (busiest->avg_load <= sds->avg_load ||
10039 local->avg_load >= sds->avg_load) {
10040 if (energy_aware()) {
10041 /* Misfitting tasks should be migrated in any case */
10042 if (busiest->group_type == group_misfit_task) {
10043 env->imbalance = busiest->group_misfit_task;
10048 * Busiest group is overloaded, local is not, use the spare
10049 * cycles to maximize throughput
10051 if (busiest->group_type == group_overloaded &&
10052 local->group_type <= group_misfit_task) {
10053 env->imbalance = busiest->load_per_task;
10058 env->imbalance = 0;
10059 return fix_small_imbalance(env, sds);
10063 * If there aren't any idle cpus, avoid creating some.
10065 if (busiest->group_type == group_overloaded &&
10066 local->group_type == group_overloaded) {
10067 load_above_capacity = busiest->sum_nr_running *
10069 if (load_above_capacity > busiest->group_capacity)
10070 load_above_capacity -= busiest->group_capacity;
10072 load_above_capacity = ~0UL;
10076 * We're trying to get all the cpus to the average_load, so we don't
10077 * want to push ourselves above the average load, nor do we wish to
10078 * reduce the max loaded cpu below the average load. At the same time,
10079 * we also don't want to reduce the group load below the group capacity
10080 * (so that we can implement power-savings policies etc). Thus we look
10081 * for the minimum possible imbalance.
10083 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
10085 /* How much load to actually move to equalise the imbalance */
10086 env->imbalance = min(
10087 max_pull * busiest->group_capacity,
10088 (sds->avg_load - local->avg_load) * local->group_capacity
10089 ) / SCHED_CAPACITY_SCALE;
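/*
 * Worked example (made-up figures): busiest avg_load = 1200, local
 * avg_load = 800, domain avg_load = 1000 and both group capacities
 * 1024 with no idle-cpu constraint give max_pull = 1200 - 1000 = 200
 * and env->imbalance = min(200 * 1024, 200 * 1024) / 1024 = 200.
 */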
10091 /* Boost imbalance to allow misfit task to be balanced. */
10092 if (energy_aware() && busiest->group_type == group_misfit_task)
10093 env->imbalance = max_t(long, env->imbalance,
10094 busiest->group_misfit_task);
10097 * if *imbalance is less than the average load per runnable task
10098 * there is no guarantee that any tasks will be moved so we'll have
10099 * a think about bumping its value to force at least one task to be moved.
10102 if (env->imbalance < busiest->load_per_task)
10103 return fix_small_imbalance(env, sds);
10106 /******* find_busiest_group() helpers end here *********************/
10109 * find_busiest_group - Returns the busiest group within the sched_domain
10110 * if there is an imbalance. If there isn't an imbalance, and
10111 * the user has opted for power-savings, it returns a group whose
10112 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
10113 * such a group exists.
10115 * Also calculates the amount of weighted load which should be moved
10116 * to restore balance.
10118 * @env: The load balancing environment.
10120 * Return: - The busiest group if imbalance exists.
10121 * - If no imbalance and user has opted for power-savings balance,
10122 * return the least loaded group whose CPUs can be
10123 * put to idle by rebalancing its tasks onto our group.
10125 static struct sched_group *find_busiest_group(struct lb_env *env)
10127 struct sg_lb_stats *local, *busiest;
10128 struct sd_lb_stats sds;
10130 init_sd_lb_stats(&sds);
10133 * Compute the various statistics relevant for load balancing at
10136 update_sd_lb_stats(env, &sds);
10138 if (energy_aware() && !env->dst_rq->rd->overutilized)
10141 local = &sds.local_stat;
10142 busiest = &sds.busiest_stat;
10144 /* ASYM feature bypasses nice load balance check */
10145 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
10146 check_asym_packing(env, &sds))
10147 return sds.busiest;
10149 /* There is no busy sibling group to pull tasks from */
10150 if (!sds.busiest || busiest->sum_nr_running == 0)
10153 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10154 goto force_balance;
10156 if (bail_inter_cluster_balance(env, &sds))
10159 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
10160 / sds.total_capacity;
10163 * If the busiest group is imbalanced the below checks don't
10164 * work because they assume all things are equal, which typically
10165 * isn't true due to cpus_allowed constraints and the like.
10167 if (busiest->group_type == group_imbalanced)
10168 goto force_balance;
10171 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
10172 * capacities from resulting in underutilization due to avg_load.
10174 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
10175 busiest->group_no_capacity)
10176 goto force_balance;
10178 /* Misfitting tasks should be dealt with regardless of the avg load */
10179 if (energy_aware() && busiest->group_type == group_misfit_task) {
10180 goto force_balance;
10184 * If the local group is busier than the selected busiest group
10185 * don't try and pull any tasks.
10187 if (local->avg_load >= busiest->avg_load)
10191 * Don't pull any tasks if this group is already above the domain average load.
10194 if (local->avg_load >= sds.avg_load)
10197 if (env->idle == CPU_IDLE) {
10199 * This cpu is idle. If the busiest group is not overloaded
10200 * and there is no imbalance between this and busiest group
10201 * wrt idle cpus, it is balanced. The imbalance becomes
10202 * significant if the diff is greater than 1; otherwise we
10203 * might end up just moving the imbalance to another group.
10205 if ((busiest->group_type != group_overloaded) &&
10206 (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
10207 !group_smaller_cpu_capacity(sds.busiest, sds.local))
10211 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
10212 * imbalance_pct to be conservative.
10214 if (100 * busiest->avg_load <=
10215 env->sd->imbalance_pct * local->avg_load)
10220 env->busiest_group_type = busiest->group_type;
10221 /* Looks like there is an imbalance. Compute it */
10222 calculate_imbalance(env, &sds);
10223 return sds.busiest;
10226 env->imbalance = 0;
10230 #ifdef CONFIG_SCHED_HMP
10231 static struct rq *find_busiest_queue_hmp(struct lb_env *env,
10232 struct sched_group *group)
10234 struct rq *busiest = NULL, *busiest_big = NULL;
10235 u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
10236 int max_nr_big = 0, nr_big;
10237 bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
10241 cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
10243 for_each_cpu(i, &cpus) {
10244 struct rq *rq = cpu_rq(i);
10245 u64 cumulative_runnable_avg =
10246 rq->hmp_stats.cumulative_runnable_avg;
10248 if (!cpumask_test_cpu(i, env->cpus))
10253 nr_big = nr_big_tasks(rq);
10254 if (nr_big > max_nr_big ||
10255 (nr_big > 0 && nr_big == max_nr_big &&
10256 cumulative_runnable_avg > max_runnable_avg_big)) {
10257 max_runnable_avg_big = cumulative_runnable_avg;
10259 max_nr_big = nr_big;
10264 if (cumulative_runnable_avg > max_runnable_avg) {
10265 max_runnable_avg = cumulative_runnable_avg;
10271 return busiest_big;
10273 env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
10277 static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
10278 struct sched_group *group)
10285 * find_busiest_queue - find the busiest runqueue among the cpus in group.
10287 static struct rq *find_busiest_queue(struct lb_env *env,
10288 struct sched_group *group)
10290 struct rq *busiest = NULL, *rq;
10291 unsigned long busiest_load = 0, busiest_capacity = 1;
10294 #ifdef CONFIG_SCHED_HMP
10295 return find_busiest_queue_hmp(env, group);
10298 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
10299 unsigned long capacity, wl;
10303 rt = fbq_classify_rq(rq);
10306 * We classify groups/runqueues into three groups:
10307 * - regular: there are !numa tasks
10308 * - remote: there are numa tasks that run on the 'wrong' node
10309 * - all: there is no distinction
10311 * In order to avoid migrating ideally placed numa tasks,
10312 * ignore those when there are better options.
10314 * If we ignore the actual busiest queue to migrate another
10315 * task, the next balance pass can still reduce the busiest
10316 * queue by moving tasks around inside the node.
10318 * If we cannot move enough load due to this classification
10319 * the next pass will adjust the group classification and
10320 * allow migration of more tasks.
10322 * Both cases only affect the total convergence complexity.
10324 if (rt > env->fbq_type)
10327 capacity = capacity_of(i);
10329 wl = weighted_cpuload(i);
10332 * When comparing with imbalance, use weighted_cpuload()
10333 * which is not scaled with the cpu capacity.
10336 if (rq->nr_running == 1 && wl > env->imbalance &&
10337 !check_cpu_capacity(rq, env->sd) &&
10338 env->busiest_group_type != group_misfit_task)
10342 * For the load comparisons with the other cpu's, consider
10343 * the weighted_cpuload() scaled with the cpu capacity, so
10344 * that the load can be moved away from the cpu that is
10345 * potentially running at a lower capacity.
10347 * Thus we're looking for max(wl_i / capacity_i), crosswise
10348 * multiplication to rid ourselves of the division works out
10349 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
10350 * our previous maximum.
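 * Illustrative numbers: cpu i with wl = 600 on capacity 512 is picked
 * over cpu j with wl = 900 on capacity 1024, since 600 * 1024 >
 * 900 * 512; the smaller cpu is relatively busier despite its lower
 * absolute load.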
10352 if (wl * busiest_capacity > busiest_load * capacity) {
10354 busiest_capacity = capacity;
10363 * Max backoff if we encounter pinned tasks. The exact value is fairly
10364 * arbitrary, so long as it is large enough.
10366 #define MAX_PINNED_INTERVAL 16
10368 /* Working cpumask for load_balance and load_balance_newidle. */
10369 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
10371 #define NEED_ACTIVE_BALANCE_THRESHOLD 10
10373 static int need_active_balance(struct lb_env *env)
10375 struct sched_domain *sd = env->sd;
10377 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
10380 if (env->idle == CPU_NEWLY_IDLE) {
10383 * ASYM_PACKING needs to force migrate tasks from busy but
10384 * higher numbered CPUs in order to pack all tasks in the
10385 * lowest numbered CPUs.
10387 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
10392 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
10393 * It's worth migrating the task if the src_cpu's capacity is reduced
10394 * because of other sched_class or IRQs if more capacity stays
10395 * available on dst_cpu.
10397 if ((env->idle != CPU_NOT_IDLE) &&
10398 (env->src_rq->cfs.h_nr_running == 1)) {
10399 if ((check_cpu_capacity(env->src_rq, sd)) &&
10400 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
10404 if (energy_aware() &&
10405 (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
10406 ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
10407 env->src_rq->cfs.h_nr_running == 1 &&
10408 cpu_overutilized(env->src_cpu) &&
10409 !cpu_overutilized(env->dst_cpu)) {
10413 return unlikely(sd->nr_balance_failed >
10414 sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
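/*
 * For example, with cache_nice_tries = 1 the fallback above resorts to
 * active balancing only after more than 1 + NEED_ACTIVE_BALANCE_THRESHOLD
 * = 11 consecutive failed balance attempts on this domain.
 */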
10417 static int group_balance_cpu_not_isolated(struct sched_group *sg)
10421 cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
10422 cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
10423 return cpumask_first(&cpus);
10426 static int should_we_balance(struct lb_env *env)
10428 struct sched_group *sg = env->sd->groups;
10429 struct cpumask *sg_cpus, *sg_mask;
10430 int cpu, balance_cpu = -1;
10433 * In the newly idle case, we will allow all the cpu's
10434 * to do the newly idle load balance.
10436 if (env->idle == CPU_NEWLY_IDLE)
10439 sg_cpus = sched_group_cpus(sg);
10440 sg_mask = sched_group_mask(sg);
10441 /* Try to find first idle cpu */
10442 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
10443 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
10451 if (balance_cpu == -1)
10452 balance_cpu = group_balance_cpu_not_isolated(sg);
10455 * First idle cpu or the first cpu (busiest) in this sched group
10456 * is eligible for doing load balancing at this and above domains.
10458 return balance_cpu == env->dst_cpu;
10462 * Check this_cpu to ensure it is balanced within domain. Attempt to move
10463 * tasks if there is an imbalance.
10465 static int load_balance(int this_cpu, struct rq *this_rq,
10466 struct sched_domain *sd, enum cpu_idle_type idle,
10467 int *continue_balancing)
10469 int ld_moved = 0, cur_ld_moved, active_balance = 0;
10470 struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
10471 struct sched_group *group = NULL;
10472 struct rq *busiest = NULL;
10473 unsigned long flags;
10474 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
10476 struct lb_env env = {
10478 .dst_cpu = this_cpu,
10480 .dst_grpmask = sched_group_cpus(sd->groups),
10482 .loop_break = sched_nr_migrate_break,
10485 .tasks = LIST_HEAD_INIT(env.tasks),
10489 .busiest_nr_running = 0,
10490 .busiest_grp_capacity = 0,
10491 .boost_policy = sched_boost_policy(),
10495 * For NEWLY_IDLE load_balancing, we don't need to consider
10496 * other cpus in our group
10498 if (idle == CPU_NEWLY_IDLE)
10499 env.dst_grpmask = NULL;
10501 cpumask_copy(cpus, cpu_active_mask);
10503 schedstat_inc(sd, lb_count[idle]);
10506 if (!should_we_balance(&env)) {
10507 *continue_balancing = 0;
10511 group = find_busiest_group(&env);
10513 schedstat_inc(sd, lb_nobusyg[idle]);
10517 busiest = find_busiest_queue(&env, group);
10519 schedstat_inc(sd, lb_nobusyq[idle]);
10523 BUG_ON(busiest == env.dst_rq);
10525 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
10527 env.src_cpu = busiest->cpu;
10528 env.src_rq = busiest;
10531 if (busiest->nr_running > 1) {
10533 * Attempt to move tasks. If find_busiest_group has found
10534 * an imbalance but busiest->nr_running <= 1, the group is
10535 * still unbalanced. ld_moved simply stays zero, so it is
10536 * correctly treated as an imbalance.
10538 env.flags |= LBF_ALL_PINNED;
10539 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
10542 raw_spin_lock_irqsave(&busiest->lock, flags);
10543 update_rq_clock(busiest);
10545 /* The world might have changed. Validate assumptions */
10546 if (busiest->nr_running <= 1) {
10547 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10548 env.flags &= ~LBF_ALL_PINNED;
10553 * cur_ld_moved - load moved in current iteration
10554 * ld_moved - cumulative load moved across iterations
10556 cur_ld_moved = detach_tasks(&env);
10559 * We've detached some tasks from busiest_rq. Every
10560 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
10561 * unlock busiest->lock, and we are able to be sure
10562 * that nobody can manipulate the tasks in parallel.
10563 * See task_rq_lock() family for the details.
10566 raw_spin_unlock(&busiest->lock);
10568 if (cur_ld_moved) {
10569 attach_tasks(&env);
10570 ld_moved += cur_ld_moved;
10573 local_irq_restore(flags);
10575 if (env.flags & LBF_NEED_BREAK) {
10576 env.flags &= ~LBF_NEED_BREAK;
10581 * Revisit (affine) tasks on src_cpu that couldn't be moved to
10582 * us and move them to an alternate dst_cpu in our sched_group
10583 * where they can run. The upper limit on how many times we
10584 * iterate on the same src_cpu depends on the number of cpus in our sched_group.
10587 * This changes load balance semantics a bit on who can move
10588 * load to a given_cpu. In addition to the given_cpu itself
10589 * (or an ilb_cpu acting on its behalf where given_cpu is
10590 * nohz-idle), we now have balance_cpu in a position to move
10591 * load to given_cpu. In rare situations, this may cause
10592 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
10593 * _independently_ and at _same_ time to move some load to
10594 * given_cpu) causing excess load to be moved to given_cpu.
10595 * This however should not happen so much in practice and
10596 * moreover subsequent load balance cycles should correct the
10597 * excess load moved.
10599 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
10601 /* Prevent re-selecting dst_cpu via env's cpus */
10602 cpumask_clear_cpu(env.dst_cpu, env.cpus);
10604 env.dst_rq = cpu_rq(env.new_dst_cpu);
10605 env.dst_cpu = env.new_dst_cpu;
10606 env.flags &= ~LBF_DST_PINNED;
10608 env.loop_break = sched_nr_migrate_break;
10611 * Go back to "more_balance" rather than "redo" since we
10612 * need to continue with same src_cpu.
10618 * We failed to reach balance because of affinity.
10621 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10623 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
10624 *group_imbalance = 1;
10627 /* All tasks on this runqueue were pinned by CPU affinity */
10628 if (unlikely(env.flags & LBF_ALL_PINNED)) {
10629 cpumask_clear_cpu(cpu_of(busiest), cpus);
10630 if (!cpumask_empty(cpus)) {
10632 env.loop_break = sched_nr_migrate_break;
10635 goto out_all_pinned;
10641 if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
10642 schedstat_inc(sd, lb_failed[idle]);
10645 * Increment the failure counter only on periodic balance.
10646 * We do not want newidle balance, which can be very
10647 * frequent, pollute the failure counter causing
10648 * excessive cache_hot migrations and active balances.
10650 if (idle != CPU_NEWLY_IDLE &&
10651 !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
10652 if (env.src_grp_nr_running > 1)
10653 sd->nr_balance_failed++;
10656 if (need_active_balance(&env)) {
10657 raw_spin_lock_irqsave(&busiest->lock, flags);
10659 /* don't kick the active_load_balance_cpu_stop,
10660 * if the curr task on busiest cpu can't be
10661 * moved to this_cpu
10663 if (!cpumask_test_cpu(this_cpu,
10664 tsk_cpus_allowed(busiest->curr))) {
10665 raw_spin_unlock_irqrestore(&busiest->lock,
10667 env.flags |= LBF_ALL_PINNED;
10668 goto out_one_pinned;
10672 * ->active_balance synchronizes accesses to
10673 * ->active_balance_work. Once set, it's cleared
10674 * only after active load balance is finished.
10676 if (!busiest->active_balance &&
10677 !cpu_isolated(cpu_of(busiest))) {
10678 busiest->active_balance = 1;
10679 busiest->push_cpu = this_cpu;
10680 active_balance = 1;
10682 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10684 if (active_balance) {
10685 stop_one_cpu_nowait(cpu_of(busiest),
10686 active_load_balance_cpu_stop, busiest,
10687 &busiest->active_balance_work);
10688 *continue_balancing = 0;
10692 * We've kicked active balancing, reset the failure counter.
10695 sd->nr_balance_failed =
10696 sd->cache_nice_tries +
10697 NEED_ACTIVE_BALANCE_THRESHOLD - 1;
10700 sd->nr_balance_failed = 0;
10702 /* Assumes one 'busiest' cpu that we pulled tasks from */
10703 if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
10704 int check_groups = !!(env.flags &
10705 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
10707 check_for_freq_change(this_rq, false, check_groups);
10708 check_for_freq_change(busiest, false, check_groups);
10710 check_for_freq_change(this_rq, true, false);
10713 if (likely(!active_balance)) {
10714 /* We were unbalanced, so reset the balancing interval */
10715 sd->balance_interval = sd->min_interval;
10718 * If we've begun active balancing, start to back off. This
10719 * case may not be covered by the all_pinned logic if there
10720 * is only 1 task on the busy runqueue (because we don't call detach_tasks).
10723 if (sd->balance_interval < sd->max_interval)
10724 sd->balance_interval *= 2;
10731 * We reach balance although we may have faced some affinity
10732 * constraints. Clear the imbalance flag if it was set.
10735 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10737 if (*group_imbalance)
10738 *group_imbalance = 0;
10743 * We reach balance because all tasks are pinned at this level so
10744 * we can't migrate them. Leave the imbalance flag set so the parent level
10745 * can try to migrate them.
10747 schedstat_inc(sd, lb_balanced[idle]);
10749 sd->nr_balance_failed = 0;
10752 /* tune up the balancing interval */
10753 if (((env.flags & LBF_ALL_PINNED) &&
10754 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10755 (sd->balance_interval < sd->max_interval))
10756 sd->balance_interval *= 2;
10760 trace_sched_load_balance(this_cpu, idle, *continue_balancing,
10761 group ? group->cpumask[0] : 0,
10762 busiest ? busiest->nr_running : 0,
10763 env.imbalance, env.flags, ld_moved,
10764 sd->balance_interval);
10768 static inline unsigned long
10769 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10771 unsigned long interval = sd->balance_interval;
10774 interval *= sd->busy_factor;
10776 /* scale ms to jiffies */
10777 interval = msecs_to_jiffies(interval);
10778 interval = clamp(interval, 1UL, max_load_balance_interval);
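/*
 * Example with assumed tunables: balance_interval = 8 (ms) and
 * busy_factor = 32 give a busy-cpu interval of 256 ms, which is then
 * converted to jiffies and clamped to max_load_balance_interval.
 */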
10784 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
10786 unsigned long interval, next;
10788 interval = get_sd_balance_interval(sd, cpu_busy);
10789 next = sd->last_balance + interval;
10791 if (time_after(*next_balance, next))
10792 *next_balance = next;
10796 * idle_balance is called by schedule() if this_cpu is about to become
10797 * idle. Attempts to pull tasks from other CPUs.
10799 static int idle_balance(struct rq *this_rq)
10801 unsigned long next_balance = jiffies + HZ;
10802 int this_cpu = this_rq->cpu;
10803 struct sched_domain *sd;
10804 int pulled_task = 0;
10807 if (cpu_isolated(this_cpu))
10810 idle_enter_fair(this_rq);
10813 * We must set idle_stamp _before_ calling idle_balance(), such that we
10814 * measure the duration of idle_balance() as idle time.
10816 this_rq->idle_stamp = rq_clock(this_rq);
10818 if (!energy_aware() &&
10819 (this_rq->avg_idle < sysctl_sched_migration_cost ||
10820 !this_rq->rd->overload)) {
10822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10824 update_next_balance(sd, 0, &next_balance);
10830 raw_spin_unlock(&this_rq->lock);
10832 update_blocked_averages(this_cpu);
10834 for_each_domain(this_cpu, sd) {
10835 int continue_balancing = 1;
10836 u64 t0, domain_cost;
10838 if (!(sd->flags & SD_LOAD_BALANCE))
10841 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10842 update_next_balance(sd, 0, &next_balance);
10846 if (sd->flags & SD_BALANCE_NEWIDLE) {
10847 t0 = sched_clock_cpu(this_cpu);
10849 pulled_task = load_balance(this_cpu, this_rq,
10850 sd, CPU_NEWLY_IDLE,
10851 &continue_balancing);
10853 domain_cost = sched_clock_cpu(this_cpu) - t0;
10854 if (domain_cost > sd->max_newidle_lb_cost)
10855 sd->max_newidle_lb_cost = domain_cost;
10857 curr_cost += domain_cost;
10860 update_next_balance(sd, 0, &next_balance);
10863 * Stop searching for tasks to pull if there are
10864 * now runnable tasks on the balance rq or if
10865 * continue_balancing has been unset (only possible
10866 * due to active migration).
10868 if (pulled_task || this_rq->nr_running > 0 ||
10869 !continue_balancing)
10874 raw_spin_lock(&this_rq->lock);
10876 if (curr_cost > this_rq->max_idle_balance_cost)
10877 this_rq->max_idle_balance_cost = curr_cost;
10880 * While browsing the domains, we released the rq lock, a task could
10881 * have been enqueued in the meantime. Since we're not going idle,
10882 * pretend we pulled a task.
10884 if (this_rq->cfs.h_nr_running && !pulled_task)
10888 /* Move the next balance forward */
10889 if (time_after(this_rq->next_balance, next_balance))
10890 this_rq->next_balance = next_balance;
10892 /* Is there a task of a high priority class? */
10893 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10897 idle_exit_fair(this_rq);
10898 this_rq->idle_stamp = 0;
10901 return pulled_task;
10905 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
10906 * running tasks off the busiest CPU onto idle CPUs. It requires at
10907 * least 1 task to be running on each physical CPU where possible, and
10908 * avoids physical / logical imbalances.
10910 static int active_load_balance_cpu_stop(void *data)
10912 struct rq *busiest_rq = data;
10913 int busiest_cpu = cpu_of(busiest_rq);
10914 int target_cpu = busiest_rq->push_cpu;
10915 struct rq *target_rq = cpu_rq(target_cpu);
10916 struct sched_domain *sd = NULL;
10917 struct task_struct *p = NULL;
10918 struct task_struct *push_task = NULL;
10919 int push_task_detached = 0;
10920 struct lb_env env = {
10922 .dst_cpu = target_cpu,
10923 .dst_rq = target_rq,
10924 .src_cpu = busiest_rq->cpu,
10925 .src_rq = busiest_rq,
10927 .busiest_nr_running = 0,
10928 .busiest_grp_capacity = 0,
10931 .boost_policy = sched_boost_policy(),
10933 bool moved = false;
10935 raw_spin_lock_irq(&busiest_rq->lock);
10937 /* make sure the requested cpu hasn't gone down in the meantime */
10938 if (unlikely(busiest_cpu != smp_processor_id() ||
10939 !busiest_rq->active_balance))
10942 /* Is there any task to move? */
10943 if (busiest_rq->nr_running <= 1)
10947 * This condition is "impossible", if it occurs
10948 * we need to fix it. Originally reported by
10949 * Bjorn Helgaas on a 128-cpu setup.
10951 BUG_ON(busiest_rq == target_rq);
10953 push_task = busiest_rq->push_task;
10954 target_cpu = busiest_rq->push_cpu;
10956 if (task_on_rq_queued(push_task) &&
10957 push_task->state == TASK_RUNNING &&
10958 task_cpu(push_task) == busiest_cpu &&
10959 cpu_online(target_cpu)) {
10960 detach_task(push_task, &env);
10961 push_task_detached = 1;
10967 /* Search for an sd spanning us and the target CPU. */
10969 for_each_domain(target_cpu, sd) {
10970 if ((sd->flags & SD_LOAD_BALANCE) &&
10971 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10977 schedstat_inc(sd, alb_count);
10978 update_rq_clock(busiest_rq);
10980 p = detach_one_task(&env);
10982 schedstat_inc(sd, alb_pushed);
10985 schedstat_inc(sd, alb_failed);
10990 busiest_rq->active_balance = 0;
10991 push_task = busiest_rq->push_task;
10992 target_cpu = busiest_rq->push_cpu;
10995 busiest_rq->push_task = NULL;
10997 raw_spin_unlock(&busiest_rq->lock);
11000 if (push_task_detached)
11001 attach_one_task(target_rq, push_task);
11002 put_task_struct(push_task);
11003 clear_reserved(target_cpu);
11007 attach_one_task(target_rq, p);
11009 local_irq_enable();
11011 if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
11012 int check_groups = !!(env.flags &
11013 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
11014 check_for_freq_change(busiest_rq, false, check_groups);
11015 check_for_freq_change(target_rq, false, check_groups);
11016 } else if (moved) {
11017 check_for_freq_change(target_rq, true, false);
11023 static inline int on_null_domain(struct rq *rq)
11025 return unlikely(!rcu_dereference_sched(rq->sd));
11028 #ifdef CONFIG_NO_HZ_COMMON
11030 * idle load balancing details
11031 * - When one of the busy CPUs notices that idle rebalancing may be
11032 * needed, it kicks the idle load balancer, which then does idle
11033 * load balancing for all the idle CPUs.
11036 #ifdef CONFIG_SCHED_HMP
11037 static inline int find_new_hmp_ilb(int type)
11039 int call_cpu = raw_smp_processor_id();
11040 struct sched_domain *sd;
11045 /* Pick an idle cpu "closest" to call_cpu */
11046 for_each_domain(call_cpu, sd) {
11047 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
11048 sched_domain_span(sd)) {
11049 if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
11050 cpu_max_power_cost(ilb) <=
11051 cpu_max_power_cost(call_cpu))) {
11053 reset_balance_interval(ilb);
11062 #else /* CONFIG_SCHED_HMP */
11063 static inline int find_new_hmp_ilb(int type)
11067 #endif /* CONFIG_SCHED_HMP */
11069 static inline int find_new_ilb(int type)
11073 #ifdef CONFIG_SCHED_HMP
11074 return find_new_hmp_ilb(type);
11077 ilb = cpumask_first(nohz.idle_cpus_mask);
11079 if (ilb < nr_cpu_ids && idle_cpu(ilb))
11086 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
11087 * nohz_load_balancer CPU (if there is one); otherwise fall back to any idle
11088 * CPU (if there is one).
11090 static void nohz_balancer_kick(int type)
11094 nohz.next_balance++;
11096 ilb_cpu = find_new_ilb(type);
11098 if (ilb_cpu >= nr_cpu_ids)
11101 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
11104 * Use smp_send_reschedule() instead of resched_cpu().
11105 * This way we generate a sched IPI on the target cpu which
11106 * is idle. And the softirq performing nohz idle load balance
11107 * will be run before returning from the IPI.
11109 smp_send_reschedule(ilb_cpu);
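/*
 * Illustrative sketch (not built): the kick above is made idempotent by
 * test_and_set_bit() -- only the caller that flips NOHZ_BALANCE_KICK from
 * 0 to 1 sends the IPI; concurrent callers see the bit already set and
 * back off. The helper name below is hypothetical.
 */
#if 0
static bool try_nohz_kick(unsigned long *flags_word)
{
	if (test_and_set_bit(NOHZ_BALANCE_KICK, flags_word))
		return false;	/* someone already kicked this CPU */
	return true;		/* we own the kick and must send the IPI */
}
#endif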
11113 void nohz_balance_clear_nohz_mask(int cpu)
11115 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
11116 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
11117 atomic_dec(&nohz.nr_cpus);
11121 static inline void nohz_balance_exit_idle(int cpu)
11123 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
11125 * Completely isolated CPUs never set NOHZ_TICK_STOPPED, so we must test.
11127 nohz_balance_clear_nohz_mask(cpu);
11128 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11132 static inline void set_cpu_sd_state_busy(void)
11134 struct sched_domain *sd;
11135 int cpu = smp_processor_id();
11138 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11140 if (!sd || !sd->nohz_idle)
11144 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
11149 void set_cpu_sd_state_idle(void)
11151 struct sched_domain *sd;
11152 int cpu = smp_processor_id();
11155 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11157 if (!sd || sd->nohz_idle)
11161 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
11167 * This routine will record that the cpu is going idle with tick stopped.
11168 * This info will be used in performing idle load balancing in the future.
11170 void nohz_balance_enter_idle(int cpu)
11173 * If this cpu is going down, then nothing needs to be done.
11175 if (!cpu_active(cpu))
11178 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
11182 * If we're a completely isolated CPU, we don't play.
11184 if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
11187 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
11188 atomic_inc(&nohz.nr_cpus);
11189 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
11192 static int sched_ilb_notifier(struct notifier_block *nfb,
11193 unsigned long action, void *hcpu)
11195 switch (action & ~CPU_TASKS_FROZEN) {
11197 nohz_balance_exit_idle(smp_processor_id());
11200 return NOTIFY_DONE;
11205 static DEFINE_SPINLOCK(balancing);
11208 * Scale the max load_balance interval with the number of CPUs in the system.
11209 * This trades load-balance latency on larger machines for less cross talk.
11211 void update_max_interval(void)
11213 cpumask_t avail_mask;
11214 unsigned int available_cpus;
11216 cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
11217 available_cpus = cpumask_weight(&avail_mask);
11219 max_load_balance_interval = HZ*available_cpus/10;
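/*
 * Worked example for the formula above (assuming HZ=250 purely for
 * illustration): with 8 online, non-isolated CPUs the maximum interval is
 * 250 * 8 / 10 = 200 jiffies, i.e. about 0.8 seconds; with 64 such CPUs it
 * grows to 1600 jiffies (~6.4s), trading balance latency on large machines
 * for less cross-CPU traffic.
 */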
11223 * It checks each scheduling domain to see if it is due to be balanced,
11224 * and initiates a balancing operation if so.
11226 * Balancing parameters are set up in init_sched_domains.
11228 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
11230 int continue_balancing = 1;
11232 unsigned long interval;
11233 struct sched_domain *sd;
11234 /* Earliest time when we have to do rebalance again */
11235 unsigned long next_balance = jiffies + 60*HZ;
11236 int update_next_balance = 0;
11237 int need_serialize, need_decay = 0;
11240 update_blocked_averages(cpu);
11243 for_each_domain(cpu, sd) {
11245 * Decay the newidle max times here because this is a regular
11246 * visit to all the domains. Decay ~1% per second.
11248 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
11249 sd->max_newidle_lb_cost =
11250 (sd->max_newidle_lb_cost * 253) / 256;
11251 sd->next_decay_max_lb_cost = jiffies + HZ;
11254 max_cost += sd->max_newidle_lb_cost;
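/*
 * The decay factor above is 253/256 ~= 0.9883, so each HZ-spaced visit
 * removes roughly 1.2% of the recorded maximum; after about a minute an
 * otherwise idle estimate has decayed to ~0.9883^60 ~= 49% of its peak.
 */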
11256 if (!(sd->flags & SD_LOAD_BALANCE))
11260 * Stop the load balance at this level. There is another
11261 * CPU in our sched group which is doing load balancing more actively.
11264 if (!continue_balancing) {
11270 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11272 need_serialize = sd->flags & SD_SERIALIZE;
11273 if (need_serialize) {
11274 if (!spin_trylock(&balancing))
11278 if (time_after_eq(jiffies, sd->last_balance + interval)) {
11279 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
11281 * The LBF_DST_PINNED logic could have changed
11282 * env->dst_cpu, so we can't know our idle
11283 * state even if we migrated tasks. Update it.
11285 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
11287 sd->last_balance = jiffies;
11288 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
11290 if (need_serialize)
11291 spin_unlock(&balancing);
11293 if (time_after(next_balance, sd->last_balance + interval)) {
11294 next_balance = sd->last_balance + interval;
11295 update_next_balance = 1;
11300 * Ensure the rq-wide value also decays but keep it at a
11301 * reasonable floor to avoid funnies with rq->avg_idle.
11303 rq->max_idle_balance_cost =
11304 max((u64)sysctl_sched_migration_cost, max_cost);
11309 * next_balance will be updated only when there is a need.
11310 * When the cpu is attached to a null domain, for example, it will not be updated.
11313 if (likely(update_next_balance)) {
11314 rq->next_balance = next_balance;
11316 #ifdef CONFIG_NO_HZ_COMMON
11318 * If this CPU has been elected to perform the nohz idle
11319 * balance, the other idle CPUs have already rebalanced with
11320 * nohz_idle_balance() and nohz.next_balance has been
11321 * updated accordingly. This CPU is now running the idle load
11322 * balance for itself, so we need to update
11323 * nohz.next_balance accordingly.
11325 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
11326 nohz.next_balance = rq->next_balance;
11331 #ifdef CONFIG_NO_HZ_COMMON
11333 * In the CONFIG_NO_HZ_COMMON case, the kicked idle CPU does the
11334 * rebalancing for all the cpus whose scheduler ticks are stopped.
11336 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
11338 int this_cpu = this_rq->cpu;
11341 /* Earliest time when we have to do rebalance again */
11342 unsigned long next_balance = jiffies + 60*HZ;
11343 int update_next_balance = 0;
11346 if (idle != CPU_IDLE ||
11347 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
11350 cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
11352 for_each_cpu(balance_cpu, &cpus) {
11353 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
11357 * If this cpu gets work to do, stop the load balancing
11358 * work being done for other cpus. Next load
11359 * balancing owner will pick it up.
11361 if (need_resched())
11364 rq = cpu_rq(balance_cpu);
11367 * If the time for the next balance is due, do the balance.
11370 if (time_after_eq(jiffies, rq->next_balance)) {
11371 raw_spin_lock_irq(&rq->lock);
11372 update_rq_clock(rq);
11373 update_idle_cpu_load(rq);
11374 raw_spin_unlock_irq(&rq->lock);
11375 rebalance_domains(rq, CPU_IDLE);
11378 if (time_after(next_balance, rq->next_balance)) {
11379 next_balance = rq->next_balance;
11380 update_next_balance = 1;
11385 * next_balance will be updated only when there is a need.
11386 * When the CPU is attached to a null domain, for example, it will not be updated.
11389 if (likely(update_next_balance))
11390 nohz.next_balance = next_balance;
11392 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
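/*
 * Illustrative sketch (not built) of the loop above: the kicked CPU
 * balances on behalf of every tick-stopped CPU but aborts as soon as it
 * has pending work of its own, leaving the rest to the next ilb owner.
 * The helper name and array inputs are hypothetical.
 */
#if 0
static void ilb_on_behalf_of(const int *idle_cpus, int nr_idle, int this_cpu)
{
	int i;

	for (i = 0; i < nr_idle; i++) {
		if (idle_cpus[i] == this_cpu)
			continue;	/* this CPU balances itself afterwards */
		if (need_resched())
			break;		/* got work: stop helping others */
		/* rebalance_domains(cpu_rq(idle_cpus[i]), CPU_IDLE); */
	}
}
#endif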
11395 #ifdef CONFIG_SCHED_HMP
11396 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11398 struct sched_domain *sd;
11401 if (rq->nr_running < 2)
11404 if (!sysctl_sched_restrict_cluster_spill ||
11405 sched_boost_policy() == SCHED_BOOST_ON_ALL)
11408 if (cpu_max_power_cost(cpu) == max_power_cost)
11412 sd = rcu_dereference_check_sched_domain(rq->sd);
11418 for_each_cpu(i, sched_domain_span(sd)) {
11419 if (cpu_load(i) < sched_spill_load &&
11420 cpu_rq(i)->nr_running <
11421 sysctl_sched_spill_nr_run) {
11422 /* Change the kick type to limit it to CPUs that
11423 * are of equal or lower capacity.
11425 *type = NOHZ_KICK_RESTRICT;
11433 static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
11439 static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
11441 unsigned long now = jiffies;
11444 * None are in tickless mode and hence there is no need for NOHZ idle load balancing.
11447 if (likely(!atomic_read(&nohz.nr_cpus)))
11450 #ifdef CONFIG_SCHED_HMP
11451 return _nohz_kick_needed_hmp(rq, cpu, type);
11454 if (time_before(now, nohz.next_balance))
11457 if (rq->nr_running >= 2 &&
11458 (!energy_aware() || cpu_overutilized(cpu)))
11462 /* Do idle load balance if there is a misfit task */
11462 if (energy_aware())
11463 return rq->misfit_task;
11465 return (rq->nr_running >= 2);
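/*
 * Illustrative sketch (not built): stripped of the HMP path, the decision
 * above reduces to "two or more runnable tasks", with energy-aware
 * scheduling additionally demanding an overutilized CPU, or falling back
 * to the misfit-task flag. Plain-C model with hypothetical inputs:
 */
#if 0
static int kick_decision(int nr_running, int ea, int overutilized, int misfit)
{
	if (nr_running >= 2 && (!ea || overutilized))
		return 1;
	if (ea)
		return misfit;
	return nr_running >= 2;
}
#endif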
11469 * Current heuristic for kicking the idle load balancer in the presence
11470 * of an idle cpu in the system.
11471 * - This rq has more than one task.
11472 * - This rq has at least one CFS task and the capacity of the CPU is
11473 * significantly reduced because of RT tasks or IRQs.
11474 * - At the parent of the LLC scheduler domain level, this cpu's scheduler group has
11475 * multiple busy CPUs.
11476 * - For SD_ASYM_PACKING, if the lower-numbered CPUs in the scheduler
11477 * domain span are idle.
11479 static inline bool nohz_kick_needed(struct rq *rq, int *type)
11481 #ifndef CONFIG_SCHED_HMP
11482 struct sched_domain *sd;
11483 struct sched_group_capacity *sgc;
11489 if (unlikely(rq->idle_balance))
11493 * We may be recently in ticked or tickless idle mode. At the first
11494 * busy tick after returning from idle, we will update the busy stats.
11496 set_cpu_sd_state_busy();
11497 nohz_balance_exit_idle(cpu);
11499 if (_nohz_kick_needed(rq, cpu, type))
11502 #ifndef CONFIG_SCHED_HMP
11504 sd = rcu_dereference(per_cpu(sd_busy, cpu));
11506 sgc = sd->groups->sgc;
11507 nr_busy = atomic_read(&sgc->nr_busy_cpus);
11516 sd = rcu_dereference(rq->sd);
11518 if ((rq->cfs.h_nr_running >= 1) &&
11519 check_cpu_capacity(rq, sd)) {
11525 sd = rcu_dereference(per_cpu(sd_asym, cpu));
11526 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
11527 sched_domain_span(sd)) < cpu)) {
11538 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
11542 * run_rebalance_domains is triggered when needed from the scheduler tick.
11543 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
11545 static void run_rebalance_domains(struct softirq_action *h)
11547 struct rq *this_rq = this_rq();
11548 enum cpu_idle_type idle = this_rq->idle_balance ?
11549 CPU_IDLE : CPU_NOT_IDLE;
11552 * If this cpu has a pending nohz_balance_kick, then do the
11553 * balancing on behalf of the other idle cpus whose ticks are
11554 * stopped. Do nohz_idle_balance *before* rebalance_domains to
11555 * give the idle cpus a chance to load balance. Else we may
11556 * load balance only within the local sched_domain hierarchy
11557 * and abort nohz_idle_balance altogether if we pull some load.
11559 nohz_idle_balance(this_rq, idle);
11560 rebalance_domains(this_rq, idle);
11564 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
11566 void trigger_load_balance(struct rq *rq)
11568 int type = NOHZ_KICK_ANY;
11570 /* Don't need to rebalance while attached to NULL domain or the cpu is isolated. */
11573 if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
11576 if (time_after_eq(jiffies, rq->next_balance))
11577 raise_softirq(SCHED_SOFTIRQ);
11578 #ifdef CONFIG_NO_HZ_COMMON
11579 if (nohz_kick_needed(rq, &type))
11580 nohz_balancer_kick(type);
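/*
 * Summary of the trigger above: two independent checks per tick -- raise
 * SCHED_SOFTIRQ locally once rq->next_balance is due, and, under
 * CONFIG_NO_HZ_COMMON, kick a remote idle CPU via nohz_balancer_kick()
 * when nohz_kick_needed() decides tick-stopped CPUs need balancing.
 */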
11584 static void rq_online_fair(struct rq *rq)
11588 update_runtime_enabled(rq);
11591 static void rq_offline_fair(struct rq *rq)
11595 /* Ensure any throttled groups are reachable by pick_next_task */
11596 unthrottle_offline_cfs_rqs(rq);
11599 #endif /* CONFIG_SMP */
11602 * scheduler tick hitting a task of our scheduling class:
11604 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
11606 struct cfs_rq *cfs_rq;
11607 struct sched_entity *se = &curr->se;
11609 for_each_sched_entity(se) {
11610 cfs_rq = cfs_rq_of(se);
11611 entity_tick(cfs_rq, se, queued);
11614 if (static_branch_unlikely(&sched_numa_balancing))
11615 task_tick_numa(rq, curr);
11618 if (energy_aware() &&
11619 !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
11620 rq->rd->overutilized = true;
11621 trace_sched_overutilized(true);
11624 rq->misfit_task = !task_fits_max(curr, rq->cpu);
11630 * called on fork with the child task as argument from the parent's context
11631 * - child not yet on the tasklist
11632 * - preemption disabled
11634 static void task_fork_fair(struct task_struct *p)
11636 struct cfs_rq *cfs_rq;
11637 struct sched_entity *se = &p->se, *curr;
11638 struct rq *rq = this_rq();
11640 raw_spin_lock(&rq->lock);
11641 update_rq_clock(rq);
11643 cfs_rq = task_cfs_rq(current);
11644 curr = cfs_rq->curr;
11646 update_curr(cfs_rq);
11647 se->vruntime = curr->vruntime;
11649 place_entity(cfs_rq, se, 1);
11651 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11653 * Upon rescheduling, sched_class::put_prev_task() will place
11654 * 'current' within the tree based on its new key value.
11656 swap(curr->vruntime, se->vruntime);
11660 se->vruntime -= cfs_rq->min_vruntime;
11661 raw_spin_unlock(&rq->lock);
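/*
 * Illustrative sketch (not built) of the fork-time vruntime handling
 * above: place_entity() pushes the child behind the parent, the optional
 * swap makes the child run first, and subtracting min_vruntime makes the
 * value relative so it can be re-based on whichever cfs_rq the child is
 * eventually enqueued on. Helper name and the place_offset parameter are
 * hypothetical.
 */
#if 0
static void fork_vruntime(u64 *parent_vr, u64 *child_vr, u64 place_offset,
			  u64 min_vruntime, int child_runs_first)
{
	*child_vr = *parent_vr + place_offset;	/* initial placement penalty */
	if (child_runs_first && *parent_vr < *child_vr)
		swap(*parent_vr, *child_vr);	/* give the child the earlier vruntime */
	*child_vr -= min_vruntime;		/* made relative; re-based at enqueue */
}
#endif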
11665 * Priority of the task has changed. Check to see if we preempt
11666 * the current task.
11669 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11671 if (!task_on_rq_queued(p))
11675 * Reschedule if we are currently running on this runqueue and
11676 * our priority decreased, or if we are not currently running on
11677 * this runqueue and our priority is higher than the current's
11679 if (rq->curr == p) {
11680 if (p->prio > oldprio)
11683 check_preempt_curr(rq, p, 0);
11686 static inline bool vruntime_normalized(struct task_struct *p)
11688 struct sched_entity *se = &p->se;
11691 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11692 * the dequeue_entity(.flags=0) will already have normalized the vruntime.
11699 * When !on_rq, vruntime of the task has usually NOT been normalized.
11700 * But there are some cases where it has already been normalized:
11702 * - A forked child that is waiting to be woken up by
11703 * wake_up_new_task().
11704 * - A task that has been woken up by try_to_wake_up() and is
11705 * waiting to actually be woken up by sched_ttwu_pending().
11707 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
11713 #ifdef CONFIG_FAIR_GROUP_SCHED
11715 * Propagate the changes of the sched_entity across the tg tree to make
11716 * them visible to the root.
11718 static void propagate_entity_cfs_rq(struct sched_entity *se)
11720 struct cfs_rq *cfs_rq;
11722 /* Start to propagate at parent */
11725 for_each_sched_entity(se) {
11726 cfs_rq = cfs_rq_of(se);
11728 if (cfs_rq_throttled(cfs_rq))
11731 update_load_avg(se, UPDATE_TG);
11735 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11738 static void detach_entity_cfs_rq(struct sched_entity *se)
11740 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11742 /* Catch up with the cfs_rq and remove our load when we leave */
11743 update_load_avg(se, 0);
11744 detach_entity_load_avg(cfs_rq, se);
11745 update_tg_load_avg(cfs_rq, false);
11746 propagate_entity_cfs_rq(se);
11749 static void attach_entity_cfs_rq(struct sched_entity *se)
11751 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11753 #ifdef CONFIG_FAIR_GROUP_SCHED
11755 * Since the real depth could have been changed (only the FAIR
11756 * class maintains the depth value), reset the depth properly.
11758 se->depth = se->parent ? se->parent->depth + 1 : 0;
11761 /* Synchronize entity with its cfs_rq */
11762 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11763 attach_entity_load_avg(cfs_rq, se);
11764 update_tg_load_avg(cfs_rq, false);
11765 propagate_entity_cfs_rq(se);
11768 static void detach_task_cfs_rq(struct task_struct *p)
11770 struct sched_entity *se = &p->se;
11771 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11773 if (!vruntime_normalized(p)) {
11775 * Fix up our vruntime so that the current sleep doesn't
11776 * cause an 'unlimited' sleep bonus.
11778 place_entity(cfs_rq, se, 0);
11779 se->vruntime -= cfs_rq->min_vruntime;
11782 detach_entity_cfs_rq(se);
11785 static void attach_task_cfs_rq(struct task_struct *p)
11787 struct sched_entity *se = &p->se;
11788 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11790 attach_entity_cfs_rq(se);
11792 if (!vruntime_normalized(p))
11793 se->vruntime += cfs_rq->min_vruntime;
11796 static void switched_from_fair(struct rq *rq, struct task_struct *p)
11798 detach_task_cfs_rq(p);
11801 static void switched_to_fair(struct rq *rq, struct task_struct *p)
11803 attach_task_cfs_rq(p);
11805 if (task_on_rq_queued(p)) {
11807 * We were most likely switched from sched_rt, so
11808 * kick off the schedule if running, otherwise just see
11809 * if we can still preempt the current task.
11814 check_preempt_curr(rq, p, 0);
11818 /* Account for a task changing its policy or group.
11820 * This routine is mostly called to set cfs_rq->curr field when a task
11821 * migrates between groups/classes.
11823 static void set_curr_task_fair(struct rq *rq)
11825 struct sched_entity *se = &rq->curr->se;
11827 for_each_sched_entity(se) {
11828 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11830 set_next_entity(cfs_rq, se);
11831 /* ensure bandwidth has been allocated on our new cfs_rq */
11832 account_cfs_rq_runtime(cfs_rq, 0);
11836 void init_cfs_rq(struct cfs_rq *cfs_rq)
11838 cfs_rq->tasks_timeline = RB_ROOT;
11839 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11840 #ifndef CONFIG_64BIT
11841 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11844 #ifdef CONFIG_FAIR_GROUP_SCHED
11845 cfs_rq->propagate_avg = 0;
11847 atomic_long_set(&cfs_rq->removed_load_avg, 0);
11848 atomic_long_set(&cfs_rq->removed_util_avg, 0);
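/*
 * Note on the initialisation above: min_vruntime starts at
 * (u64)(-(1LL << 20)), about a million nanoseconds below the u64 wrap
 * point, so the signed-difference comparisons used on vruntime exercise
 * the wraparound path shortly after boot instead of only after roughly
 * 584 years of uptime.
 */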
11852 #ifdef CONFIG_FAIR_GROUP_SCHED
11853 static void task_set_group_fair(struct task_struct *p)
11855 struct sched_entity *se = &p->se;
11857 set_task_rq(p, task_cpu(p));
11858 se->depth = se->parent ? se->parent->depth + 1 : 0;
11861 static void task_move_group_fair(struct task_struct *p)
11863 detach_task_cfs_rq(p);
11864 set_task_rq(p, task_cpu(p));
11867 /* Tell the load tracking that se's cfs_rq has changed -- the task has migrated */
11868 p->se.avg.last_update_time = 0;
11870 attach_task_cfs_rq(p);
11873 static void task_change_group_fair(struct task_struct *p, int type)
11876 case TASK_SET_GROUP:
11877 task_set_group_fair(p);
11880 case TASK_MOVE_GROUP:
11881 task_move_group_fair(p);
11886 void free_fair_sched_group(struct task_group *tg)
11890 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11892 for_each_possible_cpu(i) {
11894 kfree(tg->cfs_rq[i]);
11903 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11905 struct sched_entity *se;
11906 struct cfs_rq *cfs_rq;
11910 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
11913 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
11917 tg->shares = NICE_0_LOAD;
11919 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11921 for_each_possible_cpu(i) {
11924 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11925 GFP_KERNEL, cpu_to_node(i));
11929 se = kzalloc_node(sizeof(struct sched_entity),
11930 GFP_KERNEL, cpu_to_node(i));
11934 init_cfs_rq(cfs_rq);
11935 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11936 init_entity_runnable_average(se);
11938 raw_spin_lock_irq(&rq->lock);
11939 post_init_entity_util_avg(se);
11940 raw_spin_unlock_irq(&rq->lock);
11951 void unregister_fair_sched_group(struct task_group *tg)
11953 unsigned long flags;
11957 for_each_possible_cpu(cpu) {
11959 remove_entity_load_avg(tg->se[cpu]);
11962 * Only empty task groups can be destroyed, so we can speculatively
11963 * check on_list without danger of it being re-added.
11965 if (!tg->cfs_rq[cpu]->on_list)
11970 raw_spin_lock_irqsave(&rq->lock, flags);
11971 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11972 raw_spin_unlock_irqrestore(&rq->lock, flags);
11976 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11977 struct sched_entity *se, int cpu,
11978 struct sched_entity *parent)
11980 struct rq *rq = cpu_rq(cpu);
11984 init_cfs_rq_runtime(cfs_rq);
11986 tg->cfs_rq[cpu] = cfs_rq;
11989 /* se could be NULL for root_task_group */
11994 se->cfs_rq = &rq->cfs;
11997 se->cfs_rq = parent->my_q;
11998 se->depth = parent->depth + 1;
12002 /* guarantee group entities always have weight */
12003 update_load_set(&se->load, NICE_0_LOAD);
12004 se->parent = parent;
12007 static DEFINE_MUTEX(shares_mutex);
12009 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
12012 unsigned long flags;
12015 * We can't change the weight of the root cgroup.
12020 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
12022 mutex_lock(&shares_mutex);
12023 if (tg->shares == shares)
12026 tg->shares = shares;
12027 for_each_possible_cpu(i) {
12028 struct rq *rq = cpu_rq(i);
12029 struct sched_entity *se;
12032 /* Propagate contribution to hierarchy */
12033 raw_spin_lock_irqsave(&rq->lock, flags);
12035 /* Possible calls to update_curr() need rq clock */
12036 update_rq_clock(rq);
12037 for_each_sched_entity(se) {
12038 update_load_avg(se, UPDATE_TG);
12039 update_cfs_shares(se);
12041 raw_spin_unlock_irqrestore(&rq->lock, flags);
12045 mutex_unlock(&shares_mutex);
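/*
 * Usage note for the setter above: the requested value is clamped to
 * [MIN_SHARES, MAX_SHARES] after scale_load(), a write that leaves
 * tg->shares unchanged returns without touching any runqueue, and an
 * actual change walks every possible CPU to re-propagate the group
 * entity's load up the hierarchy under that CPU's rq lock.
 */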
12048 #else /* CONFIG_FAIR_GROUP_SCHED */
12050 void free_fair_sched_group(struct task_group *tg) { }
12052 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12057 void unregister_fair_sched_group(struct task_group *tg) { }
12059 #endif /* CONFIG_FAIR_GROUP_SCHED */
12062 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
12064 struct sched_entity *se = &task->se;
12065 unsigned int rr_interval = 0;
12068 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise idle runqueue.
12071 if (rq->cfs.load.weight)
12072 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
12074 return rr_interval;
12078 * All the scheduling class methods:
12080 const struct sched_class fair_sched_class = {
12081 .next = &idle_sched_class,
12082 .enqueue_task = enqueue_task_fair,
12083 .dequeue_task = dequeue_task_fair,
12084 .yield_task = yield_task_fair,
12085 .yield_to_task = yield_to_task_fair,
12087 .check_preempt_curr = check_preempt_wakeup,
12089 .pick_next_task = pick_next_task_fair,
12090 .put_prev_task = put_prev_task_fair,
12093 .select_task_rq = select_task_rq_fair,
12094 .migrate_task_rq = migrate_task_rq_fair,
12096 .rq_online = rq_online_fair,
12097 .rq_offline = rq_offline_fair,
12099 .task_waking = task_waking_fair,
12100 .task_dead = task_dead_fair,
12101 .set_cpus_allowed = set_cpus_allowed_common,
12104 .set_curr_task = set_curr_task_fair,
12105 .task_tick = task_tick_fair,
12106 .task_fork = task_fork_fair,
12108 .prio_changed = prio_changed_fair,
12109 .switched_from = switched_from_fair,
12110 .switched_to = switched_to_fair,
12112 .get_rr_interval = get_rr_interval_fair,
12114 .update_curr = update_curr_fair,
12116 #ifdef CONFIG_FAIR_GROUP_SCHED
12117 .task_change_group = task_change_group_fair,
12119 #ifdef CONFIG_SCHED_HMP
12120 .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
12121 .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
12122 .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
12126 #ifdef CONFIG_SCHED_DEBUG
12127 void print_cfs_stats(struct seq_file *m, int cpu)
12129 struct cfs_rq *cfs_rq;
12132 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
12133 print_cfs_rq(m, cpu, cfs_rq);
12137 #ifdef CONFIG_NUMA_BALANCING
12138 void show_numa_stats(struct task_struct *p, struct seq_file *m)
12141 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
12143 for_each_online_node(node) {
12144 if (p->numa_faults) {
12145 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
12146 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
12148 if (p->numa_group) {
12149 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
12150 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
12152 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
12155 #endif /* CONFIG_NUMA_BALANCING */
12156 #endif /* CONFIG_SCHED_DEBUG */
12158 __init void init_sched_fair_class(void)
12161 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
12163 #ifdef CONFIG_NO_HZ_COMMON
12164 nohz.next_balance = jiffies;
12165 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
12166 cpu_notifier(sched_ilb_notifier, 0);