#include <linux/task_work.h>
#include <linux/module.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
#include "tune.h"
#include "walt.h"
unsigned int sysctl_sched_sync_hint_enable = 1;
unsigned int sysctl_sched_cstate_aware = 1;
-#ifdef CONFIG_SCHED_WALT
-unsigned int sysctl_sched_use_walt_cpu_util = 1;
-unsigned int sysctl_sched_use_walt_task_util = 1;
-__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
- (10 * NSEC_PER_MSEC);
-#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#ifdef CONFIG_SMP
+static int active_load_balance_cpu_stop(void *data);
+#endif
const struct sched_class fair_sched_class;
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
+#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
+u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+ u32 enabled = p->flags & PF_WAKE_UP_IDLE;
+
+ return !!enabled;
+}
+EXPORT_SYMBOL(sched_get_wake_up_idle);
+
+int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
+{
+ int enable = !!wake_up_idle;
+
+ if (enable)
+ p->flags |= PF_WAKE_UP_IDLE;
+ else
+ p->flags &= ~PF_WAKE_UP_IDLE;
+
+ return 0;
+}
+EXPORT_SYMBOL(sched_set_wake_up_idle);
+
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
return contrib + runnable_avg_yN_sum[n];
}
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
+#ifdef CONFIG_SCHED_HMP
+
+/* CPU selection flag */
+#define SBC_FLAG_PREV_CPU 0x1
+#define SBC_FLAG_BEST_CAP_CPU 0x2
+#define SBC_FLAG_CPU_COST 0x4
+#define SBC_FLAG_MIN_COST 0x8
+#define SBC_FLAG_IDLE_LEAST_LOADED 0x10
+#define SBC_FLAG_IDLE_CSTATE 0x20
+#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
+#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
+#define SBC_FLAG_CSTATE_LOAD 0x100
+#define SBC_FLAG_BEST_SIBLING 0x200
+#define SBC_FLAG_WAKER_CPU 0x400
+#define SBC_FLAG_PACK_TASK 0x800
+
+/* Cluster selection flag */
+#define SBC_FLAG_COLOC_CLUSTER 0x10000
+#define SBC_FLAG_WAKER_CLUSTER 0x20000
+#define SBC_FLAG_BACKUP_CLUSTER 0x40000
+#define SBC_FLAG_BOOST_CLUSTER 0x80000
+
+struct cpu_select_env {
+ struct task_struct *p;
+ struct related_thread_group *rtg;
+ u8 reason;
+ u8 need_idle:1;
+ u8 need_waker_cluster:1;
+ u8 sync:1;
+ enum sched_boost_policy boost_policy;
+ u8 pack_task:1;
+ int prev_cpu;
+ DECLARE_BITMAP(candidate_list, NR_CPUS);
+ DECLARE_BITMAP(backup_list, NR_CPUS);
+ u64 task_load;
+ u64 cpu_load;
+ u32 sbc_best_flag;
+ u32 sbc_best_cluster_flag;
+ struct cpumask search_cpus;
+};
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+struct cluster_cpu_stats {
+ int best_idle_cpu, least_loaded_cpu;
+ int best_capacity_cpu, best_cpu, best_sibling_cpu;
+ int min_cost, best_sibling_cpu_cost;
+ int best_cpu_wakeup_latency;
+ u64 min_load, best_load, best_sibling_cpu_load;
+ s64 highest_spare_capacity;
+};
/*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series. To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
- *
- * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
- * p0 p1 p2
- * (now) (~1ms ago) (~2ms ago)
- *
- * Let u_i denote the fraction of p_i that the entity was runnable.
- *
- * We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
- * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
- *
- * We choose y based on the with of a reasonably scheduling period, fixing:
- * y^32 = 0.5
+ * Should task be woken to any available idle cpu?
*
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
- *
- * When a period "rolls over" and we have new u_0`, multiplying the previous
- * sum again by y is sufficient to update:
- * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ * Waking tasks to idle cpu has mixed implications on both performance and
+ * power. In many cases, scheduler can't estimate correctly impact of using idle
+ * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel
+ * module to pass a strong hint to scheduler that the task in question should be
+ * woken to idle cpu, generally to improve performance.
*/
-static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+static inline int wake_to_idle(struct task_struct *p)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ return (current->flags & PF_WAKE_UP_IDLE) ||
+ (p->flags & PF_WAKE_UP_IDLE);
+}
- delta = now - sa->last_update_time;
- /*
- * This should only happen when time goes backwards, which it
- * unfortunately does during sched clock init when we swap over to TSC.
- */
- if ((s64)delta < 0) {
- sa->last_update_time = now;
+static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+ u64 total_load;
+
+ total_load = env->task_load + env->cpu_load;
+
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
+
+ return 0;
+}
+
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+ int tcpu = task_cpu(env->p);
+ int skip = 0;
+
+ if (!env->reason)
return 0;
+
+ if (is_reserved(cpu))
+ return 1;
+
+ switch (env->reason) {
+ case UP_MIGRATION:
+ skip = !idle_cpu(cpu);
+ break;
+ case IRQLOAD_MIGRATION:
+ /* Purposely fall through */
+ default:
+ skip = (cpu == tcpu);
+ break;
}
- /*
- * Use 1024ns as the unit of measurement since it's a reasonable
- * approximation of 1us and fast to compute.
- */
- delta >>= 10;
- if (!delta)
- return 0;
- sa->last_update_time = now;
+ return skip;
+}
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
- trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ int tcpu;
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
+ if (!env->reason)
+ return 1;
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
+ tcpu = task_cpu(env->p);
+ switch (env->reason) {
+ case UP_MIGRATION:
+ return cluster->capacity > cpu_capacity(tcpu);
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
+ case DOWN_MIGRATION:
+ return cluster->capacity < cpu_capacity(tcpu);
+
+ default:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ if (!test_bit(cluster->id, env->candidate_list))
+ return 1;
+
+ if (!acceptable_capacity(cluster, env)) {
+ __clear_bit(cluster->id, env->candidate_list);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+ struct sched_cluster *cluster;
+
+ if (env->rtg) {
+ int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
+
+ if (task_load_will_fit(env->p, env->task_load,
+ cpu, env->boost_policy)) {
+ env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
+
+ if (env->boost_policy == SCHED_BOOST_NONE)
+ return env->rtg->preferred_cluster;
+
+ for_each_sched_cluster(cluster) {
+ if (cluster != env->rtg->preferred_cluster) {
+ __set_bit(cluster->id,
+ env->backup_list);
+ __clear_bit(cluster->id,
+ env->candidate_list);
+ }
}
+
+ return env->rtg->preferred_cluster;
}
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
- delta -= delta_w;
+ /*
+ * Since the task load does not fit on the preferred
+ * cluster anymore, pretend that the task does not
+ * have any preferred cluster. This allows the waking
+ * task to get the appropriate CPU it needs as per the
+ * non co-location placement policy without having to
+ * wait until the preferred cluster is updated.
+ */
+ env->rtg = NULL;
+ }
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
+ for_each_sched_cluster(cluster) {
+ if (!skip_cluster(cluster, env)) {
+ int cpu = cluster_first_cpu(cluster);
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cpu);
+ if (task_load_will_fit(env->p, env->task_load, cpu,
+ env->boost_policy))
+ return cluster;
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
}
- if (running)
- sa->util_sum += contrib * scale_cpu;
}
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
+ return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+ int cluster_id;
+
+ cluster_id = find_next_bit(list, end, start - 1 + 1);
+ if (cluster_id >= end)
+ return NULL;
+
+ return sched_cluster[cluster_id];
+}
+
+static void
+update_spare_capacity(struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu, int capacity,
+ u64 cpu_load)
+{
+ s64 spare_capacity = sched_ravg_window - cpu_load;
+
+ if (spare_capacity > 0 &&
+ (spare_capacity > stats->highest_spare_capacity ||
+ (spare_capacity == stats->highest_spare_capacity &&
+ ((!env->need_waker_cluster &&
+ capacity > cpu_capacity(stats->best_capacity_cpu)) ||
+ (env->need_waker_cluster &&
+ cpu_rq(cpu)->nr_running <
+ cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
+ /*
+ * If sync waker is the only runnable of CPU, cr_avg of the
+ * CPU is 0 so we have high chance to place the wakee on the
+ * waker's CPU which likely causes preemtion of the waker.
+ * This can lead migration of preempted waker. Place the
+ * wakee on the real idle CPU when it's possible by checking
+ * nr_running to avoid such preemption.
+ */
+ stats->highest_spare_capacity = spare_capacity;
+ stats->best_capacity_cpu = cpu;
}
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
+}
- sa->period_contrib += delta;
+static inline void find_backup_cluster(
+struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+ int i;
+ struct cpumask search_cpus;
- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ while (!bitmap_empty(env->backup_list, num_clusters)) {
+ next = next_candidate(env->backup_list, 0, num_clusters);
+ __clear_bit(next->id, env->backup_list);
+
+ cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
+ for_each_cpu(i, &search_cpus) {
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i), power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ update_spare_capacity(stats, env, i, next->capacity,
+ cpu_load_sync(i, env->sync));
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
}
-
- return decayed;
}
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- \
- res = var + val; \
- \
- if (val < 0 && res > var) \
- res = 0; \
- \
- WRITE_ONCE(*ptr, res); \
-} while (0)
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
+ struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+
+ __clear_bit(cluster->id, env->candidate_list);
+
+ if (env->rtg && preferred_cluster(cluster, env->p))
+ return NULL;
+
+ do {
+ if (bitmap_empty(env->candidate_list, num_clusters))
+ return NULL;
+
+ next = next_candidate(env->candidate_list, 0, num_clusters);
+ if (next) {
+ if (next->min_power_cost > stats->min_cost) {
+ clear_bit(next->id, env->candidate_list);
+ next = NULL;
+ continue;
+ }
+
+ if (skip_cluster(next, env))
+ next = NULL;
+ }
+ } while (!next);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(next));
+ return next;
+}
+
+#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int wakeup_latency;
+ int prev_cpu = env->prev_cpu;
+
+ wakeup_latency = cpu_rq(cpu)->wakeup_latency;
+
+ if (env->need_idle) {
+ stats->min_cost = cpu_cost;
+ if (idle_cpu(cpu)) {
+ if (wakeup_latency < stats->best_cpu_wakeup_latency ||
+ (wakeup_latency == stats->best_cpu_wakeup_latency &&
+ cpu == prev_cpu)) {
+ stats->best_idle_cpu = cpu;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ }
+ } else {
+ if (env->cpu_load < stats->min_load ||
+ (env->cpu_load == stats->min_load &&
+ cpu == prev_cpu)) {
+ stats->least_loaded_cpu = cpu;
+ stats->min_load = env->cpu_load;
+ }
+ }
+
+ return;
+ }
+
+ if (cpu_cost < stats->min_cost) {
+ stats->min_cost = cpu_cost;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CPU_COST;
+ return;
+ }
+
+ /* CPU cost is the same. Start breaking the tie by C-state */
+
+ if (wakeup_latency > stats->best_cpu_wakeup_latency)
+ return;
+
+ if (wakeup_latency < stats->best_cpu_wakeup_latency) {
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
+ return;
+ }
+
+ /* C-state is the same. Use prev CPU to break the tie */
+ if (cpu == prev_cpu) {
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
+ return;
+ }
+
+ if (stats->best_cpu != prev_cpu &&
+ ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
+ (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
+ }
+}
+#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int prev_cpu = env->prev_cpu;
+
+ if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+ if (stats->best_sibling_cpu_cost > cpu_cost ||
+ (stats->best_sibling_cpu_cost == cpu_cost &&
+ stats->best_sibling_cpu_load > env->cpu_load)) {
+ stats->best_sibling_cpu_cost = cpu_cost;
+ stats->best_sibling_cpu_load = env->cpu_load;
+ stats->best_sibling_cpu = cpu;
+ }
+ }
+
+ if ((cpu_cost < stats->min_cost) ||
+ ((stats->best_cpu != prev_cpu &&
+ stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
+ if (env->need_idle) {
+ if (idle_cpu(cpu)) {
+ stats->min_cost = cpu_cost;
+ stats->best_idle_cpu = cpu;
+ }
+ } else {
+ stats->min_cost = cpu_cost;
+ stats->min_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_MIN_COST;
+ }
+ }
+}
+#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env)
+{
+ int cpu_cost;
+
+ /*
+ * We try to find the least loaded *busy* CPU irrespective
+ * of the power cost.
+ */
+ if (env->pack_task)
+ cpu_cost = cpu_min_power_cost(cpu);
+
+ else
+ cpu_cost = power_cost(cpu, task_load(env->p) +
+ cpu_cravg_sync(cpu, env->sync));
+
+ if (cpu_cost <= stats->min_cost)
+ __update_cluster_stats(cpu, stats, env, cpu_cost);
+}
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int i;
+ struct cpumask search_cpus;
+
+ cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
+
+ env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
+
+ for_each_cpu(i, &search_cpus) {
+ env->cpu_load = cpu_load_sync(i, env->sync);
+
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ if (skip_cpu(i, env))
+ continue;
+
+ update_spare_capacity(stats, env, i, c->capacity,
+ env->cpu_load);
+
+ /*
+ * need_idle takes precedence over sched boost but when both
+ * are set, idlest CPU with in all the clusters is selected
+ * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the
+ * big cluster is selected within boost_policy = BOOST_ON_BIG.
+ */
+ if ((!env->need_idle &&
+ env->boost_policy != SCHED_BOOST_NONE) ||
+ env->need_waker_cluster ||
+ sched_cpu_high_irqload(i) ||
+ spill_threshold_crossed(env, cpu_rq(i)))
+ continue;
+
+ update_cluster_stats(i, stats, env);
+ }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+ stats->best_cpu = stats->best_idle_cpu = -1;
+ stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
+ stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+ stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+ stats->highest_spare_capacity = 0;
+ stats->least_loaded_cpu = -1;
+ stats->best_cpu_wakeup_latency = INT_MAX;
+ /* No need to initialize stats->best_load */
+}
+
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+ if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+ env->reason)
+ return true;
+
+ return false;
+}
+
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int prev_cpu;
+ struct task_struct *task = env->p;
+ struct sched_cluster *cluster;
+
+ if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
+ return false;
+
+ prev_cpu = env->prev_cpu;
+ if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
+ return false;
+
+ if (task->ravg.mark_start - task->last_cpu_selected_ts >=
+ sched_long_cpu_selection_threshold)
+ return false;
+
+ /*
+ * This function should be used by task wake up path only as it's
+ * assuming p->last_switch_out_ts as last sleep time.
+ * p->last_switch_out_ts can denote last preemption time as well as
+ * last sleep time.
+ */
+ if (task->ravg.mark_start - task->last_switch_out_ts >=
+ sched_short_sleep_task_threshold)
+ return false;
+
+ env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+ cluster = cpu_rq(prev_cpu)->cluster;
+
+ if (!task_load_will_fit(task, env->task_load, prev_cpu,
+ sched_boost_policy())) {
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ return false;
+ }
+
+ env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+ if (sched_cpu_high_irqload(prev_cpu) ||
+ spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+ update_spare_capacity(stats, env, prev_cpu,
+ cluster->capacity, env->cpu_load);
+ cpumask_clear_cpu(prev_cpu, &env->search_cpus);
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool
+wake_to_waker_cluster(struct cpu_select_env *env)
+{
+ return env->sync &&
+ task_load(current) > sched_big_waker_task_load &&
+ task_load(env->p) < sched_small_wakee_task_load;
+}
+
+static inline bool
+bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
+{
+ return sysctl_sched_prefer_sync_wakee_to_waker &&
+ cpu_rq(cpu)->nr_running == 1 &&
+ cpumask_test_cpu(cpu, &env->search_cpus);
+}
+
+static inline int
+cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
+{
+ return cpumask_intersects(&env->search_cpus, &cluster->cpus);
+}
+
+/* return cheapest cpu that can fit this task */
+static int select_best_cpu(struct task_struct *p, int target, int reason,
+ int sync)
+{
+ struct sched_cluster *cluster, *pref_cluster = NULL;
+ struct cluster_cpu_stats stats;
+ struct related_thread_group *grp;
+ unsigned int sbc_flag = 0;
+ int cpu = raw_smp_processor_id();
+ bool special;
+
+ struct cpu_select_env env = {
+ .p = p,
+ .reason = reason,
+ .need_idle = wake_to_idle(p),
+ .need_waker_cluster = 0,
+ .sync = sync,
+ .prev_cpu = target,
+ .rtg = NULL,
+ .sbc_best_flag = 0,
+ .sbc_best_cluster_flag = 0,
+ .pack_task = false,
+ };
+
+ env.boost_policy = task_sched_boost(p) ?
+ sched_boost_policy() : SCHED_BOOST_NONE;
+
+ bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+ bitmap_zero(env.backup_list, NR_CPUS);
+
+ cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
+ cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
+
+ init_cluster_cpu_stats(&stats);
+ special = env_has_special_flags(&env);
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+
+ if (grp && grp->preferred_cluster) {
+ pref_cluster = grp->preferred_cluster;
+ if (!cluster_allowed(&env, pref_cluster))
+ clear_bit(pref_cluster->id, env.candidate_list);
+ else
+ env.rtg = grp;
+ } else if (!special) {
+ cluster = cpu_rq(cpu)->cluster;
+ if (wake_to_waker_cluster(&env)) {
+ if (bias_to_waker_cpu(&env, cpu)) {
+ target = cpu;
+ sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+ SBC_FLAG_WAKER_CPU;
+ goto out;
+ } else if (cluster_allowed(&env, cluster)) {
+ env.need_waker_cluster = 1;
+ bitmap_zero(env.candidate_list, NR_CPUS);
+ __set_bit(cluster->id, env.candidate_list);
+ env.sbc_best_cluster_flag =
+ SBC_FLAG_WAKER_CLUSTER;
+ }
+ } else if (bias_to_prev_cpu(&env, &stats)) {
+ sbc_flag = SBC_FLAG_PREV_CPU;
+ goto out;
+ }
+ }
+
+ if (!special && is_short_burst_task(p)) {
+ env.pack_task = true;
+ sbc_flag = SBC_FLAG_PACK_TASK;
+ }
+retry:
+ cluster = select_least_power_cluster(&env);
+
+ if (!cluster)
+ goto out;
+
+ /*
+ * 'cluster' now points to the minimum power cluster which can satisfy
+ * task's perf goals. Walk down the cluster list starting with that
+ * cluster. For non-small tasks, skip clusters that don't have
+ * mostly_idle/idle cpus
+ */
+
+ do {
+ find_best_cpu_in_cluster(cluster, &env, &stats);
+
+ } while ((cluster = next_best_cluster(cluster, &env, &stats)));
+
+ if (env.need_idle) {
+ if (stats.best_idle_cpu >= 0) {
+ target = stats.best_idle_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_CSTATE;
+ } else if (stats.least_loaded_cpu >= 0) {
+ target = stats.least_loaded_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
+ }
+ } else if (stats.best_cpu >= 0) {
+ if (stats.best_sibling_cpu >= 0 &&
+ stats.best_cpu != task_cpu(p) &&
+ stats.min_cost == stats.best_sibling_cpu_cost) {
+ stats.best_cpu = stats.best_sibling_cpu;
+ sbc_flag |= SBC_FLAG_BEST_SIBLING;
+ }
+ sbc_flag |= env.sbc_best_flag;
+ target = stats.best_cpu;
+ } else {
+ if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
+ env.rtg = NULL;
+ goto retry;
+ }
+
+ /*
+ * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
+ * backup_list = little cluster, candidate_list = none and
+ * stats->best_capacity_cpu points the best spare capacity
+ * CPU among the CPUs in the big cluster.
+ */
+ if (env.boost_policy == SCHED_BOOST_ON_BIG &&
+ stats.best_capacity_cpu >= 0)
+ sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
+ else
+ find_backup_cluster(&env, &stats);
+
+ if (stats.best_capacity_cpu >= 0) {
+ target = stats.best_capacity_cpu;
+ sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
+ }
+ }
+ p->last_cpu_selected_ts = sched_ktime_clock();
+out:
+ sbc_flag |= env.sbc_best_cluster_flag;
+ rcu_read_unlock();
+ trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
+ env.reason, env.sync, env.need_idle, sbc_flag, target);
+ return target;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+
+ return (&tg->list == &task_groups) ? NULL : tg;
+}
+
+/* Iterate over all cfs_rq in a cpu */
+#define for_each_cfs_rq(cfs_rq, tg, cpu) \
+ for (tg = container_of(&task_groups, struct task_group, list); \
+ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+
+ for_each_cfs_rq(cfs_rq, tg, cpu)
+ reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+
+ rcu_read_unlock();
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+
+/* Add task's contribution to a cpu' HMP statistics */
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+ * Although below check is not strictly required (as
+ * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called
+ * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on
+ * efficiency by short-circuiting for_each_sched_entity() loop when
+ * sched_disable_window_stats
+ */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Remove task's contribution from a cpu' HMP statistics */
+static void
+_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /* See comment on efficiency in _inc_hmp_sched_stats_fair */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ dec_rq_hmp_stats(rq, p, change_cra);
+}
+
+static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _inc_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _dec_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se) {
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+ }
+}
+
+static int task_will_be_throttled(struct task_struct *p);
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+static void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Reset balance_interval at all sched_domain levels of given cpu, so that it
+ * honors kick.
+ */
+static inline void reset_balance_interval(int cpu)
+{
+ struct sched_domain *sd;
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ sd->balance_interval = 0;
+ rcu_read_unlock();
+}
+
+/*
+ * Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal
+ * cpu as per its demand or priority)
+ *
+ * Returns reason why task needs to be migrated
+ */
+static inline int migration_needed(struct task_struct *p, int cpu)
+{
+ int nice;
+ struct related_thread_group *grp;
+
+ if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
+ return 0;
+
+ /* No need to migrate task that is about to be throttled */
+ if (task_will_be_throttled(p))
+ return 0;
+
+ if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
+ cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
+ return UP_MIGRATION;
+
+ if (sched_cpu_high_irqload(cpu))
+ return IRQLOAD_MIGRATION;
+
+ nice = task_nice(p);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ /*
+ * Don't assume higher capacity means higher power. If the task
+ * is running on the power efficient CPU, avoid migrating it
+ * to a lower capacity cluster.
+ */
+ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p)) &&
+ cpu_capacity(cpu) > min_capacity &&
+ cpu_max_power_cost(cpu) == max_power_cost) {
+ rcu_read_unlock();
+ return DOWN_MIGRATION;
+ }
+
+ if (!task_will_fit(p, cpu)) {
+ rcu_read_unlock();
+ return UP_MIGRATION;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (do_migration(reason, new_cpu, cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->hmp_stats.nr_big_tasks = 0;
+ cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
+ cfs_rq->hmp_stats.pred_demands_sum = 0;
+}
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg +=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
+}
+
+static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg -=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
+
+ BUG_ON(stats->nr_big_tasks < 0 ||
+ (s64)stats->cumulative_runnable_avg < 0);
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#define dec_throttled_cfs_rq_hmp_stats(...)
+#define inc_throttled_cfs_rq_hmp_stats(...)
+
+#endif /* CONFIG_SCHED_HMP */
+
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+ u64 delta, scaled_delta, periods;
+ u32 contrib;
+ unsigned int delta_w, scaled_delta_w, decayed = 0;
+ unsigned long scale_freq, scale_cpu;
+
+ delta = now - sa->last_update_time;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_update_time = now;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->period_contrib;
+ if (delta + delta_w >= 1024) {
+ decayed = 1;
+
+ /* how much left for next period will start over, we don't know yet */
+ sa->period_contrib = 0;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ scaled_delta_w = cap_scale(delta_w, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta_w;
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum +=
+ weight * scaled_delta_w;
+ }
+ }
+ if (running)
+ sa->util_sum += scaled_delta_w * scale_cpu;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ contrib = __compute_runnable_contrib(periods);
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ scaled_delta = cap_scale(delta, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * scaled_delta;
+ }
+
+ if (running)
+ sa->util_sum += scaled_delta * scale_cpu;
+
+ sa->period_contrib += delta;
+
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ }
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ }
+
+ return decayed;
+}
+
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
return 0;
}
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
#endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Check if task is part of a hierarchy where some cfs_rq does not have any
+ * runtime left.
+ *
+ * We can't rely on throttled_hierarchy() to do this test, as
+ * cfs_rq->throttle_count will not be updated yet when this function is called
+ * from scheduler_tick()
+ */
+static int task_will_be_throttled(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return 0;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ if (!cfs_rq->runtime_enabled)
+ continue;
+ if (cfs_rq->runtime_remaining <= 0)
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
if (qcfs_rq->load.weight)
dequeue = 0;
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, task_delta);
+ dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
+ }
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
+
+ /* Log effect on hmp stats after throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
int enqueue = 1;
long task_delta;
+ struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
se = cfs_rq->tg->se[cpu_of(rq)];
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
if (cfs_rq_throttled(cfs_rq))
break;
}
- if (!se)
+ if (!se) {
add_nr_running(rq, task_delta);
+ inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
+ }
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
+
+ /* Log effect on hmp stats after un-throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ init_cfs_rq_hmp_stats(cfs_rq);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
/*
* called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * current task is from our class.
*/
static void hrtick_update(struct rq *rq)
{
if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
- walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
- walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ inc_rq_hmp_stats(rq, p, 1);
+ }
#ifdef CONFIG_SMP
*/
schedtune_enqueue_task(p, cpu_of(rq));
- if (!se) {
- walt_inc_cumulative_runnable_avg(rq, p);
+ if (energy_aware() && !se) {
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, 1);
+ dec_rq_hmp_stats(rq, p, 1);
+ }
#ifdef CONFIG_SMP
*/
schedtune_dequeue_task(p, cpu_of(rq));
- if (!se)
- walt_dec_cumulative_runnable_avg(rq, p);
#endif /* CONFIG_SMP */
hrtick_update(rq);
>> SCHED_CAPACITY_SHIFT;
}
-static inline bool energy_aware(void)
-{
- return sched_feat(ENERGY_AWARE);
-}
-
struct energy_env {
struct sched_group *sg_top;
struct sched_group *sg_cap;
static inline unsigned long task_util(struct task_struct *p)
{
-#ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- unsigned long demand = p->ravg.demand;
- return (demand << 10) / walt_ravg_window;
- }
-#endif
return p->se.avg.util_avg;
}
}
}
+ if (!(current->flags & PF_WAKE_UP_IDLE) &&
+ !(p->flags & PF_WAKE_UP_IDLE))
+ return target;
+
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
+#ifdef CONFIG_SCHED_HMP
+ return select_best_cpu(p, prev_cpu, 0, sync);
+#endif
+
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p, sibling_count_hint) &&
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
+#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
+ unsigned int busiest_grp_capacity;
+ unsigned int busiest_nr_running;
unsigned int flags;
enum fbq_type fbq_type;
enum group_type busiest_group_type;
struct list_head tasks;
+ enum sched_boost_policy boost_policy;
};
/*
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
+ int twf, group_cpus;
lockdep_assert_held(&env->src_rq->lock);
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
+ if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
+ if (nr_big_tasks(env->src_rq) && !is_big_task(p))
+ return 0;
+
+ if (env->boost_policy == SCHED_BOOST_ON_BIG &&
+ !task_sched_boost(p))
+ return 0;
+ }
+
+ twf = task_will_fit(p, env->dst_cpu);
+
+ /*
+ * Attempt to not pull tasks that don't fit. We may get lucky and find
+ * one that actually fits.
+ */
+ if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
+ return 0;
+
+ if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+ !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
+ return 0;
+
+ /*
+ * Group imbalance can sometimes cause work to be pulled across groups
+ * even though the group could have managed the imbalance on its own.
+ * Prevent inter-cluster migrations for big tasks when the number of
+ * tasks is lower than the capacity of the group.
+ */
+ group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
+ SCHED_CAPACITY_SCALE);
+ if (!twf && env->busiest_nr_running <= group_cpus)
+ return 0;
+
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) IDLE or NEWLY_IDLE balance.
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
- if (tsk_cache_hot <= 0 ||
+ if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (task_in_related_thread_group(p))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
double_unlock_balance(env->src_rq, env->dst_rq);
}
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
+
return p;
}
return NULL;
struct task_struct *p;
unsigned long load;
int detached = 0;
+ int orig_loop = env->loop;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
+ if (!same_cluster(env->dst_cpu, env->src_cpu))
+ env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
+
+ if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
+ env->flags |= LBF_IGNORE_BIG_TASKS;
+
+redo:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
list_move_tail(&p->se.group_node, tasks);
}
+ if (env->flags & (LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
+ tasks = &env->src_rq->cfs_tasks;
+ env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
+ env->loop = orig_loop;
+ goto redo;
+ }
+
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
+#ifdef CONFIG_SCHED_HMP
+ unsigned long sum_nr_big_tasks;
+ u64 group_cpu_load; /* Scaled load of all CPUs of the group */
+#endif
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
.avg_load = 0UL,
.sum_nr_running = 0,
.group_type = group_other,
+#ifdef CONFIG_SCHED_HMP
+ .sum_nr_big_tasks = 0UL,
+ .group_cpu_load = 0ULL,
+#endif
},
};
}
+#ifdef CONFIG_SCHED_HMP
+
+static int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ int local_cpu, busiest_cpu;
+ int local_capacity, busiest_capacity;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
+ int boost = sched_boost();
+
+ if (!sysctl_sched_restrict_cluster_spill ||
+ boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
+ return 0;
+
+ local_cpu = group_first_cpu(sds->local);
+ busiest_cpu = group_first_cpu(sds->busiest);
+
+ local_capacity = cpu_max_possible_capacity(local_cpu);
+ busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
+
+ if (local_pwr_cost <= busiest_pwr_cost)
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
+ if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+ continue;
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
do {
struct sched_group_capacity *sgc = group->sgc;
- capacity += sgc->capacity;
- max_capacity = max(sgc->max_capacity, max_capacity);
- min_capacity = min(sgc->min_capacity, min_capacity);
+ cpumask_t *cpus = sched_group_cpus(group);
+
+ /* Revisit this later. This won't work for MT domain */
+ if (!cpu_isolated(cpumask_first(cpus))) {
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
+ }
group = group->next;
} while (group != child->groups);
}
static inline enum
group_type group_classify(struct sched_group *group,
- struct sg_lb_stats *sgs)
+ struct sg_lb_stats *sgs, struct lb_env *env)
{
if (sgs->group_no_capacity)
return group_overloaded;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, 0),
+ cpu_temp(i));
+
+ if (cpu_isolated(i))
+ continue;
+
/* if we are entering idle and there are CPUs with
* their tick stopped, do an update for them
*/
if (nr_running > 1)
*overload = true;
+#ifdef CONFIG_SCHED_HMP
+ sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
+ sgs->group_cpu_load += cpu_load(i);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
- if (cpu_overutilized(i)) {
+ if (energy_aware() && cpu_overutilized(i)) {
*overutilized = true;
if (!sgs->group_misfit_task && rq->misfit_task)
sgs->group_misfit_task = capacity_of(i);
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Isolated CPU has no weight */
+ if (!group->group_weight) {
+ sgs->group_capacity = 0;
+ sgs->avg_load = 0;
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_other;
+ sgs->group_weight = group->group_weight;
+ } else {
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(group, sgs, env);
+ }
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+}
- sgs->group_weight = group->group_weight;
+#ifdef CONFIG_SCHED_HMP
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ if (env->idle != CPU_NOT_IDLE &&
+ cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
+ if (sgs->sum_nr_big_tasks >
+ sds->busiest_stat.sum_nr_big_tasks) {
+ env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
+ return true;
+ }
+ }
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ return false;
+}
+#else
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ return false;
}
+#endif
/**
* update_sd_pick_busiest - return 1 on busiest group
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
+ return true;
+
if (sgs->group_type > busiest->group_type)
return true;
if (sgs->group_type < busiest->group_type)
return false;
- /*
- * Candidate sg doesn't face any serious load-balance problems
- * so don't pick it if the local sg is already filled up.
- */
- if (sgs->group_type == group_other &&
- !group_has_capacity(env, &sds->local_stat))
- return false;
+ if (energy_aware()) {
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
- /*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
- */
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+ }
asym_packing:
/* This is the busiest node in its class. */
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
+ sgs->group_type = group_classify(sg, sgs, env);
}
/*
* Ignore task groups with misfit tasks if local group has no
* capacity or if per-cpu capacity isn't higher.
*/
- if (sgs->group_type == group_misfit_task &&
+ if (energy_aware() &&
+ sgs->group_type == group_misfit_task &&
(!group_has_capacity(env, &sds->local_stat) ||
!group_smaller_cpu_capacity(sg, sds->local)))
sgs->group_type = group_other;
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
+ env->busiest_nr_running = sgs->sum_nr_running;
+ env->busiest_grp_capacity = sgs->group_capacity;
}
next_group:
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized) {
+ if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
trace_sched_overutilized(overutilized);
}
} else {
- if (!env->dst_rq->rd->overutilized && overutilized) {
+ if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
- /* Misfitting tasks should be migrated in any case */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = busiest->group_misfit_task;
- return;
- }
+ if (energy_aware()) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
- /*
- * Busiest group is overloaded, local is not, use the spare
- * cycles to maximize throughput
- */
- if (busiest->group_type == group_overloaded &&
- local->group_type <= group_misfit_task) {
- env->imbalance = busiest->load_per_task;
- return;
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
}
env->imbalance = 0;
) / SCHED_CAPACITY_SCALE;
/* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task)
+ if (energy_aware() && busiest->group_type == group_misfit_task)
env->imbalance = max_t(long, env->imbalance,
busiest->group_misfit_task);
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ goto force_balance;
+
+ if (bail_inter_cluster_balance(env, &sds))
+ goto out_balanced;
+
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
goto force_balance;
/* Misfitting tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task) {
+ if (energy_aware() && busiest->group_type == group_misfit_task) {
goto force_balance;
}
return NULL;
}
+#ifdef CONFIG_SCHED_HMP
+static struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ struct rq *busiest = NULL, *busiest_big = NULL;
+ u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
+ int max_nr_big = 0, nr_big;
+ bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
+ int i;
+ cpumask_t cpus;
+
+ cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
+
+ for_each_cpu(i, &cpus) {
+ struct rq *rq = cpu_rq(i);
+ u64 cumulative_runnable_avg =
+ rq->hmp_stats.cumulative_runnable_avg;
+
+ if (!cpumask_test_cpu(i, env->cpus))
+ continue;
+
+
+ if (find_big) {
+ nr_big = nr_big_tasks(rq);
+ if (nr_big > max_nr_big ||
+ (nr_big > 0 && nr_big == max_nr_big &&
+ cumulative_runnable_avg > max_runnable_avg_big)) {
+ max_runnable_avg_big = cumulative_runnable_avg;
+ busiest_big = rq;
+ max_nr_big = nr_big;
+ continue;
+ }
+ }
+
+ if (cumulative_runnable_avg > max_runnable_avg) {
+ max_runnable_avg = cumulative_runnable_avg;
+ busiest = rq;
+ }
+ }
+
+ if (busiest_big)
+ return busiest_big;
+
+ env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
+ return busiest;
+}
+#else
+static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ return NULL;
+}
+#endif
+
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
+#ifdef CONFIG_SCHED_HMP
+ return find_busiest_queue_hmp(env, group);
+#endif
+
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* so long as it is large enough.
*/
-#define MAX_PINNED_INTERVAL 512
+#define MAX_PINNED_INTERVAL 16
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+#define NEED_ACTIVE_BALANCE_THRESHOLD 10
+
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ return 1;
+
if (env->idle == CPU_NEWLY_IDLE) {
/*
return 1;
}
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ if (energy_aware() &&
+ (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
env->src_rq->cfs.h_nr_running == 1 &&
cpu_overutilized(env->src_cpu) &&
return 1;
}
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+ return unlikely(sd->nr_balance_failed >
+ sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
-static int active_load_balance_cpu_stop(void *data);
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{
+ cpumask_t cpus;
+
+ cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+ cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+ return cpumask_first(&cpus);
+}
static int should_we_balance(struct lb_env *env)
{
sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+ cpu_isolated(cpu))
continue;
balance_cpu = cpu;
}
if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
+ balance_cpu = group_balance_cpu_not_isolated(sg);
/*
* First idle cpu or the first cpu(busiest) in this sched group
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
- int ld_moved, cur_ld_moved, active_balance = 0;
+ int ld_moved = 0, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
- struct sched_group *group;
- struct rq *busiest;
+ struct sched_group *group = NULL;
+ struct rq *busiest = NULL;
unsigned long flags;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
- .sd = sd,
- .dst_cpu = this_cpu,
- .dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
- .idle = idle,
- .loop_break = sched_nr_migrate_break,
- .cpus = cpus,
- .fbq_type = all,
- .tasks = LIST_HEAD_INIT(env.tasks),
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ .imbalance = 0,
+ .flags = 0,
+ .loop = 0,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .boost_policy = sched_boost_policy(),
};
/*
raw_spin_lock_irqsave(&busiest->lock, flags);
update_rq_clock(busiest);
+ /* The world might have changed. Validate assumptions */
+ if (busiest->nr_running <= 1) {
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ env.flags &= ~LBF_ALL_PINNED;
+ goto no_move;
+ }
+
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
}
}
+no_move:
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
+ schedstat_inc(sd, lb_failed[idle]);
+
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
- if (idle != CPU_NEWLY_IDLE)
- if (env.src_grp_nr_running > 1)
+ if (idle != CPU_NEWLY_IDLE &&
+ !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
+ if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
+ }
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
- if (!busiest->active_balance) {
+ if (!busiest->active_balance &&
+ !cpu_isolated(cpu_of(busiest))) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ *continue_balancing = 0;
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ sd->nr_balance_failed =
+ sd->cache_nice_tries +
+ NEED_ACTIVE_BALANCE_THRESHOLD - 1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ /* Assumes one 'busiest' cpu that we pulled tasks from */
+ if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
+ } else {
+ check_for_freq_change(this_rq, true, false);
+ }
+ }
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
ld_moved = 0;
out:
+ trace_sched_load_balance(this_cpu, idle, *continue_balancing,
+ group ? group->cpumask[0] : 0,
+ busiest ? busiest->nr_running : 0,
+ env.imbalance, env.flags, ld_moved,
+ sd->balance_interval);
return ld_moved;
}
int pulled_task = 0;
u64 curr_cost = 0;
+ if (cpu_isolated(this_cpu))
+ return 0;
+
idle_enter_fair(this_rq);
/*
/*
* Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * now runnable tasks on the balance rq or if
+ * continue_balancing has been unset (only possible
+ * due to active migration).
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ !continue_balancing)
break;
}
rcu_read_unlock();
struct task_struct *push_task = NULL;
int push_task_detached = 0;
struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .flags = 0,
+ .loop = 0,
+ .boost_policy = sched_boost_policy(),
};
+ bool moved = false;
raw_spin_lock_irq(&busiest_rq->lock);
BUG_ON(busiest_rq == target_rq);
push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
if (push_task) {
if (task_on_rq_queued(push_task) &&
+ push_task->state == TASK_RUNNING &&
task_cpu(push_task) == busiest_cpu &&
cpu_online(target_cpu)) {
detach_task(push_task, &env);
push_task_detached = 1;
+ moved = true;
}
goto out_unlock;
}
update_rq_clock(busiest_rq);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ moved = true;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
if (push_task)
busiest_rq->push_task = NULL;
if (push_task_detached)
attach_one_task(target_rq, push_task);
put_task_struct(push_task);
+ clear_reserved(target_cpu);
}
if (p)
local_irq_enable();
+ if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
+ } else if (moved) {
+ check_for_freq_change(target_rq, true, false);
+ }
+
return 0;
}
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static inline int find_new_ilb(void)
+
+#ifdef CONFIG_SCHED_HMP
+static inline int find_new_hmp_ilb(int type)
+{
+ int call_cpu = raw_smp_processor_id();
+ struct sched_domain *sd;
+ int ilb;
+
+ rcu_read_lock();
+
+ /* Pick an idle cpu "closest" to call_cpu */
+ for_each_domain(call_cpu, sd) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ sched_domain_span(sd)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
+ rcu_read_unlock();
+ reset_balance_interval(ilb);
+ return ilb;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+ return nr_cpu_ids;
+}
+#else /* CONFIG_SCHED_HMP */
+static inline int find_new_hmp_ilb(int type)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
+static inline int find_new_ilb(int type)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int ilb;
+
+#ifdef CONFIG_SCHED_HMP
+ return find_new_hmp_ilb(type);
+#endif
+
+ ilb = cpumask_first(nohz.idle_cpus_mask);
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
return;
}
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
+}
+
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
* Completely isolated CPUs don't ever set, so we must test.
*/
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ nohz_balance_clear_nohz_mask(cpu);
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
*/
void update_max_interval(void)
{
- max_load_balance_interval = HZ*num_online_cpus()/10;
+ cpumask_t avail_mask;
+ unsigned int available_cpus;
+
+ cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+ available_cpus = cpumask_weight(&avail_mask);
+
+ max_load_balance_interval = HZ*available_cpus/10;
}
/*
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ cpumask_t cpus;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+ for_each_cpu(balance_cpu, &cpus) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill ||
+ sched_boost_policy() == SCHED_BOOST_ON_ALL)
+ return 1;
+
+ if (cpu_max_power_cost(cpu) == max_power_cost)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+ /* Change the kick type to limit to CPUs that
+ * are of equal or lower capacity.
+ */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ unsigned long now = jiffies;
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+#endif
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
+
+ /* Do idle load balance if there have misfit task */
+ if (energy_aware())
+ return rq->misfit_task;
+
+ return (rq->nr_running >= 2);
+}
+
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
- unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy;
+#endif
+ int cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
-
- if (time_before(now, nohz.next_balance))
- return false;
-
- if (rq->nr_running >= 2 &&
- (!energy_aware() || cpu_overutilized(cpu)))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
- /* Do idle load balance if there have misfit task */
- if (energy_aware())
- return rq->misfit_task;
-
+#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
unlock:
rcu_read_unlock();
+#endif
return kick;
}
#else
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ int type = NOHZ_KICK_ANY;
+
+ /* Don't need to rebalance while attached to NULL domain or
+ * cpu is isolated.
+ */
+ if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
unthrottle_offline_cfs_rqs(rq);
}
-static inline int
-kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
-{
- int rc = 0;
-
- /* Invoke active balance to force migrate currently running task */
- raw_spin_lock(&rq->lock);
- if (!rq->active_balance) {
- rq->active_balance = 1;
- rq->push_cpu = new_cpu;
- get_task_struct(p);
- rq->push_task = p;
- rc = 1;
- }
- raw_spin_unlock(&rq->lock);
-
- return rc;
-}
-
-void check_for_migration(struct rq *rq, struct task_struct *p)
-{
- int new_cpu;
- int active_balance;
- int cpu = task_cpu(p);
-
- if (energy_aware() && rq->misfit_task) {
- if (rq->curr->state != TASK_RUNNING ||
- rq->curr->nr_cpus_allowed == 1)
- return;
-
- new_cpu = select_energy_cpu_brute(p, cpu, 0);
- if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
- active_balance = kick_active_balance(rq, p, new_cpu);
- if (active_balance)
- stop_one_cpu_nowait(cpu,
- active_load_balance_cpu_stop,
- rq, &rq->active_balance_work);
- }
- }
-}
-
#endif /* CONFIG_SMP */
/*
task_tick_numa(rq, curr);
#ifdef CONFIG_SMP
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ if (energy_aware() &&
+ !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG