Merge android-4.4-p.195 (4af3204) into msm-4.4

[sagit-ice-cold/kernel_xiaomi_msm8998.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 134e2bd..f01eb27 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,9 +32,8 @@
  #include <linux/task_work.h>
  #include <linux/module.h>
  
-#include <trace/events/sched.h>
-
  #include "sched.h"
+#include <trace/events/sched.h>
  #include "tune.h"
  #include "walt.h"
  
@@ -56,12 +55,6 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  unsigned int sysctl_sched_sync_hint_enable = 1;
  unsigned int sysctl_sched_cstate_aware = 1;
  
-#ifdef CONFIG_SCHED_WALT
-unsigned int sysctl_sched_use_walt_cpu_util = 1;
-unsigned int sysctl_sched_use_walt_task_util = 1;
-__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
-    (10 * NSEC_PER_MSEC);
-#endif
  /*
   * The initial- and re-scaling of tunables is configurable
   * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -254,6 +247,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
         return mul_u64_u32_shr(delta_exec, fact, shift);
  }
  
+#ifdef CONFIG_SMP
+static int active_load_balance_cpu_stop(void *data);
+#endif
  
  const struct sched_class fair_sched_class;
  
@@ -891,12 +887,56 @@ static void update_curr_fair(struct rq *rq)
         update_curr(cfs_rq_of(&rq->curr->se));
  }
  
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+       if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+           likely(wait_start > se->statistics.wait_start))
+               wait_start -= se->statistics.wait_start;
+
+       se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct task_struct *p;
+       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+       if (entity_is_task(se)) {
+               p = task_of(se);
+               if (task_on_rq_migrating(p)) {
+                       /*
+                        * Preserve migrating task's wait time so wait_start
+                        * time stamp can be adjusted to accumulate wait time
+                        * prior to migration.
+                        */
+                       se->statistics.wait_start = delta;
+                       return;
+               }
+               trace_sched_stat_wait(p, delta);
+       }
+
+       se->statistics.wait_max = max(se->statistics.wait_max, delta);
+       se->statistics.wait_count++;
+       se->statistics.wait_sum += delta;
+       se->statistics.wait_start = 0;
+}
+#else
  static inline void
  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
  }
  
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
  /*
   * Task is being enqueued - update stats:
   */
@@ -910,23 +950,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 update_stats_wait_start(cfs_rq, se);
  }
  
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-                       rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
-       schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
-       schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-                       rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
-       if (entity_is_task(se)) {
-               trace_sched_stat_wait(task_of(se),
-                       rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-       }
-#endif
-       schedstat_set(se->statistics.wait_start, 0);
-}
-
  static inline void
  update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
@@ -2633,7 +2656,27 @@ static inline void update_cfs_shares(struct sched_entity *se)
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
+/* Report whether PF_WAKE_UP_IDLE is set in @p->flags. */
+u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+       /* Collapse the masked bit so callers only ever see 0 or 1. */
+       return (p->flags & PF_WAKE_UP_IDLE) ? 1 : 0;
+}
+EXPORT_SYMBOL(sched_get_wake_up_idle);
+
+/* Set (non-zero @wake_up_idle) or clear PF_WAKE_UP_IDLE on @p; returns 0. */
+int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
+{
+       /* Plain read-modify-write of p->flags; no lock is taken here. */
+       if (!wake_up_idle)
+               p->flags &= ~PF_WAKE_UP_IDLE;
+       else
+               p->flags |= PF_WAKE_UP_IDLE;
+
+       return 0;
+}
+EXPORT_SYMBOL(sched_set_wake_up_idle);
+
  static const u32 runnable_avg_yN_inv[] = {
         0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
         0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
@@ -2713,165 +2756,1224 @@ static u32 __compute_runnable_contrib(u64 n)
         return contrib + runnable_avg_yN_sum[n];
  }
  
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
+#ifdef CONFIG_SCHED_HMP
+
+/* CPU selection flag */
+#define SBC_FLAG_PREV_CPU                              0x1
+#define SBC_FLAG_BEST_CAP_CPU                          0x2
+#define SBC_FLAG_CPU_COST                              0x4
+#define SBC_FLAG_MIN_COST                              0x8
+#define SBC_FLAG_IDLE_LEAST_LOADED                     0x10
+#define SBC_FLAG_IDLE_CSTATE                           0x20
+#define SBC_FLAG_COST_CSTATE_TIE_BREAKER               0x40
+#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER      0x80
+#define SBC_FLAG_CSTATE_LOAD                           0x100
+#define SBC_FLAG_BEST_SIBLING                          0x200
+#define SBC_FLAG_WAKER_CPU                             0x400
+#define SBC_FLAG_PACK_TASK                             0x800
+
+/* Cluster selection flag */
+#define SBC_FLAG_COLOC_CLUSTER                         0x10000
+#define SBC_FLAG_WAKER_CLUSTER                         0x20000
+#define SBC_FLAG_BACKUP_CLUSTER                                0x40000
+#define SBC_FLAG_BOOST_CLUSTER                         0x80000
+
+struct cpu_select_env {
+       struct task_struct *p;
+       struct related_thread_group *rtg;
+       u8 reason;
+       u8 need_idle:1;
+       u8 need_waker_cluster:1;
+       u8 sync:1;
+       enum sched_boost_policy boost_policy;
+       u8 pack_task:1;
+       int prev_cpu;
+       DECLARE_BITMAP(candidate_list, NR_CPUS);
+       DECLARE_BITMAP(backup_list, NR_CPUS);
+       u64 task_load;
+       u64 cpu_load;
+       u32 sbc_best_flag;
+       u32 sbc_best_cluster_flag;
+       struct cpumask search_cpus;
+};
  
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+struct cluster_cpu_stats {
+       int best_idle_cpu, least_loaded_cpu;
+       int best_capacity_cpu, best_cpu, best_sibling_cpu;
+       int min_cost, best_sibling_cpu_cost;
+       int best_cpu_wakeup_latency;
+       u64 min_load, best_load, best_sibling_cpu_load;
+       s64 highest_spare_capacity;
+};
  
  /*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series.  To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
- *
- * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
- *      p0            p1           p2
- *     (now)       (~1ms ago)  (~2ms ago)
- *
- * Let u_i denote the fraction of p_i that the entity was runnable.
- *
- * We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
- *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
- *
- * We choose y based on the with of a reasonably scheduling period, fixing:
- *   y^32 = 0.5
+ * Should task be woken to any available idle cpu?
   *
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
- *
- * When a period "rolls over" and we have new u_0`, multiplying the previous
- * sum again by y is sufficient to update:
- *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ * Waking tasks to an idle cpu has mixed implications for both performance
+ * and power. In many cases the scheduler can't correctly estimate the impact
+ * of using idle cpus on either. PF_WAKE_UP_IDLE allows an external kernel
+ * module to pass a strong hint to the scheduler that the task in question
+ * should be woken to an idle cpu, generally to improve performance.
   */
-static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
-                 unsigned long weight, int running, struct cfs_rq *cfs_rq)
+static inline int wake_to_idle(struct task_struct *p)
  {
-       u64 delta, scaled_delta, periods;
-       u32 contrib;
-       unsigned int delta_w, scaled_delta_w, decayed = 0;
-       unsigned long scale_freq, scale_cpu;
+       return (current->flags & PF_WAKE_UP_IDLE) ||
+                (p->flags & PF_WAKE_UP_IDLE);
+}
  
-       delta = now - sa->last_update_time;
-       /*
-        * This should only happen when time goes backwards, which it
-        * unfortunately does during sched clock init when we swap over to TSC.
-        */
-       if ((s64)delta < 0) {
-               sa->last_update_time = now;
+/*
+ * Return 1 if env->task_load + env->cpu_load exceeds sched_spill_load, or if
+ * rq->nr_running + 1 exceeds sysctl_sched_spill_nr_run; otherwise return 0.
+ */
+static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+       u64 load_with_task = env->task_load + env->cpu_load;
+
+       if (load_with_task > sched_spill_load)
+               return 1;
+       return (rq->nr_running + 1) > sysctl_sched_spill_nr_run;
+}
+
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+       int tcpu = task_cpu(env->p);
+       int skip = 0;
+
+       if (!env->reason)
                 return 0;
+
+       if (is_reserved(cpu))
+               return 1;
+
+       switch (env->reason) {
+       case UP_MIGRATION:
+               skip = !idle_cpu(cpu);
+               break;
+       case IRQLOAD_MIGRATION:
+               /* Purposely fall through */
+       default:
+               skip = (cpu == tcpu);
+               break;
         }
  
-       /*
-        * Use 1024ns as the unit of measurement since it's a reasonable
-        * approximation of 1us and fast to compute.
-        */
-       delta >>= 10;
-       if (!delta)
-               return 0;
-       sa->last_update_time = now;
+       return skip;
+}
  
-       scale_freq = arch_scale_freq_capacity(NULL, cpu);
-       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-       trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+       int tcpu;
  
-       /* delta_w is the amount already accumulated against our next period */
-       delta_w = sa->period_contrib;
-       if (delta + delta_w >= 1024) {
-               decayed = 1;
+       if (!env->reason)
+               return 1;
  
-               /* how much left for next period will start over, we don't know yet */
-               sa->period_contrib = 0;
+       tcpu = task_cpu(env->p);
+       switch (env->reason) {
+       case UP_MIGRATION:
+               return cluster->capacity > cpu_capacity(tcpu);
  
-               /*
-                * Now that we know we're crossing a period boundary, figure
-                * out how much from delta we need to complete the current
-                * period and accrue it.
-                */
-               delta_w = 1024 - delta_w;
-               scaled_delta_w = cap_scale(delta_w, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * scaled_delta_w;
-                       if (cfs_rq) {
-                               cfs_rq->runnable_load_sum +=
-                                               weight * scaled_delta_w;
+       case DOWN_MIGRATION:
+               return cluster->capacity < cpu_capacity(tcpu);
+
+       default:
+               break;
+       }
+
+       return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+       if (!test_bit(cluster->id, env->candidate_list))
+               return 1;
+
+       if (!acceptable_capacity(cluster, env)) {
+               __clear_bit(cluster->id, env->candidate_list);
+               return 1;
+       }
+
+       return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+       struct sched_cluster *cluster;
+
+       if (env->rtg) {
+               int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
+
+               env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
+
+               if (task_load_will_fit(env->p, env->task_load,
+                                       cpu, env->boost_policy)) {
+                       env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
+
+                       if (env->boost_policy == SCHED_BOOST_NONE)
+                               return env->rtg->preferred_cluster;
+
+                       for_each_sched_cluster(cluster) {
+                               if (cluster != env->rtg->preferred_cluster) {
+                                       __set_bit(cluster->id,
+                                               env->backup_list);
+                                       __clear_bit(cluster->id,
+                                               env->candidate_list);
+                               }
                         }
+
+                       return env->rtg->preferred_cluster;
                 }
-               if (running)
-                       sa->util_sum += scaled_delta_w * scale_cpu;
  
-               delta -= delta_w;
+               /*
+                * Since the task load does not fit on the preferred
+                * cluster anymore, pretend that the task does not
+                * have any preferred cluster. This allows the waking
+                * task to get the appropriate CPU it needs as per the
+                * non co-location placement policy without having to
+                * wait until the preferred cluster is updated.
+                */
+               env->rtg = NULL;
+       }
  
-               /* Figure out how many additional periods this update spans */
-               periods = delta / 1024;
-               delta %= 1024;
+       for_each_sched_cluster(cluster) {
+               if (!skip_cluster(cluster, env)) {
+                       int cpu = cluster_first_cpu(cluster);
  
-               sa->load_sum = decay_load(sa->load_sum, periods + 1);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods + 1);
-               }
-               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+                       env->task_load = scale_load_to_cpu(task_load(env->p),
+                                                                        cpu);
+                       if (task_load_will_fit(env->p, env->task_load, cpu,
+                                              env->boost_policy))
+                               return cluster;
  
-               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
-               contrib = __compute_runnable_contrib(periods);
-               contrib = cap_scale(contrib, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * contrib;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * contrib;
+                       __set_bit(cluster->id, env->backup_list);
+                       __clear_bit(cluster->id, env->candidate_list);
                 }
-               if (running)
-                       sa->util_sum += contrib * scale_cpu;
         }
  
-       /* Remainder of delta accrued against u_0` */
-       scaled_delta = cap_scale(delta, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * scaled_delta;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * scaled_delta;
+       return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+       /* First set bit in [start, end); find_next_bit() returns end if none. */
+       int cluster_id = find_next_bit(list, end, start);
+
+       if (cluster_id >= end)
+               return NULL;
+
+       return sched_cluster[cluster_id];
+}
+
+static void
+update_spare_capacity(struct cluster_cpu_stats *stats,
+                     struct cpu_select_env *env, int cpu, int capacity,
+                     u64 cpu_load)
+{
+       s64 spare_capacity = sched_ravg_window - cpu_load;
+
+       if (spare_capacity > 0 &&
+           (spare_capacity > stats->highest_spare_capacity ||
+            (spare_capacity == stats->highest_spare_capacity &&
+             ((!env->need_waker_cluster &&
+               capacity > cpu_capacity(stats->best_capacity_cpu)) ||
+              (env->need_waker_cluster &&
+               cpu_rq(cpu)->nr_running <
+               cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
+               /*
+                * If the sync waker is the only runnable task on a CPU, that
+                * CPU's cr_avg is 0, so the wakee is likely to be placed on
+                * the waker's CPU, which likely causes preemption of the waker
+                * and can lead to migration of the preempted waker.  Prefer a
+                * really idle CPU when possible by checking nr_running, to
+                * avoid such preemption.
+                */
+               stats->highest_spare_capacity = spare_capacity;
+               stats->best_capacity_cpu = cpu;
         }
-       if (running)
-               sa->util_sum += scaled_delta * scale_cpu;
+}
  
-       sa->period_contrib += delta;
+static inline void find_backup_cluster(
+struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+       struct sched_cluster *next = NULL;
+       int i;
+       struct cpumask search_cpus;
  
-       if (decayed) {
-               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_avg =
-                               div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+       while (!bitmap_empty(env->backup_list, num_clusters)) {
+               next = next_candidate(env->backup_list, 0, num_clusters);
+               __clear_bit(next->id, env->backup_list);
+
+               cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
+               for_each_cpu(i, &search_cpus) {
+                       trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+                       sched_irqload(i), power_cost(i, task_load(env->p) +
+                                       cpu_cravg_sync(i, env->sync)), 0);
+
+                       update_spare_capacity(stats, env, i, next->capacity,
+                                         cpu_load_sync(i, env->sync));
                 }
-               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+               env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
         }
-
-       return decayed;
  }
  
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do {                           \
-       typeof(_ptr) ptr = (_ptr);                              \
-       typeof(_val) val = (_val);                              \
-       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-                                                               \
-       res = var + val;                                        \
-                                                               \
-       if (val < 0 && res > var)                               \
-               res = 0;                                        \
-                                                               \
-       WRITE_ONCE(*ptr, res);                                  \
-} while (0)
+/* Drop @cluster from the candidates and return the next usable one, or NULL. */
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
+                                       struct cluster_cpu_stats *stats)
+{
+       struct sched_cluster *next = NULL;
+
+       __clear_bit(cluster->id, env->candidate_list);
+
+       if (env->rtg && preferred_cluster(cluster, env->p))
+               return NULL;
+
+       do {
+               if (bitmap_empty(env->candidate_list, num_clusters))
+                       return NULL;
+
+               next = next_candidate(env->candidate_list, 0, num_clusters);
+               if (next) {
+                       if (next->min_power_cost > stats->min_cost) {
+                               __clear_bit(next->id, env->candidate_list);
+                               next = NULL;
+                               continue;
+                       }
+                       if (skip_cluster(next, env))
+                               next = NULL;
+               }
+       } while (!next);
+
+       env->task_load = scale_load_to_cpu(task_load(env->p),
+                                       cluster_first_cpu(next));
+       return next;
+}
+
+#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+                                  struct cpu_select_env *env, int cpu_cost)
+{
+       int wakeup_latency;
+       int prev_cpu = env->prev_cpu;
+
+       wakeup_latency = cpu_rq(cpu)->wakeup_latency;
+
+       if (env->need_idle) {
+               stats->min_cost = cpu_cost;
+               if (idle_cpu(cpu)) {
+                       if (wakeup_latency < stats->best_cpu_wakeup_latency ||
+                           (wakeup_latency == stats->best_cpu_wakeup_latency &&
+                            cpu == prev_cpu)) {
+                               stats->best_idle_cpu = cpu;
+                               stats->best_cpu_wakeup_latency = wakeup_latency;
+                       }
+               } else {
+                       if (env->cpu_load < stats->min_load ||
+                               (env->cpu_load == stats->min_load &&
+                                                       cpu == prev_cpu)) {
+                               stats->least_loaded_cpu = cpu;
+                               stats->min_load = env->cpu_load;
+                       }
+               }
+
+               return;
+       }
+
+       if (cpu_cost < stats->min_cost)  {
+               stats->min_cost = cpu_cost;
+               stats->best_cpu_wakeup_latency = wakeup_latency;
+               stats->best_load = env->cpu_load;
+               stats->best_cpu = cpu;
+               env->sbc_best_flag = SBC_FLAG_CPU_COST;
+               return;
+       }
+
+       /* CPU cost is the same. Start breaking the tie by C-state */
+
+       if (wakeup_latency > stats->best_cpu_wakeup_latency)
+               return;
+
+       if (wakeup_latency < stats->best_cpu_wakeup_latency) {
+               stats->best_cpu_wakeup_latency = wakeup_latency;
+               stats->best_load = env->cpu_load;
+               stats->best_cpu = cpu;
+               env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
+               return;
+       }
+
+       /* C-state is the same. Use prev CPU to break the tie */
+       if (cpu == prev_cpu) {
+               stats->best_cpu = cpu;
+               env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
+               return;
+       }
+
+       if (stats->best_cpu != prev_cpu &&
+           ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
+           (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
+               stats->best_load = env->cpu_load;
+               stats->best_cpu = cpu;
+               env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
+       }
+}
+#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+                                  struct cpu_select_env *env, int cpu_cost)
+{
+       int prev_cpu = env->prev_cpu;
+
+       if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+               if (stats->best_sibling_cpu_cost > cpu_cost ||
+                   (stats->best_sibling_cpu_cost == cpu_cost &&
+                    stats->best_sibling_cpu_load > env->cpu_load)) {
+                       stats->best_sibling_cpu_cost = cpu_cost;
+                       stats->best_sibling_cpu_load = env->cpu_load;
+                       stats->best_sibling_cpu = cpu;
+               }
+       }
+
+       if ((cpu_cost < stats->min_cost) ||
+           ((stats->best_cpu != prev_cpu &&
+             stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
+               if (env->need_idle) {
+                       if (idle_cpu(cpu)) {
+                               stats->min_cost = cpu_cost;
+                               stats->best_idle_cpu = cpu;
+                       }
+               } else {
+                       stats->min_cost = cpu_cost;
+                       stats->min_load = env->cpu_load;
+                       stats->best_cpu = cpu;
+                       env->sbc_best_flag = SBC_FLAG_MIN_COST;
+               }
+       }
+}
+#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+                                        struct cpu_select_env *env)
+{
+       int cpu_cost;
+
+       /*
+        * We try to find the least loaded *busy* CPU irrespective
+        * of the power cost.
+        */
+       if (env->pack_task)
+               cpu_cost = cpu_min_power_cost(cpu);
+
+       else
+               cpu_cost = power_cost(cpu, task_load(env->p) +
+                               cpu_cravg_sync(cpu, env->sync));
+
+       if (cpu_cost <= stats->min_cost)
+               __update_cluster_stats(cpu, stats, env, cpu_cost);
+}
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+        struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+       int i;
+       struct cpumask search_cpus;
+
+       cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
+
+       env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
+
+       for_each_cpu(i, &search_cpus) {
+               env->cpu_load = cpu_load_sync(i, env->sync);
+
+               trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+                       sched_irqload(i),
+                       power_cost(i, task_load(env->p) +
+                                       cpu_cravg_sync(i, env->sync)), 0);
+
+               if (skip_cpu(i, env))
+                       continue;
+
+               update_spare_capacity(stats, env, i, c->capacity,
+                                     env->cpu_load);
+
+               /*
+                * need_idle takes precedence over sched boost, but when both
+                * are set, the idlest CPU within all the clusters is selected
+                * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in
+                * the big cluster is selected when boost_policy = BOOST_ON_BIG.
+                */
+               if ((!env->need_idle &&
+                   env->boost_policy != SCHED_BOOST_NONE) ||
+                   env->need_waker_cluster ||
+                   sched_cpu_high_irqload(i) ||
+                   spill_threshold_crossed(env, cpu_rq(i)))
+                       continue;
+
+               update_cluster_stats(i, stats, env);
+       }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+       stats->best_cpu = stats->best_idle_cpu = -1;
+       stats->best_capacity_cpu = stats->best_sibling_cpu  = -1;
+       stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+       stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+       stats->highest_spare_capacity = 0;
+       stats->least_loaded_cpu = -1;
+       stats->best_cpu_wakeup_latency = INT_MAX;
+       /* No need to initialize stats->best_load */
+}
+
+/* True when need_idle, a non-NONE boost policy or a non-zero reason is set. */
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+       if (env->need_idle)
+               return true;
+
+       return env->boost_policy != SCHED_BOOST_NONE || env->reason;
+}
+
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+       int prev_cpu;
+       struct task_struct *task = env->p;
+       struct sched_cluster *cluster;
+
+       if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
+               return false;
+
+       prev_cpu = env->prev_cpu;
+       if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
+               return false;
+
+       if (task->ravg.mark_start - task->last_cpu_selected_ts >=
+                               sched_long_cpu_selection_threshold)
+               return false;
+
+       /*
+        * This function should be used by task wake up path only as it's
+        * assuming p->last_switch_out_ts as last sleep time.
+        * p->last_switch_out_ts can denote last preemption time as well as
+        * last sleep time.
+        */
+       if (task->ravg.mark_start - task->last_switch_out_ts >=
+                                       sched_short_sleep_task_threshold)
+               return false;
+
+       env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+       cluster = cpu_rq(prev_cpu)->cluster;
+
+       if (!task_load_will_fit(task, env->task_load, prev_cpu,
+                               sched_boost_policy())) {
+
+               __set_bit(cluster->id, env->backup_list);
+               __clear_bit(cluster->id, env->candidate_list);
+               return false;
+       }
+
+       env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+       if (sched_cpu_high_irqload(prev_cpu) ||
+                       spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+               update_spare_capacity(stats, env, prev_cpu,
+                               cluster->capacity, env->cpu_load);
+               cpumask_clear_cpu(prev_cpu, &env->search_cpus);
+               return false;
+       }
+
+       return true;
+}
+
+/* Sync wakeup with current above the big-waker and p below the small-wakee load. */
+static inline bool wake_to_waker_cluster(struct cpu_select_env *env)
+{
+       if (!env->sync || task_load(current) <= sched_big_waker_task_load)
+               return false;
+       return task_load(env->p) < sched_small_wakee_task_load;
+}
+
+static inline bool
+bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
+{
+       return sysctl_sched_prefer_sync_wakee_to_waker &&
+              cpu_rq(cpu)->nr_running == 1 &&
+              cpumask_test_cpu(cpu, &env->search_cpus);
+}
+
+static inline int
+cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
+{
+       return cpumask_intersects(&env->search_cpus, &cluster->cpus);
+}
+
+/*
+ * select_best_cpu() - return cheapest cpu that can fit this task
+ * @p:      task being placed
+ * @target: cpu returned when no better candidate is found; it is also
+ *          recorded as env.prev_cpu
+ * @reason: stored in env.reason; callers in this file pass 0 on wakeup and
+ *          the migration_needed() result when migrating a running task
+ * @sync:   stored in env.sync; non-zero enables the waker cluster/cpu bias
+ *
+ * The search mask is the task's allowed cpus restricted to active,
+ * non-isolated cpus. The whole search runs inside an RCU read-side section
+ * and the outcome is reported through trace_sched_task_load().
+ *
+ * Returns the selected cpu, or @target unchanged if no candidate was found.
+ */
+static int select_best_cpu(struct task_struct *p, int target, int reason,
+                          int sync)
+{
+       struct sched_cluster *cluster, *pref_cluster = NULL;
+       struct cluster_cpu_stats stats;
+       struct related_thread_group *grp;
+       unsigned int sbc_flag = 0;
+       int cpu = raw_smp_processor_id();
+       bool special;
+
+       struct cpu_select_env env = {
+               .p                      = p,
+               .reason                 = reason,
+               .need_idle              = wake_to_idle(p),
+               .need_waker_cluster     = 0,
+               .sync                   = sync,
+               .prev_cpu               = target,
+               .rtg                    = NULL,
+               .sbc_best_flag          = 0,
+               .sbc_best_cluster_flag  = 0,
+               .pack_task              = false,
+       };
+
+       env.boost_policy = task_sched_boost(p) ?
+                       sched_boost_policy() : SCHED_BOOST_NONE;
+
+       /* Start with every cluster as a candidate and no backup cluster */
+       bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+       bitmap_zero(env.backup_list, NR_CPUS);
+
+       cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
+       cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
+
+       init_cluster_cpu_stats(&stats);
+       special = env_has_special_flags(&env);
+
+       rcu_read_lock();
+
+       grp = task_related_thread_group(p);
+
+       if (grp && grp->preferred_cluster) {
+               pref_cluster = grp->preferred_cluster;
+               /*
+                * env lives on this function's stack, so the non-atomic
+                * bitop is sufficient here, as for the other candidate_list
+                * and backup_list updates in this file.
+                */
+               if (!cluster_allowed(&env, pref_cluster))
+                       __clear_bit(pref_cluster->id, env.candidate_list);
+               else
+                       env.rtg = grp;
+       } else if (!special) {
+               cluster = cpu_rq(cpu)->cluster;
+               if (wake_to_waker_cluster(&env)) {
+                       if (bias_to_waker_cpu(&env, cpu)) {
+                               /* Place the wakee on the waker's cpu */
+                               target = cpu;
+                               sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+                                          SBC_FLAG_WAKER_CPU;
+                               goto out;
+                       } else if (cluster_allowed(&env, cluster)) {
+                               /* Restrict the search to the waker's cluster */
+                               env.need_waker_cluster = 1;
+                               bitmap_zero(env.candidate_list, NR_CPUS);
+                               __set_bit(cluster->id, env.candidate_list);
+                               env.sbc_best_cluster_flag =
+                                                       SBC_FLAG_WAKER_CLUSTER;
+                       }
+               } else if (bias_to_prev_cpu(&env, &stats)) {
+                       /* Keep the task on @target, i.e. env.prev_cpu */
+                       sbc_flag = SBC_FLAG_PREV_CPU;
+                       goto out;
+               }
+       }
+
+       if (!special && is_short_burst_task(p)) {
+               env.pack_task = true;
+               sbc_flag = SBC_FLAG_PACK_TASK;
+       }
+retry:
+       cluster = select_least_power_cluster(&env);
+
+       if (!cluster)
+               goto out;
+
+       /*
+        * 'cluster' now points to the minimum power cluster which can satisfy
+        * task's perf goals. Walk down the cluster list starting with that
+        * cluster. For non-small tasks, skip clusters that don't have
+        * mostly_idle/idle cpus
+        */
+
+       do {
+               find_best_cpu_in_cluster(cluster, &env, &stats);
+
+       } while ((cluster = next_best_cluster(cluster, &env, &stats)));
+
+       if (env.need_idle) {
+               if (stats.best_idle_cpu >= 0) {
+                       target = stats.best_idle_cpu;
+                       sbc_flag |= SBC_FLAG_IDLE_CSTATE;
+               } else if (stats.least_loaded_cpu >= 0) {
+                       target = stats.least_loaded_cpu;
+                       sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
+               }
+       } else if (stats.best_cpu >= 0) {
+               /*
+                * Prefer the best sibling cpu when it costs the same as
+                * best_cpu and best_cpu is not the cpu the task is on.
+                */
+               if (stats.best_sibling_cpu >= 0 &&
+                               stats.best_cpu != task_cpu(p) &&
+                               stats.min_cost == stats.best_sibling_cpu_cost) {
+                       stats.best_cpu = stats.best_sibling_cpu;
+                       sbc_flag |= SBC_FLAG_BEST_SIBLING;
+               }
+               sbc_flag |= env.sbc_best_flag;
+               target = stats.best_cpu;
+       } else {
+               /* Retry once without the related thread group restriction */
+               if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
+                       env.rtg = NULL;
+                       goto retry;
+               }
+
+               /*
+                * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
+                * backup_list = little cluster, candidate_list = none and
+                * stats->best_capacity_cpu points the best spare capacity
+                * CPU among the CPUs in the big cluster.
+                */
+               if (env.boost_policy == SCHED_BOOST_ON_BIG &&
+                   stats.best_capacity_cpu >= 0)
+                       sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
+               else
+                       find_backup_cluster(&env, &stats);
+
+               if (stats.best_capacity_cpu >= 0) {
+                       target = stats.best_capacity_cpu;
+                       sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
+               }
+       }
+       /* Not reached by the 'goto out' paths above, which skip this stamp */
+       p->last_cpu_selected_ts = sched_ktime_clock();
+out:
+       sbc_flag |= env.sbc_best_cluster_flag;
+       rcu_read_unlock();
+       trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
+               env.reason, env.sync, env.need_idle, sbc_flag, target);
+       return target;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+/*
+ * Return the task_group following @tg on the task_groups list, or NULL once
+ * the walk has wrapped around to the list head.
+ */
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+       struct task_group *next;
+
+       next = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+       if (&next->list == &task_groups)
+               return NULL;
+
+       return next;
+}
+
+/*
+ * Iterate over all cfs_rq in a cpu: 'tg' walks the task_groups list through
+ * next_task_group() and 'cfs_rq' is set to tg->cfs_rq[cpu] on every step.
+ * The loop ends at the end of the list, or at the first task group whose
+ * cfs_rq[cpu] pointer is NULL.
+ */
+#define for_each_cfs_rq(cfs_rq, tg, cpu)       \
+       for (tg = container_of(&task_groups, struct task_group, list);  \
+               ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+/* Call reset_hmp_stats() on the hmp_stats of every cfs_rq of @cpu */
+void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       /* The task_groups list is walked inside an RCU read-side section */
+       rcu_read_lock();
+       for_each_cfs_rq(cfs_rq, tg, cpu) {
+               reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+       }
+       rcu_read_unlock();
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra);
+
+/* Add task's contribution to a cpu's HMP statistics */
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+                       struct task_struct *p, int change_cra)
+{
+       struct sched_entity *se = &p->se;
+
+       /*
+        * The helpers reached through inc_cfs_rq_hmp_stats() perform similar
+        * checks, so this test is not strictly required; returning early
+        * simply avoids the for_each_sched_entity() walk when
+        * sched_disable_window_stats is set.
+        */
+       if (sched_disable_window_stats)
+               return;
+
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+       }
+
+       /* A non-NULL se means the walk stopped at a throttled cfs_rq */
+       if (se)
+               return;
+
+       inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Remove task's contribution from a cpu's HMP statistics */
+static void
+_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+       struct sched_entity *se = &p->se;
+
+       /* Same efficiency shortcut as in _inc_hmp_sched_stats_fair */
+       if (sched_disable_window_stats)
+               return;
+
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+       }
+
+       /* A non-NULL se means the walk stopped at a throttled cfs_rq */
+       if (se)
+               return;
+
+       dec_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Add @p to the HMP stats with change_cra = 1 */
+static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+       _inc_hmp_sched_stats_fair(rq, p, 1);
+}
+
+/* Remove @p from the HMP stats with change_cra = 1 */
+static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+       _dec_hmp_sched_stats_fair(rq, p, 1);
+}
+
+/*
+ * Apply the task's load change (new_task_load - task_load(p)) and the
+ * PRED_DEMAND_DELTA value to the HMP stats of every cfs_rq in the task's
+ * hierarchy, and to the rq stats unless a throttled cfs_rq ends the walk.
+ */
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+                                      u32 new_task_load, u32 new_pred_demand)
+{
+       struct sched_entity *se = &p->se;
+       s64 task_load_delta = (s64)new_task_load - task_load(p);
+       s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               struct hmp_sched_stats *cfs_stats = &cfs_rq->hmp_stats;
+
+               fixup_cumulative_runnable_avg(cfs_stats, p, task_load_delta,
+                                             pred_demand_delta);
+               fixup_nr_big_tasks(cfs_stats, p, task_load_delta);
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+       }
+
+       /* A non-NULL se means the walk stopped at a throttled cfs_rq */
+       if (se)
+               return;
+
+       fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+                                     pred_demand_delta);
+       fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static int task_will_be_throttled(struct task_struct *p);
+
+#else  /* CONFIG_CFS_BANDWIDTH */
+
+/* No-op when CONFIG_CFS_BANDWIDTH is not set */
+inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+/* Account @p in the rq-level HMP stats (big task count and runnable avg) */
+static void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+       struct hmp_sched_stats *rq_stats = &rq->hmp_stats;
+
+       inc_nr_big_task(rq_stats, p);
+       inc_cumulative_runnable_avg(rq_stats, p);
+}
+
+/* Remove @p from the rq-level HMP stats (big task count and runnable avg) */
+static void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+       struct hmp_sched_stats *rq_stats = &rq->hmp_stats;
+
+       dec_nr_big_task(rq_stats, p);
+       dec_cumulative_runnable_avg(rq_stats, p);
+}
+/*
+ * Apply the task's load change (new_task_load - task_load(p)) and the
+ * PRED_DEMAND_DELTA value to the rq-level HMP stats.
+ */
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+                          u32 new_task_load, u32 new_pred_demand)
+{
+       s64 task_load_delta = (s64)new_task_load - task_load(p);
+       s64 pred_demand_delta = PRED_DEMAND_DELTA;
+       struct hmp_sched_stats *rq_stats = &rq->hmp_stats;
+
+       fixup_cumulative_runnable_avg(rq_stats, p, task_load_delta,
+                                     pred_demand_delta);
+       fixup_nr_big_tasks(rq_stats, p, task_load_delta);
+}
+
+/* Without CONFIG_CFS_BANDWIDTH no task is reported as about to be throttled */
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+       return 0;
+}
+
+/*
+ * !CONFIG_CFS_BANDWIDTH variant: only the rq-level big task count is
+ * updated; change_cra is not used here.
+ */
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+                       struct task_struct *p, int change_cra)
+{
+       inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Zero balance_interval at every sched_domain level of the given cpu, so
+ * that it honors kick. Nothing is done for cpu >= nr_cpu_ids.
+ */
+static inline void reset_balance_interval(int cpu)
+{
+       struct sched_domain *sd;
+
+       if (cpu < nr_cpu_ids) {
+               rcu_read_lock();
+               for_each_domain(cpu, sd) {
+                       sd->balance_interval = 0;
+               }
+               rcu_read_unlock();
+       }
+}
+
+/*
+ * Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal
+ * cpu as per its demand or priority)
+ *
+ * Returns the reason why the task needs to be migrated (UP_MIGRATION,
+ * DOWN_MIGRATION or IRQLOAD_MIGRATION), or 0 when no migration is needed.
+ */
+static inline int migration_needed(struct task_struct *p, int cpu)
+{
+       struct related_thread_group *grp;
+       int reason = 0;
+       int nice;
+
+       if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
+               return 0;
+
+       /* No need to migrate task that is about to be throttled */
+       if (task_will_be_throttled(p))
+               return 0;
+
+       if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
+                cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
+               return UP_MIGRATION;
+
+       if (sched_cpu_high_irqload(cpu))
+               return IRQLOAD_MIGRATION;
+
+       nice = task_nice(p);
+
+       rcu_read_lock();
+       grp = task_related_thread_group(p);
+       /*
+        * Don't assume higher capacity means higher power. If the task
+        * is running on the power efficient CPU, avoid migrating it
+        * to a lower capacity cluster.
+        */
+       if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
+                       upmigrate_discouraged(p)) &&
+                       cpu_capacity(cpu) > min_capacity &&
+                       cpu_max_power_cost(cpu) == max_power_cost)
+               reason = DOWN_MIGRATION;
+       else if (!task_will_fit(p, cpu))
+               reason = UP_MIGRATION;
+       rcu_read_unlock();
+
+       return reason;
+}
+
+/*
+ * Arm an active balance on @rq to push @p to @new_cpu. Takes a reference on
+ * @p and returns 1 when the request was recorded, or 0 when an active balance
+ * is already pending on @rq.
+ */
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+       unsigned long flags;
+       int kicked;
+
+       /* Invoke active balance to force migrate currently running task */
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       kicked = !rq->active_balance;
+       if (kicked) {
+               rq->active_balance = 1;
+               rq->push_cpu = new_cpu;
+               get_task_struct(p);
+               rq->push_task = p;
+       }
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return kicked;
+}
+
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+/*
+ * Decide whether moving from @cpu to @new_cpu is worthwhile for @reason.
+ * UP/DOWN migrations must leave the current cluster; any other reason only
+ * requires the cpu to change, so moves within the same cluster are accepted.
+ */
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+       bool capacity_move = (reason == UP_MIGRATION ||
+                             reason == DOWN_MIGRATION);
+
+       if (capacity_move && same_cluster(new_cpu, cpu))
+               return false;
+
+       return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * When migration_needed() gives a reason, a cpu is picked under
+ * migration_lock; if an active balance could be armed, the chosen cpu is
+ * marked reserved and the stopper is queued on the task's current cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+       int cpu = cpu_of(rq);
+       int active_balance = 0;
+       int new_cpu, reason;
+
+       reason = migration_needed(p, cpu);
+       if (!reason)
+               return;
+
+       raw_spin_lock(&migration_lock);
+
+       new_cpu = select_best_cpu(p, cpu, reason, 0);
+       if (do_migration(reason, new_cpu, cpu) &&
+           kick_active_balance(rq, p, new_cpu)) {
+               active_balance = 1;
+               mark_reserved(new_cpu);
+       }
+
+       raw_spin_unlock(&migration_lock);
+
+       if (!active_balance)
+               return;
+
+       stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+                               &rq->active_balance_work);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+/* Zero all HMP counters of @cfs_rq */
+static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
+{
+       struct hmp_sched_stats *stats = &cfs_rq->hmp_stats;
+
+       stats->nr_big_tasks = 0;
+       stats->cumulative_runnable_avg = 0;
+       stats->pred_demands_sum = 0;
+}
+
+/*
+ * Account @p in the HMP stats of @cfs_rq; the cumulative runnable average is
+ * only touched when @change_cra is non-zero.
+ */
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+                struct task_struct *p, int change_cra)
+{
+       struct hmp_sched_stats *stats = &cfs_rq->hmp_stats;
+
+       inc_nr_big_task(stats, p);
+       if (!change_cra)
+               return;
+
+       inc_cumulative_runnable_avg(stats, p);
+}
+
+/*
+ * Remove @p from the HMP stats of @cfs_rq; the cumulative runnable average is
+ * only touched when @change_cra is non-zero.
+ */
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+                struct task_struct *p, int change_cra)
+{
+       struct hmp_sched_stats *stats = &cfs_rq->hmp_stats;
+
+       dec_nr_big_task(stats, p);
+       if (!change_cra)
+               return;
+
+       dec_cumulative_runnable_avg(stats, p);
+}
+
+/* Add every HMP counter of @cfs_rq to @stats */
+static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+                        struct cfs_rq *cfs_rq)
+{
+       const struct hmp_sched_stats *delta = &cfs_rq->hmp_stats;
+
+       stats->nr_big_tasks += delta->nr_big_tasks;
+       stats->cumulative_runnable_avg += delta->cumulative_runnable_avg;
+       stats->pred_demands_sum += delta->pred_demands_sum;
+}
+
+/*
+ * Subtract every HMP counter of @cfs_rq from @stats and BUG if any of the
+ * resulting counters is negative.
+ */
+static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+                                struct cfs_rq *cfs_rq)
+{
+       const struct hmp_sched_stats *delta = &cfs_rq->hmp_stats;
+
+       stats->nr_big_tasks -= delta->nr_big_tasks;
+       stats->cumulative_runnable_avg -= delta->cumulative_runnable_avg;
+       stats->pred_demands_sum -= delta->pred_demands_sum;
+
+       BUG_ON(stats->nr_big_tasks < 0 ||
+               (s64)stats->cumulative_runnable_avg < 0);
+       BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+#else  /* CONFIG_CFS_BANDWIDTH */
+
+/* No per-cfs_rq HMP accounting when CONFIG_CFS_BANDWIDTH is not set */
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#else  /* CONFIG_SCHED_HMP */
+
+/* Without CONFIG_SCHED_HMP all cfs_rq HMP stats helpers are no-ops */
+static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra) { }
+
+/* These expand to nothing, so their arguments are never evaluated */
+#define dec_throttled_cfs_rq_hmp_stats(...)
+#define inc_throttled_cfs_rq_hmp_stats(...)
+
+#endif /* CONFIG_SCHED_HMP */
+
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+/* Scale v by s, where s is a fixed-point factor with SCHED_CAPACITY_SHIFT fractional bits */
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series.  To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ *      p0            p1           p2
+ *     (now)       (~1ms ago)  (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ *   y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+                 unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+       u64 delta, scaled_delta, periods;
+       u32 contrib;
+       unsigned int delta_w, scaled_delta_w, decayed = 0;
+       unsigned long scale_freq, scale_cpu;
+
+       delta = now - sa->last_update_time;
+       /*
+        * This should only happen when time goes backwards, which it
+        * unfortunately does during sched clock init when we swap over to TSC.
+        */
+       if ((s64)delta < 0) {
+               sa->last_update_time = now;
+               return 0;
+       }
+
+       /*
+        * Use 1024ns as the unit of measurement since it's a reasonable
+        * approximation of 1us and fast to compute.
+        */
+       delta >>= 10;
+       /* Less than one unit elapsed: leave last_update_time untouched */
+       if (!delta)
+               return 0;
+       sa->last_update_time = now;
+
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+       trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
+
+       /* delta_w is the amount already accumulated against our next period */
+       delta_w = sa->period_contrib;
+       if (delta + delta_w >= 1024) {
+               decayed = 1;
+
+               /*
+                * The part belonging to the next period is not known yet;
+                * restart from 0, the remainder of delta is added back below.
+                */
+               sa->period_contrib = 0;
+
+               /*
+                * Now that we know we're crossing a period boundary, figure
+                * out how much from delta we need to complete the current
+                * period and accrue it.
+                */
+               delta_w = 1024 - delta_w;
+               scaled_delta_w = cap_scale(delta_w, scale_freq);
+               if (weight) {
+                       sa->load_sum += weight * scaled_delta_w;
+                       if (cfs_rq) {
+                               cfs_rq->runnable_load_sum +=
+                                               weight * scaled_delta_w;
+                       }
+               }
+               if (running)
+                       sa->util_sum += scaled_delta_w * scale_cpu;
+
+               delta -= delta_w;
+
+               /* Figure out how many additional periods this update spans */
+               periods = delta / 1024;
+               delta %= 1024;
+
+               /* Decay the sums by the just-completed period plus 'periods' */
+               sa->load_sum = decay_load(sa->load_sum, periods + 1);
+               if (cfs_rq) {
+                       cfs_rq->runnable_load_sum =
+                               decay_load(cfs_rq->runnable_load_sum, periods + 1);
+               }
+               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+
+               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+               contrib = __compute_runnable_contrib(periods);
+               contrib = cap_scale(contrib, scale_freq);
+               if (weight) {
+                       sa->load_sum += weight * contrib;
+                       if (cfs_rq)
+                               cfs_rq->runnable_load_sum += weight * contrib;
+               }
+               if (running)
+                       sa->util_sum += contrib * scale_cpu;
+       }
+
+       /* Remainder of delta accrued against u_0` */
+       scaled_delta = cap_scale(delta, scale_freq);
+       if (weight) {
+               sa->load_sum += weight * scaled_delta;
+               if (cfs_rq)
+                       cfs_rq->runnable_load_sum += weight * scaled_delta;
+       }
+
+       if (running)
+               sa->util_sum += scaled_delta * scale_cpu;
+
+       sa->period_contrib += delta;
+
+       /* The averages are only recomputed when a period boundary was crossed */
+       if (decayed) {
+               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+               if (cfs_rq) {
+                       cfs_rq->runnable_load_avg =
+                               div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+               }
+               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+       }
+
+       /* 1 when at least one period boundary was crossed, 0 otherwise */
+       return decayed;
+}
+
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Adds _val to *_ptr and stores 0 instead when _val is negative and the sum
+ * comes out larger than the value that was read, i.e. the addition wrapped.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {                           \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(_val) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+                                                               \
+       res = var + val;                                        \
+                                                               \
+       if (val < 0 && res > var)                               \
+               res = 0;                                        \
+                                                               \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /**
@@ -3404,6 +4506,12 @@ static inline int idle_balance(struct rq *rq)
         return 0;
  }
  
+/* No-op variants for the branch closed by the CONFIG_SMP #endif below */
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+        struct task_struct *p, int change_cra) { }
+
  #endif /* CONFIG_SMP */
  
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4030,6 +5138,35 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
         return cfs_bandwidth_used() && cfs_rq->throttled;
  }
  
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Check if task is part of a hierarchy where some cfs_rq does not have any
+ * runtime left.
+ *
+ * We can't rely on throttled_hierarchy() to do this test, as
+ * cfs_rq->throttle_count will not be updated yet when this function is called
+ * from scheduler_tick()
+ *
+ * Returns 1 if such a cfs_rq is found, 0 otherwise (including when cfs
+ * bandwidth control is not in use).
+ */
+static int task_will_be_throttled(struct task_struct *p)
+{
+       struct sched_entity *se;
+
+       if (!cfs_bandwidth_used())
+               return 0;
+
+       se = &p->se;
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               /* Only cfs_rqs with runtime accounting enabled can run dry */
+               if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+                       return 1;
+       }
+
+       return 0;
+}
+#endif
+
  /* check whether cfs_rq, or any parent, is throttled */
  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  {
@@ -4109,13 +5246,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                 if (dequeue)
                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                 qcfs_rq->h_nr_running -= task_delta;
+               dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
  
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
         }
  
-       if (!se)
+       if (!se) {
                 sub_nr_running(rq, task_delta);
+               dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
+       }
  
         cfs_rq->throttled = 1;
         cfs_rq->throttled_clock = rq_clock(rq);
@@ -4140,6 +5280,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                 start_cfs_bandwidth(cfs_b);
  
         raw_spin_unlock(&cfs_b->lock);
+
+       /* Log effect on hmp stats after throttling */
+       trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+                            sched_irqload(cpu_of(rq)),
+                            power_cost(cpu_of(rq), 0),
+                            cpu_temp(cpu_of(rq)));
  }
  
  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4149,6 +5295,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         struct sched_entity *se;
         int enqueue = 1;
         long task_delta;
+       struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
  
         se = cfs_rq->tg->se[cpu_of(rq)];
  
@@ -4176,17 +5323,26 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                 if (enqueue)
                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                 cfs_rq->h_nr_running += task_delta;
+               inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
         }
  
-       if (!se)
+       if (!se) {
                 add_nr_running(rq, task_delta);
+               inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
+       }
  
         /* determine whether we need to wake up potentially idle cpu */
         if (rq->curr == rq->idle && rq->cfs.nr_running)
                 resched_curr(rq);
+
+       /* Log effect on hmp stats after un-throttling */
+       trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+                            sched_irqload(cpu_of(rq)),
+                            power_cost(cpu_of(rq), 0),
+                            cpu_temp(cpu_of(rq)));
  }
  
  static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
@@ -4564,6 +5720,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
         cfs_rq->runtime_enabled = 0;
         INIT_LIST_HEAD(&cfs_rq->throttled_list);
+       init_cfs_rq_hmp_stats(cfs_rq);
  }
  
  void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4679,7 +5836,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  
         WARN_ON(task_rq(p) != rq);
  
-       if (cfs_rq->nr_running > 1) {
+       if (rq->cfs.h_nr_running > 1) {
                 u64 slice = sched_slice(cfs_rq, se);
                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
                 s64 delta = slice - ran;
@@ -4695,8 +5852,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  
  /*
   * called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * current task is from our class.
   */
  static void hrtick_update(struct rq *rq)
  {
@@ -4705,8 +5861,7 @@ static void hrtick_update(struct rq *rq)
         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
                 return;
  
-       if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
-               hrtick_start_fair(rq, curr);
+       hrtick_start_fair(rq, curr);
  }
  #else /* !CONFIG_SCHED_HRTICK */
  static inline void
@@ -4764,7 +5919,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running++;
-               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+               inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                 flags = ENQUEUE_WAKEUP;
         }
@@ -4772,7 +5927,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running++;
-               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+               inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@ -4781,8 +5936,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 update_cfs_shares(se);
         }
  
-       if (!se)
+       if (!se) {
                 add_nr_running(rq, 1);
+               inc_rq_hmp_stats(rq, p, 1);
+       }
  
  #ifdef CONFIG_SMP
  
@@ -4805,8 +5962,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
          */
         schedtune_enqueue_task(p, cpu_of(rq));
  
-       if (!se) {
-               walt_inc_cumulative_runnable_avg(rq, p);
+       if (energy_aware() && !se) {
                 if (!task_new && !rq->rd->overutilized &&
                     cpu_overutilized(rq->cpu)) {
                         rq->rd->overutilized = true;
@@ -4844,7 +6000,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running--;
-               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+               dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
@@ -4864,7 +6020,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running--;
-               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+               dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
@@ -4873,8 +6029,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 update_cfs_shares(se);
         }
  
-       if (!se)
+       if (!se) {
                 sub_nr_running(rq, 1);
+               dec_rq_hmp_stats(rq, p, 1);
+       }
  
  #ifdef CONFIG_SMP
  
@@ -4887,8 +6045,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
          */
         schedtune_dequeue_task(p, cpu_of(rq));
  
-       if (!se)
-               walt_dec_cumulative_runnable_avg(rq, p);
  #endif /* CONFIG_SMP */
  
         hrtick_update(rq);
@@ -5301,11 +6457,6 @@ unsigned long capacity_curr_of(int cpu)
                >> SCHED_CAPACITY_SHIFT;
  }
  
-static inline bool energy_aware(void)
-{
-       return sched_feat(ENERGY_AWARE);
-}
-
  struct energy_env {
         struct sched_group      *sg_top;
         struct sched_group      *sg_cap;
@@ -5905,12 +7056,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
  
  static inline unsigned long task_util(struct task_struct *p)
  {
-#ifdef CONFIG_SCHED_WALT
-       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
-               unsigned long demand = p->ravg.demand;
-               return (demand << 10) / walt_ravg_window;
-       }
-#endif
         return p->se.avg.util_avg;
  }
  
@@ -6295,6 +7440,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
                 }
         }
  
+       if (!(current->flags & PF_WAKE_UP_IDLE) &&
+                       !(p->flags & PF_WAKE_UP_IDLE))
+               return target;
+
         /*
          * Otherwise, iterate the domains and find an elegible idle cpu.
          */
@@ -6819,6 +7968,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         int want_affine = 0;
         int sync = wake_flags & WF_SYNC;
  
+#ifdef CONFIG_SCHED_HMP
+       return select_best_cpu(p, prev_cpu, 0, sync);
+#endif
+
         if (sd_flag & SD_BALANCE_WAKE) {
                 record_wakee(p);
                 want_affine = !wake_wide(p, sibling_count_hint) &&
@@ -7405,6 +8558,10 @@ enum group_type {
  #define LBF_NEED_BREAK 0x02
  #define LBF_DST_PINNED  0x04
  #define LBF_SOME_PINNED        0x08
+#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
+#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
  
  struct lb_env {
         struct sched_domain     *sd;
@@ -7422,6 +8579,8 @@ struct lb_env {
         unsigned int            src_grp_nr_running;
         /* The set of CPUs under consideration for load-balancing */
         struct cpumask          *cpus;
+       unsigned int            busiest_grp_capacity;
+       unsigned int            busiest_nr_running;
  
         unsigned int            flags;
  
@@ -7432,6 +8591,7 @@ struct lb_env {
         enum fbq_type           fbq_type;
         enum group_type         busiest_group_type;
         struct list_head        tasks;
+       enum sched_boost_policy boost_policy;
  };
  
  /*
@@ -7529,6 +8689,7 @@ static
  int can_migrate_task(struct task_struct *p, struct lb_env *env)
  {
         int tsk_cache_hot;
+       int twf, group_cpus;
  
         lockdep_assert_held(&env->src_rq->lock);
  
@@ -7575,6 +8736,39 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         /* Record that we found atleast one task that could run on dst_cpu */
         env->flags &= ~LBF_ALL_PINNED;
  
+       if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
+               if (nr_big_tasks(env->src_rq) && !is_big_task(p))
+                       return 0;
+
+               if (env->boost_policy == SCHED_BOOST_ON_BIG &&
+                                       !task_sched_boost(p))
+                       return 0;
+       }
+
+       twf = task_will_fit(p, env->dst_cpu);
+
+       /*
+        * Attempt to not pull tasks that don't fit. We may get lucky and find
+        * one that actually fits.
+        */
+       if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
+               return 0;
+
+       if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+           !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
+               return 0;
+
+       /*
+        * Group imbalance can sometimes cause work to be pulled across groups
+        * even though the group could have managed the imbalance on its own.
+        * Prevent inter-cluster migrations for big tasks when the number of
+        * tasks is lower than the capacity of the group.
+        */
+       group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
+                                                SCHED_CAPACITY_SCALE);
+       if (!twf && env->busiest_nr_running <= group_cpus)
+               return 0;
+
         if (task_running(env->src_rq, p)) {
                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
                 return 0;
@@ -7582,15 +8776,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
  
         /*
          * Aggressive migration if:
-        * 1) destination numa is preferred
-        * 2) task is cache cold, or
-        * 3) too many balance attempts have failed.
+        * 1) IDLE or NEWLY_IDLE balance.
+        * 2) destination numa is preferred
+        * 3) task is cache cold, or
+        * 4) too many balance attempts have failed.
          */
         tsk_cache_hot = migrate_degrades_locality(p, env);
         if (tsk_cache_hot == -1)
                 tsk_cache_hot = task_hot(p, env);
  
-       if (tsk_cache_hot <= 0 ||
+       if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
                 if (tsk_cache_hot == 1) {
                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
@@ -7610,10 +8805,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  {
         lockdep_assert_held(&env->src_rq->lock);
  
-       deactivate_task(env->src_rq, p, 0);
         p->on_rq = TASK_ON_RQ_MIGRATING;
+       deactivate_task(env->src_rq, p, 0);
         double_lock_balance(env->src_rq, env->dst_rq);
         set_task_cpu(p, env->dst_cpu);
+       if (task_in_related_thread_group(p))
+               env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
         double_unlock_balance(env->src_rq, env->dst_rq);
  }
  
@@ -7642,6 +8839,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
                  * inside detach_tasks().
                  */
                 schedstat_inc(env->sd, lb_gained[env->idle]);
+
                 return p;
         }
         return NULL;
@@ -7661,12 +8859,20 @@ static int detach_tasks(struct lb_env *env)
         struct task_struct *p;
         unsigned long load;
         int detached = 0;
+       int orig_loop = env->loop;
  
         lockdep_assert_held(&env->src_rq->lock);
  
         if (env->imbalance <= 0)
                 return 0;
  
+       if (!same_cluster(env->dst_cpu, env->src_cpu))
+               env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
+
+       if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
+               env->flags |= LBF_IGNORE_BIG_TASKS;
+
+redo:
         while (!list_empty(tasks)) {
                 /*
                  * We don't want to steal all, otherwise we may be treated likewise,
@@ -7728,6 +8934,15 @@ next:
                 list_move_tail(&p->se.group_node, tasks);
         }
  
+       if (env->flags & (LBF_IGNORE_BIG_TASKS |
+                       LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
+               tasks = &env->src_rq->cfs_tasks;
+               env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+                               LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
+               env->loop = orig_loop;
+               goto redo;
+       }
+
         /*
          * Right now, this is one of only two places we collect this stat
          * so we can safely collect detach_one_task() stats here rather
@@ -7746,8 +8961,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
         lockdep_assert_held(&rq->lock);
  
         BUG_ON(task_rq(p) != rq);
-       p->on_rq = TASK_ON_RQ_QUEUED;
         activate_task(rq, p, 0);
+       p->on_rq = TASK_ON_RQ_QUEUED;
         check_preempt_curr(rq, p, 0);
  }
  
@@ -7891,6 +9106,10 @@ struct sg_lb_stats {
         unsigned long group_capacity;
         unsigned long group_util; /* Total utilization of the group */
         unsigned int sum_nr_running; /* Nr tasks running in the group */
+#ifdef CONFIG_SCHED_HMP
+       unsigned long sum_nr_big_tasks;
+       u64 group_cpu_load; /* Scaled load of all CPUs of the group */
+#endif
         unsigned int idle_cpus;
         unsigned int group_weight;
         enum group_type group_type;
@@ -7934,10 +9153,64 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
                         .avg_load = 0UL,
                         .sum_nr_running = 0,
                         .group_type = group_other,
+#ifdef CONFIG_SCHED_HMP
+                       .sum_nr_big_tasks = 0UL,
+                       .group_cpu_load = 0ULL,
+#endif
                 },
         };
  }
  
+#ifdef CONFIG_SCHED_HMP
+
+static int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+       int local_cpu, busiest_cpu;
+       int local_capacity, busiest_capacity;
+       int local_pwr_cost, busiest_pwr_cost;
+       int nr_cpus;
+       int boost = sched_boost();
+
+       if (!sysctl_sched_restrict_cluster_spill ||
+               boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
+               return 0;
+
+       local_cpu = group_first_cpu(sds->local);
+       busiest_cpu = group_first_cpu(sds->busiest);
+
+       local_capacity = cpu_max_possible_capacity(local_cpu);
+       busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+
+       local_pwr_cost = cpu_max_power_cost(local_cpu);
+       busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
+
+       if (local_pwr_cost <= busiest_pwr_cost)
+               return 0;
+
+       if (local_capacity > busiest_capacity &&
+                       sds->busiest_stat.sum_nr_big_tasks)
+               return 0;
+
+       nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+       if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+               (sds->busiest_stat.sum_nr_running <
+                       nr_cpus * sysctl_sched_spill_nr_run))
+               return 1;
+
+       return 0;
+}
+
+#else  /* CONFIG_SCHED_HMP */
+
+static inline int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+       return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
  /**
   * get_sd_load_idx - Obtain the load index for a given sched domain.
   * @sd: The sched_domain whose load_idx is to be obtained.
@@ -8081,6 +9354,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                         struct sched_group_capacity *sgc;
                         struct rq *rq = cpu_rq(cpu);
  
+                       if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+                               continue;
                         /*
                          * build_sched_domains() -> init_sched_groups_capacity()
                          * gets here before we've attached the domains to the
@@ -8112,9 +9387,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                 do {
                         struct sched_group_capacity *sgc = group->sgc;
  
-                       capacity += sgc->capacity;
-                       max_capacity = max(sgc->max_capacity, max_capacity);
-                       min_capacity = min(sgc->min_capacity, min_capacity);
+                       cpumask_t *cpus = sched_group_cpus(group);
+
+                       /* Revisit this later. This won't work for MT domain */
+                       if (!cpu_isolated(cpumask_first(cpus))) {
+                               capacity += sgc->capacity;
+                               max_capacity = max(sgc->max_capacity, max_capacity);
+                               min_capacity = min(sgc->min_capacity, min_capacity);
+                       }
                         group = group->next;
                 } while (group != child->groups);
         }
@@ -8230,7 +9510,7 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
  
  static inline enum
  group_type group_classify(struct sched_group *group,
-                         struct sg_lb_stats *sgs)
+                         struct sg_lb_stats *sgs, struct lb_env *env)
  {
         if (sgs->group_no_capacity)
                 return group_overloaded;
@@ -8299,6 +9579,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                 struct rq *rq = cpu_rq(i);
  
+               trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
+                                    sched_irqload(i),
+                                    power_cost(i, 0),
+                                    cpu_temp(i));
+
+               if (cpu_isolated(i))
+                       continue;
+
                 /* if we are entering idle and there are CPUs with
                  * their tick stopped, do an update for them
                  */
@@ -8319,6 +9607,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 if (nr_running > 1)
                         *overload = true;
  
+#ifdef CONFIG_SCHED_HMP
+               sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
+               sgs->group_cpu_load += cpu_load(i);
+#endif
+
  #ifdef CONFIG_NUMA_BALANCING
                 sgs->nr_numa_running += rq->nr_numa_running;
                 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -8330,25 +9623,62 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 if (!nr_running && idle_cpu(i))
                         sgs->idle_cpus++;
  
-               if (cpu_overutilized(i)) {
+               if (energy_aware() && cpu_overutilized(i)) {
                         *overutilized = true;
                         if (!sgs->group_misfit_task && rq->misfit_task)
                                 sgs->group_misfit_task = capacity_of(i);
                 }
         }
  
-       /* Adjust by relative CPU capacity of the group */
-       sgs->group_capacity = group->sgc->capacity;
-       sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+       /* Isolated CPU has no weight */
+       if (!group->group_weight) {
+               sgs->group_capacity = 0;
+               sgs->avg_load = 0;
+               sgs->group_no_capacity = 1;
+               sgs->group_type = group_other;
+               sgs->group_weight = group->group_weight;
+       } else {
+               /* Adjust by relative CPU capacity of the group */
+               sgs->group_capacity = group->sgc->capacity;
+               sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+                                                       sgs->group_capacity;
+
+               sgs->group_weight = group->group_weight;
+
+               sgs->group_no_capacity = group_is_overloaded(env, sgs);
+               sgs->group_type = group_classify(group, sgs, env);
+       }
  
         if (sgs->sum_nr_running)
                 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+}
  
-       sgs->group_weight = group->group_weight;
+#ifdef CONFIG_SCHED_HMP
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+                                                 struct sd_lb_stats *sds,
+                                                 struct sched_group *sg,
+                                                 struct sg_lb_stats *sgs)
+{
+       if (env->idle != CPU_NOT_IDLE &&
+           cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
+               if (sgs->sum_nr_big_tasks >
+                               sds->busiest_stat.sum_nr_big_tasks) {
+                       env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
+                       return true;
+               }
+       }
  
-       sgs->group_no_capacity = group_is_overloaded(env, sgs);
-       sgs->group_type = group_classify(group, sgs);
+       return false;
+}
+#else
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+                                                 struct sd_lb_stats *sds,
+                                                 struct sched_group *sg,
+                                                 struct sg_lb_stats *sgs)
+{
+       return false;
  }
+#endif
  
  /**
   * update_sd_pick_busiest - return 1 on busiest group
@@ -8370,35 +9700,40 @@ static bool update_sd_pick_busiest(struct lb_env *env,
  {
         struct sg_lb_stats *busiest = &sds->busiest_stat;
  
+       if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
+               return true;
+
         if (sgs->group_type > busiest->group_type)
                 return true;
  
         if (sgs->group_type < busiest->group_type)
                 return false;
  
-       /*
-        * Candidate sg doesn't face any serious load-balance problems
-        * so don't pick it if the local sg is already filled up.
-        */
-       if (sgs->group_type == group_other &&
-           !group_has_capacity(env, &sds->local_stat))
-               return false;
+       if (energy_aware()) {
+               /*
+                * Candidate sg doesn't face any serious load-balance problems
+                * so don't pick it if the local sg is already filled up.
+                */
+               if (sgs->group_type == group_other &&
+                   !group_has_capacity(env, &sds->local_stat))
+                       return false;
  
-       if (sgs->avg_load <= busiest->avg_load)
-               return false;
+               if (sgs->avg_load <= busiest->avg_load)
+                       return false;
  
-       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
-               goto asym_packing;
+               if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+                       goto asym_packing;
  
-       /*
-        * Candidate sg has no more than one task per CPU and
-        * has higher per-CPU capacity. Migrating tasks to less
-        * capable CPUs may harm throughput. Maximize throughput,
-        * power/energy consequences are not considered.
-        */
-       if (sgs->sum_nr_running <= sgs->group_weight &&
-           group_smaller_cpu_capacity(sds->local, sg))
-               return false;
+               /*
+                * Candidate sg has no more than one task per CPU and
+                * has higher per-CPU capacity. Migrating tasks to less
+                * capable CPUs may harm throughput. Maximize throughput,
+                * power/energy consequences are not considered.
+                */
+               if (sgs->sum_nr_running <= sgs->group_weight &&
+                   group_smaller_cpu_capacity(sds->local, sg))
+                       return false;
+       }
  
  asym_packing:
         /* This is the busiest node in its class. */
@@ -8506,14 +9841,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                     group_has_capacity(env, &sds->local_stat) &&
                     (sgs->sum_nr_running > 1)) {
                         sgs->group_no_capacity = 1;
-                       sgs->group_type = group_classify(sg, sgs);
+                       sgs->group_type = group_classify(sg, sgs, env);
                 }
  
                 /*
                  * Ignore task groups with misfit tasks if local group has no
                  * capacity or if per-cpu capacity isn't higher.
                  */
-               if (sgs->group_type == group_misfit_task &&
+               if (energy_aware() &&
+                   sgs->group_type == group_misfit_task &&
                     (!group_has_capacity(env, &sds->local_stat) ||
                      !group_smaller_cpu_capacity(sg, sds->local)))
                         sgs->group_type = group_other;
@@ -8521,6 +9857,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                         sds->busiest = sg;
                         sds->busiest_stat = *sgs;
+                       env->busiest_nr_running = sgs->sum_nr_running;
+                       env->busiest_grp_capacity = sgs->group_capacity;
                 }
  
  next_group:
@@ -8542,12 +9880,12 @@ next_group:
                         env->dst_rq->rd->overload = overload;
  
                 /* Update over-utilization (tipping point, U >= 0) indicator */
-               if (env->dst_rq->rd->overutilized != overutilized) {
+               if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
                         env->dst_rq->rd->overutilized = overutilized;
                         trace_sched_overutilized(overutilized);
                 }
         } else {
-               if (!env->dst_rq->rd->overutilized && overutilized) {
+               if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
                         env->dst_rq->rd->overutilized = true;
                         trace_sched_overutilized(true);
                 }
@@ -8699,20 +10037,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
          */
         if (busiest->avg_load <= sds->avg_load ||
             local->avg_load >= sds->avg_load) {
-               /* Misfitting tasks should be migrated in any case */
-               if (busiest->group_type == group_misfit_task) {
-                       env->imbalance = busiest->group_misfit_task;
-                       return;
-               }
+               if (energy_aware()) {
+                       /* Misfitting tasks should be migrated in any case */
+                       if (busiest->group_type == group_misfit_task) {
+                               env->imbalance = busiest->group_misfit_task;
+                               return;
+                       }
  
-               /*
-                * Busiest group is overloaded, local is not, use the spare
-                * cycles to maximize throughput
-                */
-               if (busiest->group_type == group_overloaded &&
-                   local->group_type <= group_misfit_task) {
-                       env->imbalance = busiest->load_per_task;
-                       return;
+                       /*
+                        * Busiest group is overloaded, local is not, use the spare
+                        * cycles to maximize throughput
+                        */
+                       if (busiest->group_type == group_overloaded &&
+                           local->group_type <= group_misfit_task) {
+                               env->imbalance = busiest->load_per_task;
+                               return;
+                       }
                 }
  
                 env->imbalance = 0;
@@ -8749,7 +10089,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         ) / SCHED_CAPACITY_SCALE;
  
         /* Boost imbalance to allow misfit task to be balanced. */
-       if (busiest->group_type == group_misfit_task)
+       if (energy_aware() && busiest->group_type == group_misfit_task)
                 env->imbalance = max_t(long, env->imbalance,
                                      busiest->group_misfit_task);
  
@@ -8810,6 +10150,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
         if (!sds.busiest || busiest->sum_nr_running == 0)
                 goto out_balanced;
  
+       if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+               goto force_balance;
+
+       if (bail_inter_cluster_balance(env, &sds))
+               goto out_balanced;
+
         sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
                                                 / sds.total_capacity;
  
@@ -8830,7 +10176,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                 goto force_balance;
  
         /* Misfitting tasks should be dealt with regardless of the avg load */
-       if (busiest->group_type == group_misfit_task) {
+       if (energy_aware() && busiest->group_type == group_misfit_task) {
                 goto force_balance;
         }
  
@@ -8881,6 +10227,60 @@ out_balanced:
         return NULL;
  }
  
+#ifdef CONFIG_SCHED_HMP
+static struct rq *find_busiest_queue_hmp(struct lb_env *env,
+                                    struct sched_group *group)
+{
+       struct rq *busiest = NULL, *busiest_big = NULL;
+       u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
+       int max_nr_big = 0, nr_big;
+       bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
+       int i;
+       cpumask_t cpus;
+
+       cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
+
+       for_each_cpu(i, &cpus) {
+               struct rq *rq = cpu_rq(i);
+               u64 cumulative_runnable_avg =
+                               rq->hmp_stats.cumulative_runnable_avg;
+
+               if (!cpumask_test_cpu(i, env->cpus))
+                       continue;
+
+
+               if (find_big) {
+                       nr_big = nr_big_tasks(rq);
+                       if (nr_big > max_nr_big ||
+                           (nr_big > 0 && nr_big == max_nr_big &&
+                            cumulative_runnable_avg > max_runnable_avg_big)) {
+                               max_runnable_avg_big = cumulative_runnable_avg;
+                               busiest_big = rq;
+                               max_nr_big = nr_big;
+                               continue;
+                       }
+               }
+
+               if (cumulative_runnable_avg > max_runnable_avg) {
+                       max_runnable_avg = cumulative_runnable_avg;
+                       busiest = rq;
+               }
+       }
+
+       if (busiest_big)
+               return busiest_big;
+
+       env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
+       return busiest;
+}
+#else
+static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
+                                    struct sched_group *group)
+{
+       return NULL;
+}
+#endif
+
  /*
   * find_busiest_queue - find the busiest runqueue among the cpus in group.
   */
@@ -8891,6 +10291,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
         unsigned long busiest_load = 0, busiest_capacity = 1;
         int i;
  
+#ifdef CONFIG_SCHED_HMP
+       return find_busiest_queue_hmp(env, group);
+#endif
+
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                 unsigned long capacity, wl;
                 enum fbq_type rt;
@@ -8959,15 +10363,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
   * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
   * so long as it is large enough.
   */
-#define MAX_PINNED_INTERVAL    512
+#define MAX_PINNED_INTERVAL    16
  
  /* Working cpumask for load_balance and load_balance_newidle. */
  DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
  
+#define NEED_ACTIVE_BALANCE_THRESHOLD 10
+
  static int need_active_balance(struct lb_env *env)
  {
         struct sched_domain *sd = env->sd;
  
+       if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+               return 1;
+
         if (env->idle == CPU_NEWLY_IDLE) {
  
                 /*
@@ -8992,7 +10401,8 @@ static int need_active_balance(struct lb_env *env)
                         return 1;
         }
  
-       if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+       if (energy_aware() &&
+           (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
             ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
                                 env->src_rq->cfs.h_nr_running == 1 &&
                                 cpu_overutilized(env->src_cpu) &&
@@ -9000,10 +10410,18 @@ static int need_active_balance(struct lb_env *env)
                         return 1;
         }
  
-       return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+       return unlikely(sd->nr_balance_failed >
+                       sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
  }
  
-static int active_load_balance_cpu_stop(void *data);
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{
+       cpumask_t cpus;
+
+       cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+       cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+       return cpumask_first(&cpus);
+}
  
  static int should_we_balance(struct lb_env *env)
  {
@@ -9022,7 +10440,8 @@ static int should_we_balance(struct lb_env *env)
         sg_mask = sched_group_mask(sg);
         /* Try to find first idle cpu */
         for_each_cpu_and(cpu, sg_cpus, env->cpus) {
-               if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+               if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+                   cpu_isolated(cpu))
                         continue;
  
                 balance_cpu = cpu;
@@ -9030,7 +10449,7 @@ static int should_we_balance(struct lb_env *env)
         }
  
         if (balance_cpu == -1)
-               balance_cpu = group_balance_cpu(sg);
+               balance_cpu = group_balance_cpu_not_isolated(sg);
  
         /*
          * First idle cpu or the first cpu(busiest) in this sched group
@@ -9047,23 +10466,29 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *continue_balancing)
  {
-       int ld_moved, cur_ld_moved, active_balance = 0;
+       int ld_moved = 0, cur_ld_moved, active_balance = 0;
         struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
-       struct sched_group *group;
-       struct rq *busiest;
+       struct sched_group *group = NULL;
+       struct rq *busiest = NULL;
         unsigned long flags;
         struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
  
         struct lb_env env = {
-               .sd             = sd,
-               .dst_cpu        = this_cpu,
-               .dst_rq         = this_rq,
-               .dst_grpmask    = sched_group_cpus(sd->groups),
-               .idle           = idle,
-               .loop_break     = sched_nr_migrate_break,
-               .cpus           = cpus,
-               .fbq_type       = all,
-               .tasks          = LIST_HEAD_INIT(env.tasks),
+               .sd                     = sd,
+               .dst_cpu                = this_cpu,
+               .dst_rq                 = this_rq,
+               .dst_grpmask            = sched_group_cpus(sd->groups),
+               .idle                   = idle,
+               .loop_break             = sched_nr_migrate_break,
+               .cpus                   = cpus,
+               .fbq_type               = all,
+               .tasks                  = LIST_HEAD_INIT(env.tasks),
+               .imbalance              = 0,
+               .flags                  = 0,
+               .loop                   = 0,
+               .busiest_nr_running     = 0,
+               .busiest_grp_capacity   = 0,
+               .boost_policy           = sched_boost_policy(),
         };
  
         /*
@@ -9117,6 +10542,13 @@ more_balance:
                 raw_spin_lock_irqsave(&busiest->lock, flags);
                 update_rq_clock(busiest);
  
+               /* The world might have changed. Validate assumptions */
+               if (busiest->nr_running <= 1) {
+                       raw_spin_unlock_irqrestore(&busiest->lock, flags);
+                       env.flags &= ~LBF_ALL_PINNED;
+                       goto no_move;
+               }
+
                 /*
                  * cur_ld_moved - load moved in current iteration
                  * ld_moved     - cumulative load moved across iterations
@@ -9204,17 +10636,22 @@ more_balance:
                 }
         }
  
+no_move:
         if (!ld_moved) {
-               schedstat_inc(sd, lb_failed[idle]);
+               if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
+                       schedstat_inc(sd, lb_failed[idle]);
+
                 /*
                  * Increment the failure counter only on periodic balance.
                  * We do not want newidle balance, which can be very
                  * frequent, pollute the failure counter causing
                  * excessive cache_hot migrations and active balances.
                  */
-               if (idle != CPU_NEWLY_IDLE)
-                       if (env.src_grp_nr_running > 1)
+               if (idle != CPU_NEWLY_IDLE &&
+                   !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
+                       if (env.src_grp_nr_running > 1)
                                 sd->nr_balance_failed++;
+               }
  
                 if (need_active_balance(&env)) {
                         raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -9236,7 +10673,8 @@ more_balance:
                          * ->active_balance_work.  Once set, it's cleared
                          * only after active load balance is finished.
                          */
-                       if (!busiest->active_balance) {
+                       if (!busiest->active_balance &&
+                           !cpu_isolated(cpu_of(busiest))) {
                                 busiest->active_balance = 1;
                                 busiest->push_cpu = this_cpu;
                                 active_balance = 1;
@@ -9247,17 +10685,31 @@ more_balance:
                                 stop_one_cpu_nowait(cpu_of(busiest),
                                         active_load_balance_cpu_stop, busiest,
                                         &busiest->active_balance_work);
+                               *continue_balancing = 0;
                         }
  
                         /*
                          * We've kicked active balancing, reset the failure
                          * counter.
                          */
-                       sd->nr_balance_failed = sd->cache_nice_tries+1;
+                       sd->nr_balance_failed =
+                           sd->cache_nice_tries +
+                           NEED_ACTIVE_BALANCE_THRESHOLD - 1;
                 }
-       } else
+       } else {
                 sd->nr_balance_failed = 0;
  
+               /* Assumes one 'busiest' cpu that we pulled tasks from */
+               if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+                       int check_groups = !!(env.flags &
+                                        LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+                       check_for_freq_change(this_rq, false, check_groups);
+                       check_for_freq_change(busiest, false, check_groups);
+               } else {
+                       check_for_freq_change(this_rq, true, false);
+               }
+       }
         if (likely(!active_balance)) {
                 /* We were unbalanced, so reset the balancing interval */
                 sd->balance_interval = sd->min_interval;
@@ -9277,9 +10729,10 @@ more_balance:
  out_balanced:
         /*
          * We reach balance although we may have faced some affinity
-        * constraints. Clear the imbalance flag if it was set.
+        * constraints. Clear the imbalance flag only if other tasks got
+        * a chance to move and fix the imbalance.
          */
-       if (sd_parent) {
+       if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
  
                 if (*group_imbalance)
@@ -9305,6 +10758,11 @@ out_one_pinned:
  
         ld_moved = 0;
  out:
+       trace_sched_load_balance(this_cpu, idle, *continue_balancing,
+                                group ? group->cpumask[0] : 0,
+                                busiest ? busiest->nr_running : 0,
+                                env.imbalance, env.flags, ld_moved,
+                                sd->balance_interval);
         return ld_moved;
  }
  
@@ -9347,6 +10805,9 @@ static int idle_balance(struct rq *this_rq)
         int pulled_task = 0;
         u64 curr_cost = 0;
  
+       if (cpu_isolated(this_cpu))
+               return 0;
+
         idle_enter_fair(this_rq);
  
         /*
@@ -9401,9 +10862,12 @@ static int idle_balance(struct rq *this_rq)
  
                 /*
                  * Stop searching for tasks to pull if there are
-                * now runnable tasks on this rq.
+                * now runnable tasks on the balance rq or if
+                * continue_balancing has been unset (only possible
+                * due to active migration).
                  */
-               if (pulled_task || this_rq->nr_running > 0)
+               if (pulled_task || this_rq->nr_running > 0 ||
+                                               !continue_balancing)
                         break;
         }
         rcu_read_unlock();
@@ -9455,13 +10919,19 @@ static int active_load_balance_cpu_stop(void *data)
         struct task_struct *push_task = NULL;
         int push_task_detached = 0;
         struct lb_env env = {
-               .sd             = sd,
-               .dst_cpu        = target_cpu,
-               .dst_rq         = target_rq,
-               .src_cpu        = busiest_rq->cpu,
-               .src_rq         = busiest_rq,
-               .idle           = CPU_IDLE,
+               .sd                     = sd,
+               .dst_cpu                = target_cpu,
+               .dst_rq                 = target_rq,
+               .src_cpu                = busiest_rq->cpu,
+               .src_rq                 = busiest_rq,
+               .idle                   = CPU_IDLE,
+               .busiest_nr_running     = 0,
+               .busiest_grp_capacity   = 0,
+               .flags                  = 0,
+               .loop                   = 0,
+               .boost_policy           = sched_boost_policy(),
         };
+       bool moved = false;
  
         raw_spin_lock_irq(&busiest_rq->lock);
  
@@ -9482,12 +10952,15 @@ static int active_load_balance_cpu_stop(void *data)
         BUG_ON(busiest_rq == target_rq);
  
         push_task = busiest_rq->push_task;
+       target_cpu = busiest_rq->push_cpu;
         if (push_task) {
                 if (task_on_rq_queued(push_task) &&
+                       push_task->state == TASK_RUNNING &&
                         task_cpu(push_task) == busiest_cpu &&
                                         cpu_online(target_cpu)) {
                         detach_task(push_task, &env);
                         push_task_detached = 1;
+                       moved = true;
                 }
                 goto out_unlock;
         }
@@ -9506,14 +10979,18 @@ static int active_load_balance_cpu_stop(void *data)
                 update_rq_clock(busiest_rq);
  
                 p = detach_one_task(&env);
-               if (p)
+               if (p) {
                         schedstat_inc(sd, alb_pushed);
-               else
+                       moved = true;
+               } else {
                         schedstat_inc(sd, alb_failed);
+               }
         }
         rcu_read_unlock();
  out_unlock:
         busiest_rq->active_balance = 0;
+       push_task = busiest_rq->push_task;
+       target_cpu = busiest_rq->push_cpu;
  
         if (push_task)
                 busiest_rq->push_task = NULL;
@@ -9524,6 +11001,7 @@ out_unlock:
                 if (push_task_detached)
                         attach_one_task(target_rq, push_task);
                 put_task_struct(push_task);
+               clear_reserved(target_cpu);
         }
  
         if (p)
@@ -9531,6 +11009,15 @@ out_unlock:
  
         local_irq_enable();
  
+       if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
+               int check_groups = !!(env.flags &
+                                        LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+               check_for_freq_change(busiest_rq, false, check_groups);
+               check_for_freq_change(target_rq, false, check_groups);
+       } else if (moved) {
+               check_for_freq_change(target_rq, true, false);
+       }
+
         return 0;
  }
  
@@ -9546,9 +11033,49 @@ static inline int on_null_domain(struct rq *rq)
   *   needed, they will kick the idle load balancer, which then does idle
   *   load balancing for all the idle CPUs.
   */
-static inline int find_new_ilb(void)
+
+#ifdef CONFIG_SCHED_HMP
+static inline int find_new_hmp_ilb(int type)
+{      /* Returns an idle CPU from the nohz idle mask, or nr_cpu_ids if none */
+       int call_cpu = raw_smp_processor_id();  /* CPU requesting the kick */
+       struct sched_domain *sd;
+       int ilb;        /* candidate CPU being examined */
+
+       rcu_read_lock();        /* held across the sched-domain walk */
+
+       /* Pick an idle cpu "closest" to call_cpu; the first match wins */
+       for_each_domain(call_cpu, sd) {
+               for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+                                               sched_domain_span(sd)) {
+                       if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+                                       cpu_max_power_cost(ilb) <= /* RESTRICT only */
+                                       cpu_max_power_cost(call_cpu))) {
+                               rcu_read_unlock();
+                               reset_balance_interval(ilb);
+                               return ilb;
+                       }
+               }
+       }
+
+       rcu_read_unlock();
+       return nr_cpu_ids;      /* no idle CPU satisfied the kick type */
+}
+#else  /* CONFIG_SCHED_HMP */
+static inline int find_new_hmp_ilb(int type)   /* !CONFIG_SCHED_HMP stub */
+{      /* type is ignored; find_new_ilb() calls this only under CONFIG_SCHED_HMP */
+       return nr_cpu_ids;      /* "no ILB", as the HMP variant and callers expect */
+}
+#endif /* CONFIG_SCHED_HMP */
+
+static inline int find_new_ilb(int type)
  {
-       int ilb = cpumask_first(nohz.idle_cpus_mask);
+       int ilb;
+
+#ifdef CONFIG_SCHED_HMP
+       return find_new_hmp_ilb(type);
+#endif
+
+       ilb = cpumask_first(nohz.idle_cpus_mask);
  
         if (ilb < nr_cpu_ids && idle_cpu(ilb))
                 return ilb;
@@ -9561,13 +11088,13 @@ static inline int find_new_ilb(void)
   * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
   * CPU (if there is one).
   */
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
  {
         int ilb_cpu;
  
         nohz.next_balance++;
  
-       ilb_cpu = find_new_ilb();
+       ilb_cpu = find_new_ilb(type);
  
         if (ilb_cpu >= nr_cpu_ids)
                 return;
@@ -9584,16 +11111,21 @@ static void nohz_balancer_kick(void)
         return;
  }
  
+void nohz_balance_clear_nohz_mask(int cpu)
+{      /* Drop cpu from nohz.idle_cpus_mask; does nothing if it is not set */
+       if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+               cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+               atomic_dec(&nohz.nr_cpus);      /* only when a bit was cleared */
+       }
+}
+
  static inline void nohz_balance_exit_idle(int cpu)
  {
         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
                 /*
                  * Completely isolated CPUs don't ever set, so we must test.
                  */
-               if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
-                       cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-                       atomic_dec(&nohz.nr_cpus);
-               }
+               nohz_balance_clear_nohz_mask(cpu);      /* tests the mask itself */
                 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
         }
  }
@@ -9650,7 +11182,7 @@ void nohz_balance_enter_idle(int cpu)
         /*
          * If we're a completely isolated CPU, we don't play.
          */
-       if (on_null_domain(cpu_rq(cpu)))
+       if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
                 return;
  
         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@ -9679,7 +11211,13 @@ static DEFINE_SPINLOCK(balancing);
   */
  void update_max_interval(void)
  {
-       max_load_balance_interval = HZ*num_online_cpus()/10;
+       cpumask_t avail_mask;   /* online CPUs minus isolated ones */
+       unsigned int available_cpus;
+
+       cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+       available_cpus = cpumask_weight(&avail_mask);
+       /* Cap scales with usable CPUs only: HZ/10 jiffies per such CPU */
+       max_load_balance_interval = HZ*available_cpus/10;
  }
  
  /*
@@ -9804,12 +11342,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
         /* Earliest time when we have to do rebalance again */
         unsigned long next_balance = jiffies + 60*HZ;
         int update_next_balance = 0;
+       cpumask_t cpus;
  
         if (idle != CPU_IDLE ||
             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
                 goto end;
  
-       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+       cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+       for_each_cpu(balance_cpu, &cpus) {
                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
                         continue;
  
@@ -9852,6 +11393,79 @@ end:
         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
  }
  
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{      /* Returns 1 to request an ILB kick, 0 otherwise; may narrow *type */
+       struct sched_domain *sd;
+       int i;
+
+       if (rq->nr_running < 2)         /* fewer than two runnable: no kick */
+               return 0;
+
+       if (!sysctl_sched_restrict_cluster_spill ||
+                       sched_boost_policy() == SCHED_BOOST_ON_ALL)
+               return 1;       /* kick with *type as passed in */
+
+       if (cpu_max_power_cost(cpu) == max_power_cost)
+               return 1;       /* likewise: *type is not changed */
+
+       rcu_read_lock();
+       sd = rcu_dereference_check_sched_domain(rq->sd);
+       if (!sd) {
+               rcu_read_unlock();
+               return 0;       /* rq has no sched domain */
+       }
+
+       for_each_cpu(i, sched_domain_span(sd)) {
+               if (cpu_load(i) < sched_spill_load &&
+                               cpu_rq(i)->nr_running <
+                               sysctl_sched_spill_nr_run) {
+                       /* A CPU in sd is under both spill limits: restrict the
+                        * kick to CPUs whose max power cost is <= the caller's.
+                        */
+                       *type = NOHZ_KICK_RESTRICT;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+       return 1;       /* kick; *type is RESTRICT only if the loop matched */
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{      /* !CONFIG_SCHED_HMP stub; _nohz_kick_needed() calls it only under HMP */
+       return 0;       /* no kick requested; *type is left untouched */
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{      /* Returns non-zero when an ILB kick is wanted; *type is HMP-only */
+       unsigned long now = jiffies;
+
+       /*
+        * No CPUs are in tickless mode, hence no need for NOHZ idle load
+        * balancing.
+        */
+       if (likely(!atomic_read(&nohz.nr_cpus)))
+               return 0;
+
+#ifdef CONFIG_SCHED_HMP
+       return _nohz_kick_needed_hmp(rq, cpu, type);
+#endif
+       /* Everything below is reached only when CONFIG_SCHED_HMP is not set */
+       if (time_before(now, nohz.next_balance))
+               return 0;       /* not yet time for the next nohz balance */
+
+       if (rq->nr_running >= 2 &&
+           (!energy_aware() || cpu_overutilized(cpu)))
+               return true;
+
+       /* Do idle load balance if there is a misfit task */
+       if (energy_aware())
+               return rq->misfit_task;
+
+       return (rq->nr_running >= 2);
+}
+
  /*
   * Current heuristic for kicking the idle load balancer in the presence
   * of an idle cpu in the system.
@@ -9863,12 +11477,14 @@ end:
   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
   *     domain span are idle.
   */
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
  {
-       unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
         struct sched_domain *sd;
         struct sched_group_capacity *sgc;
-       int nr_busy, cpu = rq->cpu;
+       int nr_busy;
+#endif
+       int cpu = rq->cpu;
         bool kick = false;
  
         if (unlikely(rq->idle_balance))
@@ -9881,24 +11497,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
         set_cpu_sd_state_busy();
         nohz_balance_exit_idle(cpu);
  
-       /*
-        * None are in tickless mode and hence no need for NOHZ idle load
-        * balancing.
-        */
-       if (likely(!atomic_read(&nohz.nr_cpus)))
-               return false;
-
-       if (time_before(now, nohz.next_balance))
-               return false;
-
-       if (rq->nr_running >= 2 &&
-           (!energy_aware() || cpu_overutilized(cpu)))
+       if (_nohz_kick_needed(rq, cpu, type))
                 return true;
  
-       /* Do idle load balance if there have misfit task */
-       if (energy_aware())
-               return rq->misfit_task;
-
+#ifndef CONFIG_SCHED_HMP
         rcu_read_lock();
         sd = rcu_dereference(per_cpu(sd_busy, cpu));
         if (sd) {
@@ -9930,6 +11532,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
  
  unlock:
         rcu_read_unlock();
+#endif
         return kick;
  }
  #else
@@ -9963,15 +11566,19 @@ static void run_rebalance_domains(struct softirq_action *h)
   */
  void trigger_load_balance(struct rq *rq)
  {
-       /* Don't need to rebalance while attached to NULL domain */
-       if (unlikely(on_null_domain(rq)))
+       int type = NOHZ_KICK_ANY;       /* nohz_kick_needed() may change it */
+
+       /* No need to rebalance while attached to the NULL domain or
+        * while this cpu is isolated.
+        */
+       if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
                 return;
  
         if (time_after_eq(jiffies, rq->next_balance))
                 raise_softirq(SCHED_SOFTIRQ);
  #ifdef CONFIG_NO_HZ_COMMON
-       if (nohz_kick_needed(rq))
-               nohz_balancer_kick();
+       if (nohz_kick_needed(rq, &type))
+               nohz_balancer_kick(type);       /* type selects eligible ILB CPUs */
  #endif
  }
  
@@ -9990,47 +11597,6 @@ static void rq_offline_fair(struct rq *rq)
         unthrottle_offline_cfs_rqs(rq);
  }
  
-static inline int
-kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
-{
-       int rc = 0;
-
-       /* Invoke active balance to force migrate currently running task */
-       raw_spin_lock(&rq->lock);
-       if (!rq->active_balance) {
-               rq->active_balance = 1;
-               rq->push_cpu = new_cpu;
-               get_task_struct(p);
-               rq->push_task = p;
-               rc = 1;
-       }
-       raw_spin_unlock(&rq->lock);
-
-       return rc;
-}
-
-void check_for_migration(struct rq *rq, struct task_struct *p)
-{
-       int new_cpu;
-       int active_balance;
-       int cpu = task_cpu(p);
-
-       if (energy_aware() && rq->misfit_task) {
-               if (rq->curr->state != TASK_RUNNING ||
-                   rq->curr->nr_cpus_allowed == 1)
-                       return;
-
-               new_cpu = select_energy_cpu_brute(p, cpu, 0);
-               if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
-                       active_balance = kick_active_balance(rq, p, new_cpu);
-                       if (active_balance)
-                               stop_one_cpu_nowait(cpu,
-                                               active_load_balance_cpu_stop,
-                                               rq, &rq->active_balance_work);
-               }
-       }
-}
-
  #endif /* CONFIG_SMP */
  
  /*
@@ -10050,7 +11616,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                 task_tick_numa(rq, curr);
  
  #ifdef CONFIG_SMP
-       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+       if (energy_aware() &&
+           !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
                 rq->rd->overutilized = true;
                 trace_sched_overutilized(true);
         }
@@ -10550,6 +12117,11 @@ const struct sched_class fair_sched_class = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
         .task_change_group      = task_change_group_fair,
  #endif
+#ifdef CONFIG_SCHED_HMP
+       .inc_hmp_sched_stats    = inc_hmp_sched_stats_fair,
+       .dec_hmp_sched_stats    = dec_hmp_sched_stats_fair,
+       .fixup_hmp_sched_stats  = fixup_hmp_sched_stats_fair,
+#endif
  };
  
  #ifdef CONFIG_SCHED_DEBUG