Merge 4.4.163 into android-4.4

[sagit-ice-cold/kernel_xiaomi_msm8998.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index a99c329..d6a90c8 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -32,7 +32,7 @@
  #include <linux/init.h>
  #include <linux/uaccess.h>
  #include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
  #include <linux/interrupt.h>
  #include <linux/capability.h>
  #include <linux/completion.h>
@@ -74,6 +74,7 @@
  #include <linux/binfmts.h>
  #include <linux/context_tracking.h>
  #include <linux/compiler.h>
+#include <linux/cpufreq_times.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@ -89,6 +90,7 @@
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
+#include "walt.h"
  
  DEFINE_MUTEX(sched_domains_mutex);
  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -287,6 +289,18 @@ int sysctl_sched_rt_runtime = 950000;
  /* cpus with isolated domains */
  cpumask_var_t cpu_isolated_map;
  
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+       return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+       task_rq_unlock(rq, p, flags);
+}
+
  /*
   * this_rq_lock - lock this runqueue and disable interrupts.
   */
@@ -533,6 +547,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
         if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
                 return;
  
+       head->count++;
+
         get_task_struct(task);
  
         /*
@@ -542,6 +558,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
         head->lastp = &node->next;
  }
  
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+              int sibling_count_hint);
+
  void wake_up_q(struct wake_q_head *head)
  {
         struct wake_q_node *node = head->first;
@@ -556,10 +576,10 @@ void wake_up_q(struct wake_q_head *head)
                 task->wake_q.next = NULL;
  
                 /*
-                * wake_up_process() implies a wmb() to pair with the queueing
+                * try_to_wake_up() implies a wmb() to pair with the queueing
                  * in wake_q_add() so as not to miss wakeups.
                  */
-               wake_up_process(task);
+               try_to_wake_up(task, TASK_NORMAL, 0, head->count);
                 put_task_struct(task);
         }
  }
@@ -601,7 +621,8 @@ void resched_cpu(int cpu)
         unsigned long flags;
  
         raw_spin_lock_irqsave(&rq->lock, flags);
-       resched_curr(rq);
+       if (cpu_online(cpu) || cpu == smp_processor_id())
+               resched_curr(rq);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -1075,7 +1096,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
  
         dequeue_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_MIGRATING;
+       double_lock_balance(rq, cpu_rq(new_cpu));
         set_task_cpu(p, new_cpu);
+       double_unlock_balance(rq, cpu_rq(new_cpu));
         raw_spin_unlock(&rq->lock);
  
         rq = cpu_rq(new_cpu);
@@ -1299,6 +1322,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                         p->sched_class->migrate_task_rq(p);
                 p->se.nr_migrations++;
                 perf_event_task_migrate(p);
+
+               walt_fixup_busy_time(p, new_cpu);
         }
  
         __set_task_cpu(p, new_cpu);
@@ -1313,7 +1338,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                 dst_rq = cpu_rq(cpu);
  
                 deactivate_task(src_rq, p, 0);
+               p->on_rq = TASK_ON_RQ_MIGRATING;
                 set_task_cpu(p, cpu);
+               p->on_rq = TASK_ON_RQ_QUEUED;
                 activate_task(dst_rq, p, 0);
                 check_preempt_curr(dst_rq, p, 0);
         } else {
@@ -1622,12 +1649,14 @@ out:
   * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
   */
  static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
+                  int sibling_count_hint)
  {
         lockdep_assert_held(&p->pi_lock);
  
         if (p->nr_cpus_allowed > 1)
-               cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+               cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
+                                                    sibling_count_hint);
  
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@ -1912,6 +1941,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
   * @p: the thread to be awakened
   * @state: the mask of task states that can be woken
   * @wake_flags: wake modifier flags (WF_*)
+ * @sibling_count_hint: A hint at the number of threads that are being woken up
+ *                      in this event.
   *
   * Put it on the run-queue if it's not already there. The "current"
   * thread is always on the run-queue (except when the actual
@@ -1923,10 +1954,15 @@ static void ttwu_queue(struct task_struct *p, int cpu)
   * or @state didn't match @p's state.
   */
  static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+              int sibling_count_hint)
  {
         unsigned long flags;
         int cpu, success = 0;
+#ifdef CONFIG_SMP
+       struct rq *rq;
+       u64 wallclock;
+#endif
  
         /*
          * If we are going to wake up a thread waiting for CONDITION we
@@ -2006,17 +2042,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          */
         smp_rmb();
  
+       rq = cpu_rq(task_cpu(p));
+
+       raw_spin_lock(&rq->lock);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+       raw_spin_unlock(&rq->lock);
+
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
  
         if (p->sched_class->task_waking)
                 p->sched_class->task_waking(p);
  
-       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
+                            sibling_count_hint);
         if (task_cpu(p) != cpu) {
                 wake_flags |= WF_MIGRATED;
                 set_task_cpu(p, cpu);
         }
+
  #endif /* CONFIG_SMP */
  
         ttwu_queue(p, cpu);
@@ -2065,8 +2111,13 @@ static void try_to_wake_up_local(struct task_struct *p)
  
         trace_sched_waking(p);
  
-       if (!task_on_rq_queued(p))
+       if (!task_on_rq_queued(p)) {
+               u64 wallclock = walt_ktime_clock();
+
+               walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+               walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+       }
  
         ttwu_do_wakeup(rq, p, 0);
         ttwu_stat(p, smp_processor_id(), 0);
@@ -2088,13 +2139,13 @@ out:
   */
  int wake_up_process(struct task_struct *p)
  {
-       return try_to_wake_up(p, TASK_NORMAL, 0);
+       return try_to_wake_up(p, TASK_NORMAL, 0, 1);
  }
  EXPORT_SYMBOL(wake_up_process);
  
  int wake_up_state(struct task_struct *p, unsigned int state)
  {
-       return try_to_wake_up(p, state, 0);
+       return try_to_wake_up(p, state, 0, 1);
  }
  
  /*
@@ -2109,6 +2160,7 @@ void __dl_clear_params(struct task_struct *p)
         dl_se->dl_period = 0;
         dl_se->flags = 0;
         dl_se->dl_bw = 0;
+       dl_se->dl_density = 0;
  
         dl_se->dl_throttled = 0;
         dl_se->dl_new = 1;
@@ -2131,7 +2183,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
         p->se.vruntime                  = 0;
+#ifdef CONFIG_SCHED_WALT
+       p->last_sleep_ts                = 0;
+#endif
+
         INIT_LIST_HEAD(&p->se.group_node);
+       walt_init_new_task_load(p);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.cfs_rq                    = NULL;
+#endif
  
  #ifdef CONFIG_SCHEDSTATS
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2141,6 +2202,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         init_dl_task_timer(&p->dl);
         __dl_clear_params(p);
  
+       init_rt_schedtune_timer(&p->rt);
         INIT_LIST_HEAD(&p->rt.run_list);
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2215,11 +2277,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
  
         __sched_fork(clone_flags, p);
         /*
-        * We mark the process as running here. This guarantees that
+        * We mark the process as NEW here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-       p->state = TASK_RUNNING;
+       p->state = TASK_NEW;
  
         /*
          * Make sure we do not leak PI boosting priority to the child.
@@ -2256,8 +2318,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                 p->sched_class = &fair_sched_class;
         }
  
-       if (p->sched_class->task_fork)
-               p->sched_class->task_fork(p);
+       init_entity_runnable_average(&p->se);
  
         /*
          * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2267,7 +2328,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
          * Silence PROVE_RCU.
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-       set_task_cpu(p, cpu);
+       /*
+        * We're setting the cpu for the first time, we don't migrate,
+        * so use __set_task_cpu().
+        */
+       __set_task_cpu(p, cpu);
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
  #ifdef CONFIG_SCHED_INFO
@@ -2399,6 +2466,10 @@ void wake_up_new_task(struct task_struct *p)
         struct rq *rq;
  
         raw_spin_lock_irqsave(&p->pi_lock, flags);
+       p->state = TASK_RUNNING;
+
+       walt_init_new_task_load(p);
+
         /* Initialize new task's runnable average */
         init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
@@ -2406,12 +2477,18 @@ void wake_up_new_task(struct task_struct *p)
          * Fork balancing, do it here and not earlier because:
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
+        *
+        * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+        * as we're not fully set-up yet.
          */
-       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
  #endif
-
         rq = __task_rq_lock(p);
-       activate_task(rq, p, 0);
+       update_rq_clock(rq);
+       post_init_entity_util_avg(&p->se);
+
+       walt_mark_task_starting(p);
+       activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
         p->on_rq = TASK_ON_RQ_QUEUED;
         trace_sched_wakeup_new(p);
         check_preempt_curr(rq, p, WF_FORK);
@@ -2708,7 +2785,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
-               switch_mm(oldmm, mm, next);
+               switch_mm_irqs_off(oldmm, mm, next);
  
         if (!prev->mm) {
                 prev->active_mm = NULL;
@@ -2792,6 +2869,36 @@ unsigned long nr_iowait_cpu(int cpu)
         return atomic_read(&this->nr_iowait);
  }
  
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+       unsigned int seqcnt;
+       u64 integral;
+       struct rq *q;
+
+       if (cpu >= nr_cpu_ids)
+               return 0;
+
+       q = cpu_rq(cpu);
+
+       /*
+        * Ask do_nr_running_integral() for an up-to-date value so a stale
+        * one is not returned when the run-queue has not changed for a long
+        * time. If a writer raced with that (the seqcount retry check below
+        * fires), read the stored q->nr_running_integral directly instead.
+        */
+
+       seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+       integral = do_nr_running_integral(q);
+       if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+               read_seqcount_begin(&q->ave_seqcnt);
+               integral = q->nr_running_integral;
+       }
+
+       return integral;
+}
+#endif
+
  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  {
         struct rq *rq = this_rq();
@@ -2812,7 +2919,7 @@ void sched_exec(void)
         int dest_cpu;
  
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
         if (dest_cpu == smp_processor_id())
                 goto unlock;
  
@@ -2891,6 +2998,9 @@ void scheduler_tick(void)
         sched_clock_tick();
  
         raw_spin_lock(&rq->lock);
+       walt_set_window_start(rq);
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+                       walt_ktime_clock(), 0);
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         update_cpu_load_active(rq);
@@ -2904,6 +3014,9 @@ void scheduler_tick(void)
         trigger_load_balance(rq);
  #endif
         rq_last_tick_reset(rq);
+
+       if (curr->sched_class == &fair_sched_class)
+               check_for_migration(rq, curr);
  }
  
  #ifdef CONFIG_NO_HZ_FULL
@@ -3131,6 +3244,7 @@ static void __sched notrace __schedule(bool preempt)
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
+       u64 wallclock;
  
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
@@ -3192,11 +3306,18 @@ static void __sched notrace __schedule(bool preempt)
                 update_rq_clock(rq);
  
         next = pick_next_task(rq, prev);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+       walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
         rq->clock_skip_update = 0;
  
         if (likely(prev != next)) {
+#ifdef CONFIG_SCHED_WALT
+               if (!prev->on_rq)
+                       prev->last_sleep_ts = wallclock;
+#endif
                 rq->nr_switches++;
                 rq->curr = next;
                 ++*switch_count;
@@ -3370,7 +3491,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
  int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                           void *key)
  {
-       return try_to_wake_up(curr->private, mode, wake_flags);
+       return try_to_wake_up(curr->private, mode, wake_flags, 1);
  }
  EXPORT_SYMBOL(default_wake_function);
  
@@ -3396,6 +3517,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         BUG_ON(prio > MAX_PRIO);
  
         rq = __task_rq_lock(p);
+       update_rq_clock(rq);
  
         /*
          * Idle task boosting is a nono in general. There is one
@@ -3487,6 +3609,8 @@ void set_user_nice(struct task_struct *p, long nice)
          * the task might be in the middle of scheduling on another CPU.
          */
         rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
+
         /*
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
@@ -3647,6 +3771,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
         dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
         dl_se->flags = attr->sched_flags;
         dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+       dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
  
         /*
          * Changing the parameters of a task is 'tricky' and we're not doing
@@ -3913,6 +4038,7 @@ recheck:
          * runqueue lock must be held.
          */
         rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
  
         /*
          * Changing the policy of the stop threads its a very bad idea
@@ -5018,6 +5144,7 @@ void init_idle(struct task_struct *idle, int cpu)
         raw_spin_lock(&rq->lock);
  
         __sched_fork(0, idle);
+
         idle->state = TASK_RUNNING;
         idle->se.exec_start = sched_clock();
  
@@ -5400,9 +5527,60 @@ set_table_entry(struct ctl_table *entry,
  }
  
  static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(5);
+
+       if (table == NULL)
+               return NULL;
+
+       set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
+                       sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
+                       sge->nr_idle_states*sizeof(struct idle_state), 0644,
+                       proc_doulongvec_minmax, false);
+       set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
+                       sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
+                       sge->nr_cap_states*sizeof(struct capacity_state), 0644,
+                       proc_doulongvec_minmax, false);
+
+       return table;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(2);
+
+       if (table == NULL)
+               return NULL;
+
+       table->procname = kstrdup("energy", GFP_KERNEL);
+       table->mode = 0555;
+       table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
+
+       return table;
+}
+
+static struct ctl_table *
  sd_alloc_ctl_domain_table(struct sched_domain *sd)
  {
-       struct ctl_table *table = sd_alloc_ctl_entry(14);
+       struct ctl_table *table;
+       unsigned int nr_entries = 14;
+
+       int i = 0;
+       struct sched_group *sg = sd->groups;
+
+       if (sg->sge) {
+               int nr_sgs = 0;
+
+               do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+               nr_entries += nr_sgs;
+       }
+
+       table = sd_alloc_ctl_entry(nr_entries);
  
         if (table == NULL)
                 return NULL;
@@ -5435,7 +5613,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                 sizeof(long), 0644, proc_doulongvec_minmax, false);
         set_table_entry(&table[12], "name", sd->name,
                 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[13] is terminator */
+       sg = sd->groups;
+       if (sg->sge) {
+               char buf[32];
+               struct ctl_table *entry = &table[13];
+
+               do {
+                       snprintf(buf, 32, "group%d", i);
+                       entry->procname = kstrdup(buf, GFP_KERNEL);
+                       entry->mode = 0555;
+                       entry->child = sd_alloc_ctl_group_table(sg);
+               } while (entry++, i++, sg = sg->next, sg != sd->groups);
+       }
+       /* &table[nr_entries-1] is terminator */
  
         return table;
  }
@@ -5551,6 +5741,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         switch (action & ~CPU_TASKS_FROZEN) {
  
         case CPU_UP_PREPARE:
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               walt_set_window_start(rq);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
                 rq->calc_load_update = calc_load_update;
                 break;
  
@@ -5570,6 +5763,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 sched_ttwu_pending();
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
+               walt_migrate_sync_cpu(cpu);
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
@@ -5697,9 +5891,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
  
         if (!(sd->flags & SD_LOAD_BALANCE)) {
                 printk("does not load-balance\n");
-               if (sd->parent)
-                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-                                       " has parent");
                 return -1;
         }
  
@@ -5741,7 +5932,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 printk(KERN_CONT " %*pbl",
                        cpumask_pr_args(sched_group_cpus(group)));
                 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
-                       printk(KERN_CONT " (cpu_capacity = %d)",
+                       printk(KERN_CONT " (cpu_capacity = %lu)",
                                 group->sgc->capacity);
                 }
  
@@ -5792,8 +5983,12 @@ static inline bool sched_debug(void)
  
  static int sd_degenerate(struct sched_domain *sd)
  {
-       if (cpumask_weight(sched_domain_span(sd)) == 1)
-               return 1;
+       if (cpumask_weight(sched_domain_span(sd)) == 1) {
+               if (sd->groups->sge)
+                       sd->flags &= ~SD_LOAD_BALANCE;
+               else
+                       return 1;
+       }
  
         /* Following flags need at least 2 groups */
         if (sd->flags & (SD_LOAD_BALANCE |
@@ -5801,8 +5996,10 @@ static int sd_degenerate(struct sched_domain *sd)
                          SD_BALANCE_FORK |
                          SD_BALANCE_EXEC |
                          SD_SHARE_CPUCAPACITY |
+                        SD_ASYM_CPUCAPACITY |
                          SD_SHARE_PKG_RESOURCES |
-                        SD_SHARE_POWERDOMAIN)) {
+                        SD_SHARE_POWERDOMAIN |
+                        SD_SHARE_CAP_STATES)) {
                 if (sd->groups != sd->groups->next)
                         return 0;
         }
@@ -5831,10 +6028,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
                                 SD_BALANCE_EXEC |
+                               SD_ASYM_CPUCAPACITY |
                                 SD_SHARE_CPUCAPACITY |
                                 SD_SHARE_PKG_RESOURCES |
                                 SD_PREFER_SIBLING |
-                               SD_SHARE_POWERDOMAIN);
+                               SD_SHARE_POWERDOMAIN |
+                               SD_SHARE_CAP_STATES);
+               if (parent->groups->sge) {
+                       parent->flags &= ~SD_LOAD_BALANCE;
+                       return 0;
+               }
                 if (nr_node_ids == 1)
                         pflags &= ~SD_SERIALIZE;
         }
@@ -5894,6 +6097,19 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                 call_rcu_sched(&old_rd->rcu, free_rootdomain);
  }
  
+void sched_get_rd(struct root_domain *rd)
+{
+       atomic_inc(&rd->refcount);      /* dropped again by sched_put_rd() */
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+       if (!atomic_dec_and_test(&rd->refcount))
+               return;         /* other references remain */
+
+       call_rcu_sched(&rd->rcu, free_rootdomain); /* last ref: RCU free */
+}
+
  static int init_rootdomain(struct root_domain *rd)
  {
         memset(rd, 0, sizeof(*rd));
@@ -5907,12 +6123,23 @@ static int init_rootdomain(struct root_domain *rd)
         if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_dlo_mask;
  
+#ifdef HAVE_RT_PUSH_IPI
+       rd->rto_cpu = -1;
+       raw_spin_lock_init(&rd->rto_lock);
+       init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
         init_dl_bw(&rd->dl_bw);
         if (cpudl_init(&rd->cpudl) != 0)
                 goto free_dlo_mask;
  
         if (cpupri_init(&rd->cpupri) != 0)
                 goto free_rto_mask;
+
+       init_max_cpu_capacity(&rd->max_cpu_capacity);
+
+       rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
         return 0;
  
  free_rto_mask:
@@ -6018,11 +6245,13 @@ DEFINE_PER_CPU(int, sd_llc_id);
  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
  DEFINE_PER_CPU(struct sched_domain *, sd_busy);
  DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
  
  static void update_top_cache_domain(int cpu)
  {
         struct sched_domain *sd;
-       struct sched_domain *busy_sd = NULL;
+       struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
         int id = cpu;
         int size = 1;
  
@@ -6043,6 +6272,17 @@ static void update_top_cache_domain(int cpu)
  
         sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
         rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+       for_each_domain(cpu, sd) {
+               if (sd->groups->sge)
+                       ea_sd = sd;
+               else
+                       break;
+       }
+       rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+       sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+       rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
  }
  
  /*
@@ -6219,6 +6459,8 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                  * die on a /0 trap.
                  */
                 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+               sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+               sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
  
                 /*
                  * Make sure the first group of this domain contains the
@@ -6348,6 +6590,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
  }
  
  /*
+ * Check that the per-cpu provided sd energy data is consistent for all cpus
+ * within the mask.
+ */
+static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
+                                          const struct cpumask *cpumask)
+{
+       const struct sched_group_energy * const sge = fn(cpu);
+       struct cpumask mask;
+       int i;
+
+       if (cpumask_weight(cpumask) <= 1)
+               return;         /* nothing to compare against */
+       /* Toggle @cpu's bit: leaves the other CPUs if @cpu is in @cpumask. */
+       cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
+
+       for_each_cpu(i, &mask) {
+               const struct sched_group_energy * const e = fn(i);
+               int y;
+
+               BUG_ON(e->nr_idle_states != sge->nr_idle_states);
+
+               for (y = 0; y < (e->nr_idle_states); y++) {
+                       BUG_ON(e->idle_states[y].power !=
+                                       sge->idle_states[y].power);
+               }
+
+               BUG_ON(e->nr_cap_states != sge->nr_cap_states);
+
+               for (y = 0; y < (e->nr_cap_states); y++) {
+                       BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
+                       BUG_ON(e->cap_states[y].power !=
+                                       sge->cap_states[y].power);
+               }
+       }
+}
+
+static void init_sched_energy(int cpu, struct sched_domain *sd,
+                             sched_domain_energy_f fn)
+{
+       if (!(fn && fn(cpu)))
+               return;         /* no energy callback, or no data for cpu */
+
+       if (cpu != group_balance_cpu(sd->groups))
+               return;         /* only the group's balance CPU sets sge */
+
+       if (sd->child && !sd->child->groups->sge) {
+               pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+               pr_err("     energy data on %s but not on %s domain\n",
+                       sd->name, sd->child->name);
+#endif
+               return;         /* child level lacks energy data: give up */
+       }
+
+       check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
+
+       sd->groups->sge = fn(cpu);
+}
+
+/*
   * Initializers for schedule domains
   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   */
@@ -6451,10 +6753,19 @@ static int sched_domains_curr_level;
  /*
   * SD_flags allowed in topology descriptions.
   *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *   SD_SHARE_CAP_STATES    - describes shared capacity states
+ *
+ * Odd one out: besides describing the topology, it has the quirk of also
+ * prescribing the desired behaviour that goes along with it:
   *
   * Odd one out:
   * SD_ASYM_PACKING        - describes SMT quirks
@@ -6464,10 +6775,13 @@ static int sched_domains_curr_level;
          SD_SHARE_PKG_RESOURCES |       \
          SD_NUMA |                      \
          SD_ASYM_PACKING |              \
-        SD_SHARE_POWERDOMAIN)
+        SD_ASYM_CPUCAPACITY |          \
+        SD_SHARE_POWERDOMAIN |         \
+        SD_SHARE_CAP_STATES)
  
  static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+       struct sched_domain *child, int cpu)
  {
         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
         int sd_weight, sd_flags = 0;
@@ -6519,6 +6833,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                 .smt_gain               = 0,
                 .max_newidle_lb_cost    = 0,
                 .next_decay_max_lb_cost = jiffies,
+               .child                  = child,
  #ifdef CONFIG_SCHED_DEBUG
                 .name                   = tl->name,
  #endif
@@ -6528,6 +6843,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
          * Convert topological properties into behaviour.
          */
  
+       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+               struct sched_domain *t = sd;
+
+               for_each_lower_domain(t)
+                       t->flags |= SD_BALANCE_WAKE;
+       }
+
         if (sd->flags & SD_SHARE_CPUCAPACITY) {
                 sd->flags |= SD_PREFER_SIBLING;
                 sd->imbalance_pct = 110;
@@ -6974,16 +7296,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
  {
-       struct sched_domain *sd = sd_init(tl, cpu);
-       if (!sd)
-               return child;
+       struct sched_domain *sd = sd_init(tl, child, cpu);
  
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
                 child->parent = sd;
-               sd->child = child;
  
                 if (!cpumask_subset(sched_domain_span(child),
                                     sched_domain_span(sd))) {
@@ -7031,8 +7350,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
                                 *per_cpu_ptr(d.sd, i) = sd;
                         if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
                                 sd->flags |= SD_OVERLAP;
-                       if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-                               break;
                 }
         }
  
@@ -7052,10 +7369,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
  
         /* Calculate CPU capacity for physical packages and nodes */
         for (i = nr_cpumask_bits-1; i >= 0; i--) {
+               struct sched_domain_topology_level *tl = sched_domain_topology;
+
                 if (!cpumask_test_cpu(i, cpu_map))
                         continue;
  
-               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+                       init_sched_energy(i, sd, tl->energy);
                         claim_allocations(i, sd);
                         init_sched_groups_capacity(i, sd);
                 }
@@ -7064,7 +7384,19 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         /* Attach the domains */
         rcu_read_lock();
         for_each_cpu(i, cpu_map) {
+               int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+               int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
+               if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+                   cpu_rq(max_cpu)->cpu_capacity_orig))
+                       WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+               if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+                   cpu_rq(min_cpu)->cpu_capacity_orig))
+                       WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
                 sd = *per_cpu_ptr(d.sd, i);
+
                 cpu_attach_domain(sd, d.rd, i);
         }
         rcu_read_unlock();
@@ -7475,6 +7807,7 @@ void __init sched_init(void)
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+               rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                 /*
                  * How much cpu bandwidth does root_task_group get?
                  *
@@ -7516,11 +7849,17 @@ void __init sched_init(void)
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
                 rq->push_cpu = 0;
+               rq->push_task = NULL;
                 rq->cpu = i;
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+               rq->cur_irqload = 0;
+               rq->avg_irqload = 0;
+               rq->irqload_ts = 0;
+#endif
  
                 INIT_LIST_HEAD(&rq->cfs_tasks);
  
@@ -7584,6 +7923,14 @@ static inline int preempt_count_equals(int preempt_offset)
         return (nested == preempt_offset);
  }
  
+static int __might_sleep_init_called;  /* 0 until __might_sleep_init() has run */
+int __init __might_sleep_init(void)
+{
+       __might_sleep_init_called = 1;  /* lets ___might_sleep() check while SYSTEM_BOOTING */
+       return 0;
+}
+early_initcall(__might_sleep_init);
+
  void __might_sleep(const char *file, int line, int preempt_offset)
  {
         /*
@@ -7608,8 +7955,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
  
         rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-            !is_idle_task(current)) ||
-           system_state != SYSTEM_RUNNING || oops_in_progress)
+            !is_idle_task(current)) || oops_in_progress)
+               return;
+       if (system_state != SYSTEM_RUNNING &&
+           (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
                 return;
         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                 return;
@@ -7795,11 +8144,9 @@ void sched_destroy_group(struct task_group *tg)
  void sched_offline_group(struct task_group *tg)
  {
         unsigned long flags;
-       int i;
  
         /* end participation in shares distribution */
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
+       unregister_fair_sched_group(tg);
  
         spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
@@ -7807,27 +8154,9 @@ void sched_offline_group(struct task_group *tg)
         spin_unlock_irqrestore(&task_group_lock, flags);
  }
  
-/* change task's runqueue when it moves between groups.
- *     The caller of this function should have put the task in its new group
- *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- *     reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
  {
         struct task_group *tg;
-       int queued, running;
-       unsigned long flags;
-       struct rq *rq;
-
-       rq = task_rq_lock(tsk, &flags);
-
-       running = task_current(rq, tsk);
-       queued = task_on_rq_queued(tsk);
-
-       if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE);
-       if (unlikely(running))
-               put_prev_task(rq, tsk);
  
         /*
          * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7840,11 +8169,37 @@ void sched_move_task(struct task_struct *tsk)
         tsk->sched_task_group = tg;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk);
+       if (tsk->sched_class->task_change_group)
+               tsk->sched_class->task_change_group(tsk, type);
         else
  #endif
                 set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+       int queued, running;
+       unsigned long flags;
+       struct rq *rq;
+
+       rq = task_rq_lock(tsk, &flags);
+
+       running = task_current(rq, tsk);
+       queued = task_on_rq_queued(tsk);
+
+       if (queued)
+               dequeue_task(rq, tsk, DEQUEUE_SAVE);
+       if (unlikely(running))
+               put_prev_task(rq, tsk);
+
+       sched_change_group(tsk, TASK_MOVE_GROUP);
  
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
@@ -8281,15 +8636,28 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
         sched_free_group(tg);
  }
  
+/*
+ * This is called before wake_up_new_task(), therefore we only have to set the
+ * task's group bits under its rq lock; the rest of a move does not apply.
+ */
  static void cpu_cgroup_fork(struct task_struct *task, void *private)
  {
-       sched_move_task(task);
+       unsigned long flags;
+       struct rq *rq;
+
+       rq = task_rq_lock(task, &flags);
+
+       update_rq_clock(rq);
+       sched_change_group(task, TASK_SET_GROUP);
+
+       task_rq_unlock(rq, task, &flags);
  }
  
  static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
+       int ret = 0;
  
         cgroup_taskset_for_each(task, css, tset) {
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -8300,8 +8668,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                 if (task->sched_class != &fair_sched_class)
                         return -EINVAL;
  #endif
+               /*
+                * Serialize against wake_up_new_task() such that if it's
+                * running, we're sure to observe its full state.
+                */
+               raw_spin_lock_irq(&task->pi_lock);
+               /*
+                * Avoid calling sched_move_task() before wake_up_new_task()
+                * has happened. This would lead to problems with PELT, due to
+                * move wanting to detach+attach while we're not attached yet.
+                */
+               if (task->state == TASK_NEW)
+                       ret = -EINVAL;
+               raw_spin_unlock_irq(&task->pi_lock);
+
+               if (ret)
+                       break;
         }
-       return 0;
+       return ret;
  }
  
  static void cpu_cgroup_attach(struct cgroup_taskset *tset)