Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

[tomoyo/tomoyo-test1.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 5724577..f9a1346 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq)
  {
         struct hrtimer *timer = &rq->hrtick_timer;
  
-       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay)
          */
         delay = max_t(u64, delay, 10000LL);
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
-                     HRTIMER_MODE_REL_PINNED);
+                     HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
@@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq)
         rq->hrtick_csd.info = rq;
  #endif
  
-       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
         rq->hrtick_timer.function = hrtick;
  }
  #else  /* CONFIG_SCHED_HRTICK */
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
  }
  
  #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
  }
  
  static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
  {
         /*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_none(UCLAMP_MIN);
  }
  
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
  {
         /* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
  }
  
  static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                  unsigned int clamp_value)
  {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       struct uclamp_se uc_max;
+
+       /*
+        * Tasks in autogroups or root task group will be
+        * restricted by system defaults.
+        */
+       if (task_group_is_autogroup(task_group(p)))
+               return uc_req;
+       if (task_group(p) == &root_task_group)
+               return uc_req;
+
+       uc_max = task_group(p)->uclamp[clamp_id];
+       if (uc_req.value > uc_max.value || !uc_req.user_defined)
+               return uc_max;
+#endif
+
+       return uc_req;
+}
+
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ *   group or in an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
-       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+       struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
  
         /* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
         return uc_req;
  }
  
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
         struct uclamp_se uc_eff;
  
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
                 uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        *
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * If the task is not yet RUNNABLE and its task_struct is not
+        * affecting a valid clamp bucket, the next time it's enqueued,
+        * it will already see the updated clamp bucket value.
+        */
+       if (!p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+
+       task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                          unsigned int clamps)
+{
+       enum uclamp_id clamp_id;
+       struct css_task_iter it;
+       struct task_struct *p;
+
+       css_task_iter_start(css, 0, &it);
+       while ((p = css_task_iter_next(&it))) {
+               for_each_clamp_id(clamp_id) {
+                       if ((0x1 << clamp_id) & clamps)
+                               uclamp_update_active(p, clamp_id);
+               }
+       }
+       css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+       struct task_group *tg = &root_task_group;
+
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+                     sysctl_sched_uclamp_util_min, false);
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+                     sysctl_sched_uclamp_util_max, false);
+
+       rcu_read_lock();
+       cpu_util_update_eff(&root_task_group.css);
+       rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
  {
+       bool update_root_tg = false;
         int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
         int result;
  
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
  
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
         if (old_min != sysctl_sched_uclamp_util_min) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                               sysctl_sched_uclamp_util_min, false);
+               update_root_tg = true;
         }
         if (old_max != sysctl_sched_uclamp_util_max) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                               sysctl_sched_uclamp_util_max, false);
+               update_root_tg = true;
         }
  
+       if (update_root_tg)
+               uclamp_update_root_tg();
+
         /*
-        * Updating all the RUNNABLE task is expensive, keep it simple and do
-        * just a lazy update at each next enqueue time.
+        * We update all RUNNABLE tasks only when task groups are in use.
+        * Otherwise, keep it simple and do just a lazy update at each next
+        * task enqueue time.
          */
+
         goto done;
  
  undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
  done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
  
         return result;
  }
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
  static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         /*
          * On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
  
  static void uclamp_fork(struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
  static void __init init_uclamp(void)
  {
         struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
         int cpu;
  
+       mutex_init(&uclamp_mutex);
+
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
  
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+               root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+       }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  }
  
  /*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
                struct task_struct *next, struct rq_flags *rf)
  {
-       struct mm_struct *mm, *oldmm;
-
         prepare_task_switch(rq, prev, next);
  
-       mm = next->mm;
-       oldmm = prev->active_mm;
         /*
          * For paravirt, this is coupled with an exit in switch_to to
          * combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
         arch_start_context_switch(prev);
  
         /*
-        * If mm is non-NULL, we pass through switch_mm(). If mm is
-        * NULL, we will pass through mmdrop() in finish_task_switch().
-        * Both of these contain the full memory barrier required by
-        * membarrier after storing to rq->curr, before returning to
-        * user-space.
+        * kernel -> kernel   lazy + transfer active
+        *   user -> kernel   lazy + mmgrab() active
+        *
+        * kernel ->   user   switch + mmdrop() active
+        *   user ->   user   switch
          */
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       if (!next->mm) {                                // to kernel
+               enter_lazy_tlb(prev->active_mm, next);
+
+               next->active_mm = prev->active_mm;
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+               else
+                       prev->active_mm = NULL;
+       } else {                                        // to user
+               /*
+                * sys_membarrier() requires an smp_mb() between setting
+                * rq->curr and returning to userspace.
+                *
+                * The below provides this either through switch_mm(), or in
+                * case 'prev->active_mm == next->mm' through
+                * finish_task_switch()'s mmdrop().
+                */
  
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+               if (!prev->mm) {                        // from kernel
+                       /* will mmdrop() in finish_task_switch(). */
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+               }
         }
  
         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3486,8 +3616,36 @@ void scheduler_tick(void)
  
  struct tick_work {
         int                     cpu;
+       atomic_t                state;
         struct delayed_work     work;
  };
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE      0
+#define TICK_SCHED_REMOTE_OFFLINING    1
+#define TICK_SCHED_REMOTE_RUNNING      2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ *          TICK_SCHED_REMOTE_OFFLINE
+ *                    |   ^
+ *                    |   |
+ *                    |   | sched_tick_remote()
+ *                    |   |
+ *                    |   |
+ *                    +--TICK_SCHED_REMOTE_OFFLINING
+ *                    |   ^
+ *                    |   |
+ * sched_tick_start() |   | sched_tick_stop()
+ *                    |   |
+ *                    V   |
+ *          TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
  
  static struct tick_work __percpu *tick_work_cpu;
  
@@ -3500,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)
         struct task_struct *curr;
         struct rq_flags rf;
         u64 delta;
+       int os;
  
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@ -3513,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)
  
         rq_lock_irq(rq, &rf);
         curr = rq->curr;
-       if (is_idle_task(curr))
+       if (is_idle_task(curr) || cpu_is_offline(cpu))
                 goto out_unlock;
  
         update_rq_clock(rq);
@@ -3533,13 +3692,18 @@ out_requeue:
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
-        * to keep scheduler internal stats reasonably up to date.
+        * to keep scheduler internal stats reasonably up to date.  But
+        * first update state to reflect hotplug activity if required.
          */
-       queue_delayed_work(system_unbound_wq, dwork, HZ);
+       os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+       WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+       if (os == TICK_SCHED_REMOTE_RUNNING)
+               queue_delayed_work(system_unbound_wq, dwork, HZ);
  }
  
  static void sched_tick_start(int cpu)
  {
+       int os;
         struct tick_work *twork;
  
         if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu)
         WARN_ON_ONCE(!tick_work_cpu);
  
         twork = per_cpu_ptr(tick_work_cpu, cpu);
-       twork->cpu = cpu;
-       INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
-       queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+       os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+       WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+       if (os == TICK_SCHED_REMOTE_OFFLINE) {
+               twork->cpu = cpu;
+               INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+               queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+       }
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
  static void sched_tick_stop(int cpu)
  {
         struct tick_work *twork;
+       int os;
  
         if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                 return;
@@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu)
         WARN_ON_ONCE(!tick_work_cpu);
  
         twork = per_cpu_ptr(tick_work_cpu, cpu);
-       cancel_delayed_work_sync(&twork->work);
+       /* There cannot be competing actions, but don't rely on stop-machine. */
+       os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+       WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+       /* Don't cancel, as this would mess up the state machine. */
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void)
  {
         tick_work_cpu = alloc_percpu(struct tick_work);
         BUG_ON(!tick_work_cpu);
-
         return 0;
  }
  
@@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@ -3748,7 +3919,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
                 p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
-                       goto again;
+                       goto restart;
  
                 /* Assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
@@ -3757,14 +3928,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                 return p;
         }
  
-again:
+restart:
+       /*
+        * Ensure that we put DL/RT tasks before the pick loop, such that they
+        * can PULL higher prio tasks when we lower the RQ 'priority'.
+        */
+       prev->sched_class->put_prev_task(rq, prev, rf);
+       if (!rq->nr_running)
+               newidle_balance(rq, rf);
+
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev, rf);
-               if (p) {
-                       if (unlikely(p == RETRY_TASK))
-                               goto again;
+               p = class->pick_next_task(rq, NULL, NULL);
+               if (p)
                         return p;
-               }
         }
  
         /* The idle class should always have a runnable task: */
@@ -3791,7 +3967,7 @@ again:
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
- *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3800,7 +3976,7 @@ again:
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
- *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@ -3913,7 +4089,7 @@ void __noreturn do_task_dead(void)
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
-       if (!tsk->state || tsk_is_pi_blocked(tsk))
+       if (!tsk->state)
                 return;
  
         /*
@@ -3929,6 +4105,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
                 preempt_enable_no_resched();
         }
  
+       if (tsk_is_pi_blocked(tsk))
+               return;
+
         /*
          * If we are going to sleep and we have plugged IO queued,
          * make sure to submit it to avoid deadlocks.
@@ -4042,7 +4221,7 @@ static void __sched notrace preempt_schedule_common(void)
         } while (need_resched());
  }
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4114,7 +4293,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@ -4282,7 +4461,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
         if (queued)
                 enqueue_task(rq, p, queue_flag);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@ -4349,7 +4528,7 @@ void set_user_nice(struct task_struct *p, long nice)
                         resched_curr(rq);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  out_unlock:
         task_rq_unlock(rq, p, &rf);
  }
@@ -4666,6 +4845,9 @@ recheck:
                         return retval;
         }
  
+       if (pi)
+               cpuset_read_lock();
+
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
@@ -4680,8 +4862,8 @@ recheck:
          * Changing the policy of the stop threads its a very bad idea:
          */
         if (p == rq->stop) {
-               task_rq_unlock(rq, p, &rf);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto unlock;
         }
  
         /*
@@ -4699,8 +4881,8 @@ recheck:
                         goto change;
  
                 p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &rf);
-               return 0;
+               retval = 0;
+               goto unlock;
         }
  change:
  
@@ -4713,8 +4895,8 @@ change:
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &rf);
-                       return -EPERM;
+                       retval = -EPERM;
+                       goto unlock;
                 }
  #endif
  #ifdef CONFIG_SMP
@@ -4729,8 +4911,8 @@ change:
                          */
                         if (!cpumask_subset(span, p->cpus_ptr) ||
                             rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &rf);
-                               return -EPERM;
+                               retval = -EPERM;
+                               goto unlock;
                         }
                 }
  #endif
@@ -4740,6 +4922,8 @@ change:
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
+               if (pi)
+                       cpuset_read_unlock();
                 goto recheck;
         }
  
@@ -4749,8 +4933,8 @@ change:
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &rf);
-               return -EBUSY;
+               retval = -EBUSY;
+               goto unlock;
         }
  
         p->sched_reset_on_fork = reset_on_fork;
@@ -4792,7 +4976,7 @@ change:
                 enqueue_task(rq, p, queue_flags);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
         check_class_changed(rq, p, prev_class, oldprio);
  
@@ -4800,14 +4984,22 @@ change:
         preempt_disable();
         task_rq_unlock(rq, p, &rf);
  
-       if (pi)
+       if (pi) {
+               cpuset_read_unlock();
                 rt_mutex_adjust_pi(p);
+       }
  
         /* Run balance callbacks after we've adjusted the PI chain: */
         balance_callback(rq);
         preempt_enable();
  
         return 0;
+
+unlock:
+       task_rq_unlock(rq, p, &rf);
+       if (pi)
+               cpuset_read_unlock();
+       return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4891,10 +5083,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
         rcu_read_lock();
         retval = -ESRCH;
         p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
+       if (likely(p))
+               get_task_struct(p);
         rcu_read_unlock();
  
+       if (likely(p)) {
+               retval = sched_setscheduler(p, policy, &lparam);
+               put_task_struct(p);
+       }
+
         return retval;
  }
  
@@ -5111,37 +5308,40 @@ out_unlock:
         return retval;
  }
  
-static int sched_read_attr(struct sched_attr __user *uattr,
-                          struct sched_attr *attr,
-                          unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+                       struct sched_attr *kattr,
+                       unsigned int usize)
  {
-       int ret;
+       unsigned int ksize = sizeof(*kattr);
  
         if (!access_ok(uattr, usize))
                 return -EFAULT;
  
         /*
-        * If we're handed a smaller struct than we know of,
-        * ensure all the unknown bits are 0 - i.e. old
-        * user-space does not get uncomplete information.
+        * sched_getattr() ABI forwards and backwards compatibility:
+        *
+        * If usize == ksize then we just copy everything to user-space and all is good.
+        *
+        * If usize < ksize then we only copy as much as user-space has space for,
+        * this keeps ABI compatibility as well. We skip the rest.
+        *
+        * If usize > ksize then user-space is using a newer version of the ABI,
+        * which part the kernel doesn't know about. Just ignore it - tooling can
+        * detect the kernel's knowledge of attributes from the attr->size value
+        * which is set to ksize in this case.
          */
-       if (usize < sizeof(*attr)) {
-               unsigned char *addr;
-               unsigned char *end;
-
-               addr = (void *)attr + usize;
-               end  = (void *)attr + sizeof(*attr);
-
-               for (; addr < end; addr++) {
-                       if (*addr)
-                               return -EFBIG;
-               }
-
-               attr->size = usize;
-       }
+       kattr->size = min(usize, ksize);
  
-       ret = copy_to_user(uattr, attr, attr->size);
-       if (ret)
+       if (copy_to_user(uattr, kattr, kattr->size))
                 return -EFAULT;
  
         return 0;
@@ -5151,20 +5351,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
   * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   * @pid: the pid in question.
   * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
   * @flags: for future extension.
   */
  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
-               unsigned int, size, unsigned int, flags)
+               unsigned int, usize, unsigned int, flags)
  {
-       struct sched_attr attr = {
-               .size = sizeof(struct sched_attr),
-       };
+       struct sched_attr kattr = { };
         struct task_struct *p;
         int retval;
  
-       if (!uattr || pid < 0 || size > PAGE_SIZE ||
-           size < SCHED_ATTR_SIZE_VER0 || flags)
+       if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+           usize < SCHED_ATTR_SIZE_VER0 || flags)
                 return -EINVAL;
  
         rcu_read_lock();
@@ -5177,25 +5375,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
         if (retval)
                 goto out_unlock;
  
-       attr.sched_policy = p->policy;
+       kattr.sched_policy = p->policy;
         if (p->sched_reset_on_fork)
-               attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+               kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
         if (task_has_dl_policy(p))
-               __getparam_dl(p, &attr);
+               __getparam_dl(p, &kattr);
         else if (task_has_rt_policy(p))
-               attr.sched_priority = p->rt_priority;
+               kattr.sched_priority = p->rt_priority;
         else
-               attr.sched_nice = task_nice(p);
+               kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
-       attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
-       attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+       kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+       kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
  
         rcu_read_unlock();
  
-       retval = sched_read_attr(uattr, &attr, size);
-       return retval;
+       return sched_attr_copy_to_user(uattr, &kattr, usize);
  
  out_unlock:
         rcu_read_unlock();
@@ -5425,7 +5622,7 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
         if (should_resched(0)) {
@@ -5442,7 +5639,7 @@ EXPORT_SYMBOL(_cond_resched);
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@ -5981,7 +6178,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
         task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@ -6021,21 +6218,22 @@ static void calc_load_migrate(struct rq *rq)
                 atomic_long_add(delta, &calc_load_tasks);
  }
  
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
-}
+       const struct sched_class *class;
+       struct task_struct *next;
  
-static const struct sched_class fake_sched_class = {
-       .put_prev_task = put_prev_task_fake,
-};
+       for_each_class(class) {
+               next = class->pick_next_task(rq, NULL, NULL);
+               if (next) {
+                       next->sched_class->put_prev_task(rq, next, NULL);
+                       return next;
+               }
+       }
  
-static struct task_struct fake_task = {
-       /*
-        * Avoid pull_{rt,dl}_task()
-        */
-       .prio = MAX_PRIO + 1,
-       .sched_class = &fake_sched_class,
-};
+       /* The idle class should always have a runnable task */
+       BUG();
+}
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6078,12 +6276,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
                 if (rq->nr_running == 1)
                         break;
  
-               /*
-                * pick_next_task() assumes pinned rq->lock:
-                */
-               next = pick_next_task(rq, &fake_task, rf);
-               BUG_ON(!next);
-               put_prev_task(rq, next);
+               next = __pick_migrate_task(rq);
  
                 /*
                  * Rules for changing task_struct::cpus_mask are holding
@@ -6380,19 +6573,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
  
  void __init sched_init(void)
  {
-       unsigned long alloc_size = 0, ptr;
+       unsigned long ptr = 0;
         int i;
  
         wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+       if (ptr) {
+               ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.se = (struct sched_entity **)ptr;
@@ -6711,7 +6904,7 @@ struct task_struct *curr_task(int cpu)
  
  #ifdef CONFIG_IA64
  /**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@ -6736,6 +6929,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       enum uclamp_id clamp_id;
+
+       for_each_clamp_id(clamp_id) {
+               uclamp_se_set(&tg->uclamp_req[clamp_id],
+                             uclamp_none(clamp_id), false);
+               tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+       }
+#endif
+}
+
  static void sched_free_group(struct task_group *tg)
  {
         free_fair_sched_group(tg);
@@ -6759,6 +6966,8 @@ struct task_group *sched_create_group(struct task_group *parent)
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
  
+       alloc_uclamp_sched_group(tg, parent);
+
         return tg;
  
  err:
@@ -6862,7 +7071,7 @@ void sched_move_task(struct task_struct *tsk)
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
         if (running)
-               set_curr_task(rq, tsk);
+               set_next_task(rq, tsk);
  
         task_rq_unlock(rq, tsk, &rf);
  }
@@ -6945,10 +7154,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
-#else
-               /* We don't support RT-tasks being in separate groups */
-               if (task->sched_class != &fair_sched_class)
-                       return -EINVAL;
  #endif
                 /*
                  * Serialize against wake_up_new_task() such that if its
@@ -6979,6 +7184,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
                 sched_move_task(task);
  }
  
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+       struct cgroup_subsys_state *top_css = css;
+       struct uclamp_se *uc_parent = NULL;
+       struct uclamp_se *uc_se = NULL;
+       unsigned int eff[UCLAMP_CNT];
+       enum uclamp_id clamp_id;
+       unsigned int clamps;
+
+       css_for_each_descendant_pre(css, top_css) {
+               uc_parent = css_tg(css)->parent
+                       ? css_tg(css)->parent->uclamp : NULL;
+
+               for_each_clamp_id(clamp_id) {
+                       /* Assume effective clamps matches requested clamps */
+                       eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+                       /* Cap effective clamps with parent's effective clamps */
+                       if (uc_parent &&
+                           eff[clamp_id] > uc_parent[clamp_id].value) {
+                               eff[clamp_id] = uc_parent[clamp_id].value;
+                       }
+               }
+               /* Ensure protection is always capped by limit */
+               eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+               /* Propagate most restrictive effective clamps */
+               clamps = 0x0;
+               uc_se = css_tg(css)->uclamp;
+               for_each_clamp_id(clamp_id) {
+                       if (eff[clamp_id] == uc_se[clamp_id].value)
+                               continue;
+                       uc_se[clamp_id].value = eff[clamp_id];
+                       uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+                       clamps |= (0x1 << clamp_id);
+               }
+               if (!clamps) {
+                       css = css_rightmost_descendant(css);
+                       continue;
+               }
+
+               /* Immediately update descendants RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
+       }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT   2
+#define UCLAMP_PERCENT_SCALE   (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       s64 percent;
+       u64 util;
+       int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               if (req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+
+       return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+{
+       struct uclamp_request req;
+       struct task_group *tg;
+
+       req = capacity_from_percent(buf);
+       if (req.ret)
+               return req.ret;
+
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+
+       tg = css_tg(of_css(of));
+       if (tg->uclamp_req[clamp_id].value != req.util)
+               uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+       /*
+        * Because of not recoverable conversion rounding we keep track of the
+        * exact requested value
+        */
+       tg->uclamp_pct[clamp_id] = req.percent;
+
+       /* Update effective clamps to track the most restrictive value */
+       cpu_util_update_eff(of_css(of));
+
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+
+       return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+{
+       struct task_group *tg;
+       u64 util_clamp;
+       u64 percent;
+       u32 rem;
+
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       util_clamp = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+
+       if (util_clamp == SCHED_CAPACITY_SCALE) {
+               seq_puts(sf, "max\n");
+               return;
+       }
+
+       percent = tg->uclamp_pct[clamp_id];
+       percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+       seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MIN);
+       return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MAX);
+       return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@ -7324,6 +7701,20 @@ static struct cftype cpu_legacy_files[] = {
                 .write_u64 = cpu_rt_period_write_uint,
         },
  #endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
+#endif
         { }     /* Terminate */
  };
  
@@ -7491,6 +7882,20 @@ static struct cftype cpu_files[] = {
                 .write = cpu_max_write,
         },
  #endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
+#endif
         { }     /* terminate */
  };