
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve
     scalability and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by removing rq->wake_list and replacing it
     with ->ttwu_pending (the underlying queueing pattern is sketched
     after this list)

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first.

   - Misc fixes and enhancements"
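
The wake_list removal above boils down to a lock-free producer pattern: a wakeup is pushed onto the target CPU's call_single_queue, and only the push that turns the list from empty to non-empty sends an IPI; later pushes ride on the IPI already in flight, and the receiver dispatches each entry by its CSD type (see kernel/smp.c below). A minimal standalone C11 sketch of that push-and-kick pattern, with toy names (node, queue_add, send_ipi) rather than the kernel's llist/csd API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for llist_node / call_single_queue / the IPI. */
struct node {
        struct node *next;
};

static _Atomic(struct node *) queue_head = NULL;

/* Model of llist_add(): returns true if the list was empty beforehand. */
static bool queue_add(struct node *n)
{
        struct node *first = atomic_load_explicit(&queue_head, memory_order_relaxed);

        do {
                n->next = first;
        } while (!atomic_compare_exchange_weak_explicit(&queue_head, &first, n,
                                                        memory_order_release,
                                                        memory_order_relaxed));
        return first == NULL;
}

static void send_ipi(void)
{
        puts("IPI sent (empty -> non-empty transition)");
}

int main(void)
{
        struct node a, b;

        if (queue_add(&a))      /* list was empty: kick the remote CPU   */
                send_ipi();
        if (queue_add(&b))      /* an IPI is already pending: no new one */
                send_ipi();
        return 0;
}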

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...

include/linux/sched.h
include/linux/smp.h
kernel/cpu.c
kernel/exit.c
kernel/sched/core.c
kernel/smp.c

diff --combined include/linux/sched.h
@@@ -654,6 -654,7 +654,7 @@@ struct task_struct 
  
  #ifdef CONFIG_SMP
        struct llist_node               wake_entry;
+       unsigned int                    wake_entry_type;
        int                             on_cpu;
  #ifdef CONFIG_THREAD_INFO_IN_TASK
        /* Current CPU: */
@@@ -1495,8 -1496,7 +1496,8 @@@ extern struct pid *cad_pid
  #define PF_KSWAPD             0x00020000      /* I am kswapd */
  #define PF_MEMALLOC_NOFS      0x00040000      /* All allocation requests will inherit GFP_NOFS */
  #define PF_MEMALLOC_NOIO      0x00080000      /* All allocation requests will inherit GFP_NOIO */
 -#define PF_LESS_THROTTLE      0x00100000      /* Throttle me less: I clean memory */
 +#define PF_LOCAL_THROTTLE     0x00100000      /* Throttle writes only against the bdi I write to,
 +                                               * I am cleaning dirty pages from some other bdi. */
  #define PF_KTHREAD            0x00200000      /* I am a kernel thread */
  #define PF_RANDOMIZE          0x00400000      /* Randomize virtual address space */
  #define PF_SWAPWRITE          0x00800000      /* Allowed to write to swap */
@@@ -1730,7 -1730,15 +1731,15 @@@ extern char *__get_task_comm(char *to, 
  })
  
  #ifdef CONFIG_SMP
- void scheduler_ipi(void);
+ static __always_inline void scheduler_ipi(void)
+ {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       preempt_fold_need_resched();
+ }
  extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
  #else
  static inline void scheduler_ipi(void) { }
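
The wake_entry/wake_entry_type pair added above is deliberately laid out like struct __call_single_data (an llist node followed by a flags word), which is what allows a task_struct to be linked directly onto the per-CPU call_single_queue; smp_init() in kernel/smp.c below asserts this with BUILD_BUG_ON(). A toy version of the same relative-offset check, using stand-in structs and C11 static_assert in place of BUILD_BUG_ON():

#include <assert.h>
#include <stddef.h>

struct csd_like {                       /* stands in for __call_single_data */
        void *llist;
        unsigned int flags;
};

struct task_like {                      /* stands in for the task_struct fields */
        long other_state;
        void *wake_entry;
        unsigned int wake_entry_type;
};

/* Same relative offset: the type word sits where csd->flags would be. */
static_assert(offsetof(struct task_like, wake_entry_type) -
              offsetof(struct task_like, wake_entry) ==
              offsetof(struct csd_like, flags) -
              offsetof(struct csd_like, llist),
              "wake_entry pair must mirror the csd layout");

int main(void)
{
        return 0;
}
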
diff --combined include/linux/smp.h
  
  typedef void (*smp_call_func_t)(void *info);
  typedef bool (*smp_cond_func_t)(int cpu, void *info);
+ enum {
+       CSD_FLAG_LOCK           = 0x01,
+       /* IRQ_WORK_flags */
+       CSD_TYPE_ASYNC          = 0x00,
+       CSD_TYPE_SYNC           = 0x10,
+       CSD_TYPE_IRQ_WORK       = 0x20,
+       CSD_TYPE_TTWU           = 0x30,
+       CSD_FLAG_TYPE_MASK      = 0xF0,
+ };
+ /*
+  * structure shares (partial) layout with struct irq_work
+  */
  struct __call_single_data {
        struct llist_node llist;
+       unsigned int flags;
        smp_call_func_t func;
        void *info;
-       unsigned int flags;
  };
  
  /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
  typedef struct __call_single_data call_single_data_t
        __aligned(sizeof(struct __call_single_data));
  
+ /*
+  * Enqueue a llist_node on the call_single_queue; be very careful, read
+  * flush_smp_call_function_queue() in detail.
+  */
+ extern void __smp_call_single_queue(int cpu, struct llist_node *node);
  /* total number of cpus in this system (may exceed NR_CPUS) */
  extern unsigned int total_cpus;
  
@@@ -227,8 -249,8 +249,8 @@@ static inline int get_boot_cpu_id(void
   */
  extern void arch_disable_smp_support(void);
  
 -extern void arch_enable_nonboot_cpus_begin(void);
 -extern void arch_enable_nonboot_cpus_end(void);
 +extern void arch_thaw_secondary_cpus_begin(void);
 +extern void arch_thaw_secondary_cpus_end(void);
  
  void smp_setup_processor_id(void);
  
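The enum above packs two independent things into one flags word: the low bits carry CSD_FLAG_LOCK, while the high nibble selects the entry type that kernel/smp.c below dispatches on via flags & CSD_FLAG_TYPE_MASK (the CSD_TYPE() macro). A small standalone sketch of that decode, with the constant values copied from the hunk above and a stand-in helper instead of the real macro:

#include <stdio.h>

enum {
        CSD_FLAG_LOCK           = 0x01, /* lock/busy bit, low nibble */
        CSD_TYPE_ASYNC          = 0x00, /* entry types, high nibble  */
        CSD_TYPE_SYNC           = 0x10,
        CSD_TYPE_IRQ_WORK       = 0x20,
        CSD_TYPE_TTWU           = 0x30,
        CSD_FLAG_TYPE_MASK      = 0xF0,
};

/* Mirrors kernel/smp.c: CSD_TYPE(csd) is csd->flags & CSD_FLAG_TYPE_MASK. */
static unsigned int csd_type(unsigned int flags)
{
        return flags & CSD_FLAG_TYPE_MASK;
}

int main(void)
{
        /* A locked synchronous entry: lock bit and type coexist. */
        unsigned int flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC;

        printf("locked=%u type=0x%02x sync=%d\n",
               flags & CSD_FLAG_LOCK, csd_type(flags),
               csd_type(flags) == CSD_TYPE_SYNC);
        return 0;
}
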
diff --combined kernel/cpu.c
@@@ -3,6 -3,7 +3,7 @@@
   *
   * This code is licenced under the GPL.
   */
+ #include <linux/sched/mm.h>
  #include <linux/proc_fs.h>
  #include <linux/smp.h>
  #include <linux/init.h>
@@@ -432,7 -433,7 +433,7 @@@ static inline bool cpu_smt_allowed(unsi
        /*
         * On x86 it's required to boot all logical CPUs at least once so
         * that the init code can get a chance to set CR4.MCE on each
 -       * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
 +       * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
         * core will shutdown the machine.
         */
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
@@@ -564,6 -565,21 +565,21 @@@ static int bringup_cpu(unsigned int cpu
        return bringup_wait_for_ap(cpu);
  }
  
+ static int finish_cpu(unsigned int cpu)
+ {
+       struct task_struct *idle = idle_thread_get(cpu);
+       struct mm_struct *mm = idle->active_mm;
+       /*
+        * idle_task_exit() will have switched to &init_mm, now
+        * clean up any remaining active_mm state.
+        */
+       if (mm != &init_mm)
+               idle->active_mm = &init_mm;
+       mmdrop(mm);
+       return 0;
+ }
  /*
   * Hotplug state machine related functions
   */
@@@ -1327,7 -1343,7 +1343,7 @@@ void bringup_nonboot_cpus(unsigned int 
  #ifdef CONFIG_PM_SLEEP_SMP
  static cpumask_var_t frozen_cpus;
  
 -int __freeze_secondary_cpus(int primary, bool suspend)
 +int freeze_secondary_cpus(int primary)
  {
        int cpu, error = 0;
  
                if (cpu == primary)
                        continue;
  
 -              if (suspend && pm_wakeup_pending()) {
 +              if (pm_wakeup_pending()) {
                        pr_info("Wakeup pending. Abort CPU freeze\n");
                        error = -EBUSY;
                        break;
  
        /*
         * Make sure the CPUs won't be enabled by someone else. We need to do
 -       * this even in case of failure as all disable_nonboot_cpus() users are
 -       * supposed to do enable_nonboot_cpus() on the failure path.
 +       * this even in case of failure as all freeze_secondary_cpus() users are
 +       * supposed to do thaw_secondary_cpus() on the failure path.
         */
        cpu_hotplug_disabled++;
  
        return error;
  }
  
 -void __weak arch_enable_nonboot_cpus_begin(void)
 +void __weak arch_thaw_secondary_cpus_begin(void)
  {
  }
  
 -void __weak arch_enable_nonboot_cpus_end(void)
 +void __weak arch_thaw_secondary_cpus_end(void)
  {
  }
  
 -void enable_nonboot_cpus(void)
 +void thaw_secondary_cpus(void)
  {
        int cpu, error;
  
  
        pr_info("Enabling non-boot CPUs ...\n");
  
 -      arch_enable_nonboot_cpus_begin();
 +      arch_thaw_secondary_cpus_begin();
  
        for_each_cpu(cpu, frozen_cpus) {
                trace_suspend_resume(TPS("CPU_ON"), cpu, true);
                pr_warn("Error taking CPU%d up: %d\n", cpu, error);
        }
  
 -      arch_enable_nonboot_cpus_end();
 +      arch_thaw_secondary_cpus_end();
  
        cpumask_clear(frozen_cpus);
  out:
@@@ -1549,7 -1565,7 +1565,7 @@@ static struct cpuhp_step cpuhp_hp_state
        [CPUHP_BRINGUP_CPU] = {
                .name                   = "cpu:bringup",
                .startup.single         = bringup_cpu,
-               .teardown.single        = NULL,
+               .teardown.single        = finish_cpu,
                .cant_stop              = true,
        },
        /* Final state before CPU kills itself */
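
finish_cpu() above pairs with the idle_task_exit() change in kernel/sched/core.c further down: the dying CPU switches to init_mm but keeps its active_mm reference, and the CPU driving the hotplug drops that reference only once the victim is fully dead. A compressed model of that reference handoff, using toy types and names rather than the kernel's mm and refcount machinery:

#include <stdio.h>

struct mm { int count; };                       /* toy mm refcount       */
static struct mm init_mm = { 1 };

static void mmdrop(struct mm *mm)
{
        if (--mm->count == 0)
                printf("mm freed\n");
}

static struct mm *active_mm;                    /* idle task's active_mm */

static void idle_task_exit_model(void)          /* runs on the dying CPU */
{
        /* switch_mm() to init_mm happens here, but the old reference is kept */
}

static void finish_cpu_model(void)              /* runs later, on the CPU doing hotplug */
{
        struct mm *mm = active_mm;

        if (mm != &init_mm)
                active_mm = &init_mm;
        mmdrop(mm);                             /* dropped here, not on the dying CPU */
}

int main(void)
{
        struct mm user_mm = { 1 };

        active_mm = &user_mm;
        idle_task_exit_model();
        finish_cpu_model();
        return 0;
}
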
diff --combined kernel/exit.c
@@@ -708,8 -708,12 +708,12 @@@ void __noreturn do_exit(long code
        struct task_struct *tsk = current;
        int group_dead;
  
-       profile_task_exit(tsk);
-       kcov_task_exit(tsk);
+       /*
+        * We can get here from a kernel oops, sometimes with preemption off.
+        * Start by checking for critical errors.
+        * Then fix up important state like USER_DS and preemption.
+        * Then do everything else.
+        */
  
        WARN_ON(blk_needs_flush_plug(tsk));
  
         */
        set_fs(USER_DS);
  
+       if (unlikely(in_atomic())) {
+               pr_info("note: %s[%d] exited with preempt_count %d\n",
+                       current->comm, task_pid_nr(current),
+                       preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
+       profile_task_exit(tsk);
+       kcov_task_exit(tsk);
        ptrace_event(PTRACE_EVENT_EXIT, code);
  
        validate_creds_for_do_exit(tsk);
  
        exit_signals(tsk);  /* sets PF_EXITING */
  
-       if (unlikely(in_atomic())) {
-               pr_info("note: %s[%d] exited with preempt_count %d\n",
-                       current->comm, task_pid_nr(current),
-                       preempt_count());
-               preempt_count_set(PREEMPT_ENABLED);
-       }
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
                sync_mm_rss(tsk->mm);
@@@ -1558,7 -1565,7 +1565,7 @@@ SYSCALL_DEFINE5(waitid, int, which, pid
        if (!infop)
                return err;
  
 -      if (!user_access_begin(infop, sizeof(*infop)))
 +      if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;
  
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
 -      user_access_end();
 +      user_write_access_end();
        return err;
  Efault:
 -      user_access_end();
 +      user_write_access_end();
        return -EFAULT;
  }
  
@@@ -1685,7 -1692,7 +1692,7 @@@ COMPAT_SYSCALL_DEFINE5(waitid
        if (!infop)
                return err;
  
 -      if (!user_access_begin(infop, sizeof(*infop)))
 +      if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;
  
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
 -      user_access_end();
 +      user_write_access_end();
        return err;
  Efault:
 -      user_access_end();
 +      user_write_access_end();
        return -EFAULT;
  }
  #endif
diff --combined kernel/sched/core.c
@@@ -11,7 -11,6 +11,7 @@@
  #include <linux/nospec.h>
  
  #include <linux/kcov.h>
 +#include <linux/scs.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@@ -21,6 -20,7 +21,7 @@@
  #include "../smpboot.h"
  
  #include "pelt.h"
+ #include "smp.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@@ -220,6 -220,13 +221,13 @@@ void update_rq_clock(struct rq *rq
        update_rq_clock_task(rq, delta);
  }
  
+ static inline void
+ rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+ {
+       csd->flags = 0;
+       csd->func = func;
+       csd->info = rq;
+ }
  
  #ifdef CONFIG_SCHED_HRTICK
  /*
@@@ -315,16 -322,14 +323,14 @@@ void hrtick_start(struct rq *rq, u64 de
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
  static void hrtick_rq_init(struct rq *rq)
  {
  #ifdef CONFIG_SMP
-       rq->hrtick_csd.flags = 0;
-       rq->hrtick_csd.func = __hrtick_start;
-       rq->hrtick_csd.info = rq;
+       rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
  #endif
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
@@@ -633,29 -638,23 +639,23 @@@ void wake_up_nohz_cpu(int cpu
                wake_up_idle_cpu(cpu);
  }
  
- static inline bool got_nohz_idle_kick(void)
+ static void nohz_csd_func(void *info)
  {
-       int cpu = smp_processor_id();
-       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
-               return false;
-       if (idle_cpu(cpu) && !need_resched())
-               return true;
+       struct rq *rq = info;
+       int cpu = cpu_of(rq);
+       unsigned int flags;
  
        /*
-        * We can't run Idle Load Balance on this CPU for this time so we
-        * cancel it and clear NOHZ_BALANCE_KICK
+        * Release the rq::nohz_csd.
         */
-       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
-       return false;
- }
+       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+       WARN_ON(!(flags & NOHZ_KICK_MASK));
  
- #else /* CONFIG_NO_HZ_COMMON */
- static inline bool got_nohz_idle_kick(void)
- {
-       return false;
+       rq->idle_balance = idle_cpu(cpu);
+       if (rq->idle_balance && !need_resched()) {
+               rq->nohz_idle_balance = flags;
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+       }
  }
  
  #endif /* CONFIG_NO_HZ_COMMON */
@@@ -1540,7 -1539,7 +1540,7 @@@ static int migration_cpu_stop(void *dat
         * __migrate_task() such that we will not miss enforcing cpus_ptr
         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
         */
-       sched_ttwu_pending();
+       flush_smp_call_function_from_idle();
  
        raw_spin_lock(&p->pi_lock);
        rq_lock(rq, &rf);
@@@ -2274,16 -2273,23 +2274,23 @@@ static int ttwu_remote(struct task_stru
  }
  
  #ifdef CONFIG_SMP
- void sched_ttwu_pending(void)
+ void sched_ttwu_pending(void *arg)
  {
+       struct llist_node *llist = arg;
        struct rq *rq = this_rq();
-       struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p, *t;
        struct rq_flags rf;
  
        if (!llist)
                return;
  
+       /*
+        * rq::ttwu_pending is a racy indication of outstanding wakeups.
+        * Races are such that false negatives are possible, since they
+        * are shorter-lived than false positives would be.
+        */
+       WRITE_ONCE(rq->ttwu_pending, 0);
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
  
        rq_unlock_irqrestore(rq, &rf);
  }
  
- void scheduler_ipi(void)
+ void send_call_function_single_ipi(int cpu)
  {
-       /*
-        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-        * TIF_NEED_RESCHED remotely (for the first time) will also send
-        * this IPI.
-        */
-       preempt_fold_need_resched();
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-               return;
-       /*
-        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-        * traditionally all their work was done from the interrupt return
-        * path. Now that we actually do some work, we need to make sure
-        * we do call them.
-        *
-        * Some archs already do call them, luckily irq_enter/exit nest
-        * properly.
-        *
-        * Arguably we should visit all archs and update all handlers,
-        * however a fair share of IPIs are still resched only so this would
-        * somewhat pessimize the simple resched case.
-        */
-       irq_enter();
-       sched_ttwu_pending();
+       struct rq *rq = cpu_rq(cpu);
  
-       /*
-        * Check if someone kicked us for doing the nohz idle load balance.
-        */
-       if (unlikely(got_nohz_idle_kick())) {
-               this_rq()->idle_balance = 1;
-               raise_softirq_irqoff(SCHED_SOFTIRQ);
-       }
-       irq_exit();
+       if (!set_nr_if_polling(rq->idle))
+               arch_send_call_function_single_ipi(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
  }
  
- static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+ /*
+  * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+  * necessary. The wakee CPU on receipt of the IPI will queue the task
+  * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+  * of the wakeup instead of the waker.
+  */
+ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
  {
        struct rq *rq = cpu_rq(cpu);
  
        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
  
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-               if (!set_nr_if_polling(rq->idle))
-                       smp_send_reschedule(cpu);
-               else
-                       trace_sched_wake_idle_without_ipi(cpu);
-       }
+       WRITE_ONCE(rq->ttwu_pending, 1);
+       __smp_call_single_queue(cpu, &p->wake_entry);
  }
  
  void wake_up_if_idle(int cpu)
@@@ -2373,6 -2353,38 +2354,38 @@@ bool cpus_share_cache(int this_cpu, in
  {
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
  }
+ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+ {
+       /*
+        * If the CPU does not share cache, then queue the task on the
+        * remote rqs wakelist to avoid accessing remote data.
+        */
+       if (!cpus_share_cache(smp_processor_id(), cpu))
+               return true;
+       /*
+        * If the task is descheduling and the only running task on the
+        * CPU then use the wakelist to offload the task activation to
+        * the soon-to-be-idle CPU as the current CPU is likely busy.
+        * nr_running is checked to avoid unnecessary task stacking.
+        */
+       if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+               return true;
+       return false;
+ }
+ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+ {
+       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+               __ttwu_queue_wakelist(p, cpu, wake_flags);
+               return true;
+       }
+       return false;
+ }
  #endif /* CONFIG_SMP */
  
  static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        struct rq_flags rf;
  
  #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-               ttwu_queue_remote(p, cpu, wake_flags);
+       if (ttwu_queue_wakelist(p, cpu, wake_flags))
                return;
-       }
  #endif
  
        rq_lock(rq, &rf);
@@@ -2569,7 -2578,15 +2579,15 @@@ try_to_wake_up(struct task_struct *p, u
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto unlock;
  
+       if (p->in_iowait) {
+               delayacct_blkio_end(p);
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
  #ifdef CONFIG_SMP
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
        /*
         * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
         * possible to, falsely, observe p->on_cpu == 0.
  
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
+        * this task as prev, consider queueing p on the remote CPUs wake_list
+        * which potentially sends an IPI instead of spinning on p->on_cpu to
+        * let the waker make forward progress. This is safe because IRQs are
+        * disabled and the IPI will deliver after on_cpu is cleared.
+        */
+       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+               goto unlock;
+       /*
+        * If the owning (remote) CPU is still in the middle of schedule() with
         * this task as prev, wait until its done referencing the task.
         *
         * Pairs with the smp_store_release() in finish_task().
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
  
-       p->sched_contributes_to_load = !!task_contributes_to_load(p);
-       p->state = TASK_WAKING;
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                psi_ttwu_dequeue(p);
                set_task_cpu(p, cpu);
        }
- #else /* CONFIG_SMP */
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
  #endif /* CONFIG_SMP */
  
        ttwu_queue(p, cpu, wake_flags);
@@@ -2751,6 -2762,9 +2763,9 @@@ static void __sched_fork(unsigned long 
        p->capture_control = NULL;
  #endif
        init_numa_balancing(clone_flags, p);
+ #ifdef CONFIG_SMP
+       p->wake_entry_type = CSD_TYPE_TTWU;
+ #endif
  }
  
  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@@ -3926,9 -3940,6 +3941,9 @@@ static inline void schedule_debug(struc
  #ifdef CONFIG_SCHED_STACK_END_CHECK
        if (task_stack_end_corrupted(prev))
                panic("corrupted stack end detected inside scheduler\n");
 +
 +      if (task_scs_end_corrupted(prev))
 +              panic("corrupted shadow stack detected inside scheduler\n");
  #endif
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        schedstat_inc(this_rq()->sched_count);
  }
  
+ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+                                 struct rq_flags *rf)
+ {
+ #ifdef CONFIG_SMP
+       const struct sched_class *class;
+       /*
+        * We must do the balancing pass before put_prev_task(), such
+        * that when we release the rq->lock the task is in the same
+        * state as before we took rq->lock.
+        *
+        * We can terminate the balance pass as soon as we know there is
+        * a runnable task of @class priority or higher.
+        */
+       for_class_range(class, prev->sched_class, &idle_sched_class) {
+               if (class->balance(rq, prev, rf))
+                       break;
+       }
+ #endif
+       put_prev_task(rq, prev);
+ }
  /*
   * Pick up the highest-prio task:
   */
@@@ -3984,22 -4017,7 +4021,7 @@@ pick_next_task(struct rq *rq, struct ta
        }
  
  restart:
- #ifdef CONFIG_SMP
-       /*
-        * We must do the balancing pass before put_next_task(), such
-        * that when we release the rq->lock the task is in the same
-        * state as before we took rq->lock.
-        *
-        * We can terminate the balance pass as soon as we know there is
-        * a runnable task of @class priority or higher.
-        */
-       for_class_range(class, prev->sched_class, &idle_sched_class) {
-               if (class->balance(rq, prev, rf))
-                       break;
-       }
- #endif
-       put_prev_task(rq, prev);
+       put_prev_task_balance(rq, prev, rf);
  
        for_each_class(class) {
                p = class->pick_next_task(rq);
@@@ -4689,7 -4707,7 +4711,7 @@@ int idle_cpu(int cpu
                return 0;
  
  #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                return 0;
  #endif
  
@@@ -6092,7 -6110,6 +6114,7 @@@ void init_idle(struct task_struct *idle
        idle->se.exec_start = sched_clock();
        idle->flags |= PF_IDLE;
  
 +      scs_task_reset(idle);
        kasan_unpoison_task_stack(idle);
  
  #ifdef CONFIG_SMP
@@@ -6243,13 -6260,14 +6265,14 @@@ void idle_task_exit(void
        struct mm_struct *mm = current->active_mm;
  
        BUG_ON(cpu_online(smp_processor_id()));
+       BUG_ON(current != this_rq()->idle);
  
        if (mm != &init_mm) {
                switch_mm(mm, &init_mm, current);
-               current->active_mm = &init_mm;
                finish_arch_post_lock_switch();
        }
-       mmdrop(mm);
+       /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
  }
  
  /*
@@@ -6539,7 -6557,6 +6562,6 @@@ int sched_cpu_dying(unsigned int cpu
        struct rq_flags rf;
  
        /* Handle pending wakeups and then migrate everything off */
-       sched_ttwu_pending();
        sched_tick_stop(cpu);
  
        rq_lock_irqsave(rq, &rf);
@@@ -6642,6 -6659,8 +6664,8 @@@ void __init sched_init(void
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
  
+               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                init_rt_rq(&rq->rt);
                init_dl_rq(&rq->dl);
  #ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                /*
                 * We achieve this by letting root_task_group's tasks sit
                 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
-               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_NO_HZ_COMMON
                rq->last_blocked_load_update_tick = jiffies;
                atomic_set(&rq->nohz_flags, 0);
+               rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
  #endif
  #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
@@@ -7438,6 -7457,8 +7462,8 @@@ static DEFINE_MUTEX(cfs_constraints_mut
  
  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
  static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+ /* More than 203 days if BW_SHIFT equals 20. */
+ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
  
  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
  
@@@ -7466,6 -7487,12 +7492,12 @@@ static int tg_set_cfs_bandwidth(struct 
                return -EINVAL;
  
        /*
+        * Bound quota to defend quota against overflow during bandwidth shift.
+        */
+       if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+               return -EINVAL;
+       /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
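
The max_cfs_runtime bound above is where the "More than 203 days" comment comes from: the user-supplied quota is capped so that the microsecond-denominated value shifted by BW_SHIFT in the schedulability check cannot overflow 64 bits. A quick worked check of the arithmetic, assuming the MAX_BW definition added to kernel/sched/sched.h elsewhere in this series (BW_SHIFT = 20, MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1, in microseconds):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Assumed from kernel/sched/sched.h in this series, not this hunk. */
        const unsigned int BW_SHIFT = 20;
        const uint64_t NSEC_PER_USEC = 1000;
        const uint64_t MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1;  /* microseconds */

        uint64_t max_cfs_runtime = MAX_BW * NSEC_PER_USEC;      /* nanoseconds  */
        double days = (double)MAX_BW / 1e6 / 86400.0;

        printf("max_cfs_runtime = %llu ns, i.e. ~%.1f days\n",
               (unsigned long long)max_cfs_runtime, days);
        /* Prints roughly 203.6 days, matching the comment in the hunk above. */
        return 0;
}
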
diff --combined kernel/smp.c
  #include <linux/hypervisor.h>
  
  #include "smpboot.h"
+ #include "sched/smp.h"
  
- enum {
-       CSD_FLAG_LOCK           = 0x01,
-       CSD_FLAG_SYNCHRONOUS    = 0x02,
- };
+ #define CSD_TYPE(_csd)        ((_csd)->flags & CSD_FLAG_TYPE_MASK)
  
  struct call_function_data {
        call_single_data_t      __percpu *csd;
@@@ -84,6 -82,7 +82,7 @@@ int smpcfd_dying_cpu(unsigned int cpu
         * still pending.
         */
        flush_smp_call_function_queue(false);
+       irq_work_run();
        return 0;
  }
  
@@@ -134,15 -133,33 +133,33 @@@ static __always_inline void csd_unlock(
  
  static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
  
+ void __smp_call_single_queue(int cpu, struct llist_node *node)
+ {
+       /*
+        * The list addition should be visible before sending the IPI
+        * handler locks the list to pull the entry off it because of
+        * normal cache coherency rules implied by spinlocks.
+        *
+        * If IPIs can go out of order to the cache coherency protocol
+        * in an architecture, sufficient synchronisation should be added
+        * to arch code to make it appear to obey cache coherency WRT
+        * locking and barrier primitives. Generic code isn't really
+        * equipped to do the right thing...
+        */
+       if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+               send_call_function_single_ipi(cpu);
+ }
  /*
   * Insert a previously allocated call_single_data_t element
   * for execution on the given CPU. data must already have
   * ->func, ->info, and ->flags set.
   */
- static int generic_exec_single(int cpu, call_single_data_t *csd,
-                              smp_call_func_t func, void *info)
+ static int generic_exec_single(int cpu, call_single_data_t *csd)
  {
        if (cpu == smp_processor_id()) {
+               smp_call_func_t func = csd->func;
+               void *info = csd->info;
                unsigned long flags;
  
                /*
                return 0;
        }
  
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }
  
-       csd->func = func;
-       csd->info = info;
-       /*
-        * The list addition should be visible before sending the IPI
-        * handler locks the list to pull the entry off it because of
-        * normal cache coherency rules implied by spinlocks.
-        *
-        * If IPIs can go out of order to the cache coherency protocol
-        * in an architecture, sufficient synchronisation should be added
-        * to arch code to make it appear to obey cache coherency WRT
-        * locking and barrier primitives. Generic code isn't really
-        * equipped to do the right thing...
-        */
-       if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-               arch_send_call_function_single_ipi(cpu);
+       __smp_call_single_queue(cpu, &csd->llist);
  
        return 0;
  }
@@@ -209,9 -210,9 +210,9 @@@ void generic_smp_call_function_single_i
   */
  static void flush_smp_call_function_queue(bool warn_cpu_offline)
  {
-       struct llist_head *head;
-       struct llist_node *entry;
        call_single_data_t *csd, *csd_next;
+       struct llist_node *entry, *prev;
+       struct llist_head *head;
        static bool warned;
  
        lockdep_assert_irqs_disabled();
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
-               llist_for_each_entry(csd, entry, llist)
-                       pr_warn("IPI callback %pS sent to offline CPU\n",
-                               csd->func);
+               llist_for_each_entry(csd, entry, llist) {
+                       switch (CSD_TYPE(csd)) {
+                       case CSD_TYPE_ASYNC:
+                       case CSD_TYPE_SYNC:
+                       case CSD_TYPE_IRQ_WORK:
+                               pr_warn("IPI callback %pS sent to offline CPU\n",
+                                       csd->func);
+                               break;
+                       case CSD_TYPE_TTWU:
+                               pr_warn("IPI task-wakeup sent to offline CPU\n");
+                               break;
+                       default:
+                               pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+                                       CSD_TYPE(csd));
+                               break;
+                       }
+               }
        }
  
+       /*
+        * First; run all SYNC callbacks, people are waiting for us.
+        */
+       prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-               smp_call_func_t func = csd->func;
-               void *info = csd->info;
                /* Do we wait until *after* callback? */
-               if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+               if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+                       smp_call_func_t func = csd->func;
+                       void *info = csd->info;
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
                        func(info);
                        csd_unlock(csd);
                } else {
-                       csd_unlock(csd);
-                       func(info);
+                       prev = &csd->llist;
                }
        }
  
+       if (!entry)
+               return;
        /*
-        * Handle irq works queued remotely by irq_work_queue_on().
-        * Smp functions above are typically synchronous so they
-        * better run first since some other CPUs may be busy waiting
-        * for them.
+        * Second; run all !SYNC callbacks.
         */
-       irq_work_run();
+       prev = NULL;
+       llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+               int type = CSD_TYPE(csd);
+               if (type != CSD_TYPE_TTWU) {
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
+                       if (type == CSD_TYPE_ASYNC) {
+                               smp_call_func_t func = csd->func;
+                               void *info = csd->info;
+                               csd_unlock(csd);
+                               func(info);
+                       } else if (type == CSD_TYPE_IRQ_WORK) {
+                               irq_work_single(csd);
+                       }
+               } else {
+                       prev = &csd->llist;
+               }
+       }
+       /*
+        * Third; only CSD_TYPE_TTWU is left, issue those.
+        */
+       if (entry)
+               sched_ttwu_pending(entry);
+ }
+ void flush_smp_call_function_from_idle(void)
+ {
+       unsigned long flags;
+       if (llist_empty(this_cpu_ptr(&call_single_queue)))
+               return;
+       local_irq_save(flags);
+       flush_smp_call_function_queue(true);
+       local_irq_restore(flags);
  }
  
  /*
@@@ -271,7 -339,7 +339,7 @@@ int smp_call_function_single(int cpu, s
  {
        call_single_data_t *csd;
        call_single_data_t csd_stack = {
-               .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+               .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
        };
        int this_cpu;
        int err;
                csd_lock(csd);
        }
  
-       err = generic_exec_single(cpu, csd, func, info);
+       csd->func = func;
+       csd->info = info;
+       err = generic_exec_single(cpu, csd);
  
        if (wait)
                csd_lock_wait(csd);
@@@ -351,7 -422,7 +422,7 @@@ int smp_call_function_single_async(int 
        csd->flags = CSD_FLAG_LOCK;
        smp_wmb();
  
-       err = generic_exec_single(cpu, csd, csd->func, csd->info);
+       err = generic_exec_single(cpu, csd);
  
  out:
        preempt_enable();
@@@ -466,7 -537,7 +537,7 @@@ static void smp_call_function_many_cond
  
                csd_lock(csd);
                if (wait)
-                       csd->flags |= CSD_FLAG_SYNCHRONOUS;
+                       csd->flags |= CSD_TYPE_SYNC;
                csd->func = func;
                csd->info = info;
                if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@@ -598,6 -669,24 +669,24 @@@ void __init smp_init(void
  {
        int num_nodes, num_cpus;
  
+       /*
+        * Ensure struct irq_work layout matches so that
+        * flush_smp_call_function_queue() can do horrible things.
+        */
+       BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+                    offsetof(struct __call_single_data, llist));
+       BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+                    offsetof(struct __call_single_data, func));
+       BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+                    offsetof(struct __call_single_data, flags));
+       /*
+        * Assert the CSD_TYPE_TTWU layout is similar enough
+        * for task_struct to be on the @call_single_queue.
+        */
+       BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+                    offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
        idle_threads_init();
        cpuhp_threads_init();
  
   * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
   * of local_irq_disable/enable().
   */
 -void on_each_cpu(void (*func) (void *info), void *info, int wait)
 +void on_each_cpu(smp_call_func_t func, void *info, int wait)
  {
        unsigned long flags;
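
For reference, the reworked flush_smp_call_function_queue() above depends on unlinking entries from a singly linked list while walking it: entries handled in the current pass are spliced out via a prev pointer, the rest stay linked for the next pass (SYNC first, then ASYNC and irq_work, finally the batched TTWU wakeups). A standalone sketch of that splice-while-iterating pattern, using plain pointers instead of llist_node and two passes instead of three:

#include <stdio.h>
#include <stddef.h>

struct entry {
        struct entry *next;
        int type;                       /* stand-in for CSD_TYPE(csd) */
};

/* Handle entries matching @type now; return the list of leftovers. */
static struct entry *run_type(struct entry *head, int type)
{
        struct entry *e = head, *next, *prev = NULL;

        while (e) {
                next = e->next;
                if (e->type == type) {
                        if (prev)
                                prev->next = next;  /* unlink from the middle */
                        else
                                head = next;        /* unlink from the front  */
                        printf("handled entry of type %d\n", e->type);
                } else {
                        prev = e;                   /* keep it for a later pass */
                }
                e = next;
        }
        return head;
}

int main(void)
{
        struct entry c = { NULL, 2 }, b = { &c, 1 }, a = { &b, 1 };
        struct entry *head = &a;

        head = run_type(head, 1);       /* first pass, e.g. the SYNC entries */
        head = run_type(head, 2);       /* later pass for everything else    */
        return 0;
}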