
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve
     scalability and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by removing rq->wake_list and replacing it
     with ->ttwu_pending (the underlying queueing pattern is sketched
     after this list)

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first.

   - Misc fixes and enhancements"
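
The wake_list removal above boils down to a lock-free producer pattern: a wakeup is pushed onto the target CPU's call_single_queue, and only the push that turns the list from empty to non-empty sends an IPI; later pushes ride on the IPI already in flight, and the receiver dispatches each entry by its CSD type (see kernel/smp.c below). A minimal standalone C11 sketch of that push-and-kick pattern, with toy names (node, queue_add, send_ipi) rather than the kernel's llist/csd API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for llist_node / call_single_queue / the IPI. */
struct node {
        struct node *next;
};

static _Atomic(struct node *) queue_head = NULL;

/* Model of llist_add(): returns true if the list was empty beforehand. */
static bool queue_add(struct node *n)
{
        struct node *first = atomic_load_explicit(&queue_head, memory_order_relaxed);

        do {
                n->next = first;
        } while (!atomic_compare_exchange_weak_explicit(&queue_head, &first, n,
                                                        memory_order_release,
                                                        memory_order_relaxed));
        return first == NULL;
}

static void send_ipi(void)
{
        puts("IPI sent (empty -> non-empty transition)");
}

int main(void)
{
        struct node a, b;

        if (queue_add(&a))      /* list was empty: kick the remote CPU   */
                send_ipi();
        if (queue_add(&b))      /* an IPI is already pending: no new one */
                send_ipi();
        return 0;
}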

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...

include/linux/sched.h
include/linux/smp.h
kernel/cpu.c
kernel/exit.c
kernel/sched/core.c
kernel/smp.c

diff --combined include/linux/sched.h
@@@ -654,6 -654,7 +654,7 @@@ struct task_struct 
  
  #ifdef CONFIG_SMP
        struct llist_node               wake_entry;
+       unsigned int                    wake_entry_type;
        int                             on_cpu;
  #ifdef CONFIG_THREAD_INFO_IN_TASK
        /* Current CPU: */
@@@ -1495,8 -1496,7 +1496,8 @@@ extern struct pid *cad_pid
  #define PF_KSWAPD             0x00020000      /* I am kswapd */
  #define PF_MEMALLOC_NOFS      0x00040000      /* All allocation requests will inherit GFP_NOFS */
  #define PF_MEMALLOC_NOIO      0x00080000      /* All allocation requests will inherit GFP_NOIO */
 -#define PF_LESS_THROTTLE      0x00100000      /* Throttle me less: I clean memory */
 +#define PF_LOCAL_THROTTLE     0x00100000      /* Throttle writes only against the bdi I write to,
 +                                               * I am cleaning dirty pages from some other bdi. */
  #define PF_KTHREAD            0x00200000      /* I am a kernel thread */
  #define PF_RANDOMIZE          0x00400000      /* Randomize virtual address space */
  #define PF_SWAPWRITE          0x00800000      /* Allowed to write to swap */
@@@ -1730,7 -1730,15 +1731,15 @@@ extern char *__get_task_comm(char *to, 
  })
  
  #ifdef CONFIG_SMP
- void scheduler_ipi(void);
+ static __always_inline void scheduler_ipi(void)
+ {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       preempt_fold_need_resched();
+ }
  extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
  #else
  static inline void scheduler_ipi(void) { }
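
The wake_entry/wake_entry_type pair added above is deliberately laid out like struct __call_single_data (an llist node followed by a flags word), which is what allows a task_struct to be linked directly onto the per-CPU call_single_queue; smp_init() in kernel/smp.c below asserts this with BUILD_BUG_ON(). A toy version of the same relative-offset check, using stand-in structs and C11 static_assert in place of BUILD_BUG_ON():

#include <assert.h>
#include <stddef.h>

struct csd_like {                       /* stands in for __call_single_data */
        void *llist;
        unsigned int flags;
};

struct task_like {                      /* stands in for the task_struct fields */
        long other_state;
        void *wake_entry;
        unsigned int wake_entry_type;
};

/* Same relative offset: the type word sits where csd->flags would be. */
static_assert(offsetof(struct task_like, wake_entry_type) -
              offsetof(struct task_like, wake_entry) ==
              offsetof(struct csd_like, flags) -
              offsetof(struct csd_like, llist),
              "wake_entry pair must mirror the csd layout");

int main(void)
{
        return 0;
}
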
diff --combined include/linux/smp.h
  
  typedef void (*smp_call_func_t)(void *info);
  typedef bool (*smp_cond_func_t)(int cpu, void *info);
+ enum {
+       CSD_FLAG_LOCK           = 0x01,
+       /* IRQ_WORK_flags */
+       CSD_TYPE_ASYNC          = 0x00,
+       CSD_TYPE_SYNC           = 0x10,
+       CSD_TYPE_IRQ_WORK       = 0x20,
+       CSD_TYPE_TTWU           = 0x30,
+       CSD_FLAG_TYPE_MASK      = 0xF0,
+ };
+ /*
+  * structure shares (partial) layout with struct irq_work
+  */
  struct __call_single_data {
        struct llist_node llist;
+       unsigned int flags;
        smp_call_func_t func;
        void *info;
-       unsigned int flags;
  };
  
  /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
  typedef struct __call_single_data call_single_data_t
        __aligned(sizeof(struct __call_single_data));
  
+ /*
+  * Enqueue a llist_node on the call_single_queue; be very careful, read
+  * flush_smp_call_function_queue() in detail.
+  */
+ extern void __smp_call_single_queue(int cpu, struct llist_node *node);
  /* total number of cpus in this system (may exceed NR_CPUS) */
  extern unsigned int total_cpus;
  
@@@ -227,8 -249,8 +249,8 @@@ static inline int get_boot_cpu_id(void
   */
  extern void arch_disable_smp_support(void);
  
 -extern void arch_enable_nonboot_cpus_begin(void);
 -extern void arch_enable_nonboot_cpus_end(void);
 +extern void arch_thaw_secondary_cpus_begin(void);
 +extern void arch_thaw_secondary_cpus_end(void);
  
  void smp_setup_processor_id(void);
  
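The enum above packs two independent things into one flags word: the low bits carry CSD_FLAG_LOCK, while the high nibble selects the entry type that kernel/smp.c below dispatches on via flags & CSD_FLAG_TYPE_MASK (the CSD_TYPE() macro). A small standalone sketch of that decode, with the constant values copied from the hunk above and a stand-in helper instead of the real macro:

#include <stdio.h>

enum {
        CSD_FLAG_LOCK           = 0x01, /* lock/busy bit, low nibble */
        CSD_TYPE_ASYNC          = 0x00, /* entry types, high nibble  */
        CSD_TYPE_SYNC           = 0x10,
        CSD_TYPE_IRQ_WORK       = 0x20,
        CSD_TYPE_TTWU           = 0x30,
        CSD_FLAG_TYPE_MASK      = 0xF0,
};

/* Mirrors kernel/smp.c: CSD_TYPE(csd) is csd->flags & CSD_FLAG_TYPE_MASK. */
static unsigned int csd_type(unsigned int flags)
{
        return flags & CSD_FLAG_TYPE_MASK;
}

int main(void)
{
        /* A locked synchronous entry: lock bit and type coexist. */
        unsigned int flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC;

        printf("locked=%u type=0x%02x sync=%d\n",
               flags & CSD_FLAG_LOCK, csd_type(flags),
               csd_type(flags) == CSD_TYPE_SYNC);
        return 0;
}
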
diff --combined kernel/cpu.c
@@@ -3,6 -3,7 +3,7 @@@
   *
   * This code is licenced under the GPL.
   */
+ #include <linux/sched/mm.h>
  #include <linux/proc_fs.h>
  #include <linux/smp.h>
  #include <linux/init.h>
@@@ -432,7 -433,7 +433,7 @@@ static inline bool cpu_smt_allowed(unsi
        /*
         * On x86 it's required to boot all logical CPUs at least once so
         * that the init code can get a chance to set CR4.MCE on each
 -       * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
 +       * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
         * core will shutdown the machine.
         */
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
@@@ -564,6 -565,21 +565,21 @@@ static int bringup_cpu(unsigned int cpu
        return bringup_wait_for_ap(cpu);
  }
  
+ static int finish_cpu(unsigned int cpu)
+ {
+       struct task_struct *idle = idle_thread_get(cpu);
+       struct mm_struct *mm = idle->active_mm;
+       /*
+        * idle_task_exit() will have switched to &init_mm, now
+        * clean up any remaining active_mm state.
+        */
+       if (mm != &init_mm)
+               idle->active_mm = &init_mm;
+       mmdrop(mm);
+       return 0;
+ }
  /*
   * Hotplug state machine related functions
   */
@@@ -1327,7 -1343,7 +1343,7 @@@ void bringup_nonboot_cpus(unsigned int 
  #ifdef CONFIG_PM_SLEEP_SMP
  static cpumask_var_t frozen_cpus;
  
 -int __freeze_secondary_cpus(int primary, bool suspend)
 +int freeze_secondary_cpus(int primary)
  {
        int cpu, error = 0;
  
                if (cpu == primary)
                        continue;
  
 -              if (suspend && pm_wakeup_pending()) {
 +              if (pm_wakeup_pending()) {
                        pr_info("Wakeup pending. Abort CPU freeze\n");
                        error = -EBUSY;
                        break;
  
        /*
         * Make sure the CPUs won't be enabled by someone else. We need to do
 -       * this even in case of failure as all disable_nonboot_cpus() users are
 -       * supposed to do enable_nonboot_cpus() on the failure path.
 +       * this even in case of failure as all freeze_secondary_cpus() users are
 +       * supposed to do thaw_secondary_cpus() on the failure path.
         */
        cpu_hotplug_disabled++;
  
        return error;
  }
  
 -void __weak arch_enable_nonboot_cpus_begin(void)
 +void __weak arch_thaw_secondary_cpus_begin(void)
  {
  }
  
 -void __weak arch_enable_nonboot_cpus_end(void)
 +void __weak arch_thaw_secondary_cpus_end(void)
  {
  }
  
 -void enable_nonboot_cpus(void)
 +void thaw_secondary_cpus(void)
  {
        int cpu, error;
  
  
        pr_info("Enabling non-boot CPUs ...\n");
  
 -      arch_enable_nonboot_cpus_begin();
 +      arch_thaw_secondary_cpus_begin();
  
        for_each_cpu(cpu, frozen_cpus) {
                trace_suspend_resume(TPS("CPU_ON"), cpu, true);
                pr_warn("Error taking CPU%d up: %d\n", cpu, error);
        }
  
 -      arch_enable_nonboot_cpus_end();
 +      arch_thaw_secondary_cpus_end();
  
        cpumask_clear(frozen_cpus);
  out:
@@@ -1549,7 -1565,7 +1565,7 @@@ static struct cpuhp_step cpuhp_hp_state
        [CPUHP_BRINGUP_CPU] = {
                .name                   = "cpu:bringup",
                .startup.single         = bringup_cpu,
-               .teardown.single        = NULL,
+               .teardown.single        = finish_cpu,
                .cant_stop              = true,
        },
        /* Final state before CPU kills itself */
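
finish_cpu() above pairs with the idle_task_exit() change in kernel/sched/core.c further down: the dying CPU switches to init_mm but keeps its active_mm reference, and the CPU driving the hotplug drops that reference only once the victim is fully dead. A compressed model of that reference handoff, using toy types and names rather than the kernel's mm and refcount machinery:

#include <stdio.h>

struct mm { int count; };                       /* toy mm refcount       */
static struct mm init_mm = { 1 };

static void mmdrop(struct mm *mm)
{
        if (--mm->count == 0)
                printf("mm freed\n");
}

static struct mm *active_mm;                    /* idle task's active_mm */

static void idle_task_exit_model(void)          /* runs on the dying CPU */
{
        /* switch_mm() to init_mm happens here, but the old reference is kept */
}

static void finish_cpu_model(void)              /* runs later, on the CPU doing hotplug */
{
        struct mm *mm = active_mm;

        if (mm != &init_mm)
                active_mm = &init_mm;
        mmdrop(mm);                             /* dropped here, not on the dying CPU */
}

int main(void)
{
        struct mm user_mm = { 1 };

        active_mm = &user_mm;
        idle_task_exit_model();
        finish_cpu_model();
        return 0;
}
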
diff --combined kernel/exit.c
@@@ -708,8 -708,12 +708,12 @@@ void __noreturn do_exit(long code
        struct task_struct *tsk = current;
        int group_dead;
  
-       profile_task_exit(tsk);
-       kcov_task_exit(tsk);
+       /*
+        * We can get here from a kernel oops, sometimes with preemption off.
+        * Start by checking for critical errors.
+        * Then fix up important state like USER_DS and preemption.
+        * Then do everything else.
+        */
  
        WARN_ON(blk_needs_flush_plug(tsk));
  
         */
        set_fs(USER_DS);
  
+       if (unlikely(in_atomic())) {
+               pr_info("note: %s[%d] exited with preempt_count %d\n",
+                       current->comm, task_pid_nr(current),
+                       preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
+       profile_task_exit(tsk);
+       kcov_task_exit(tsk);
        ptrace_event(PTRACE_EVENT_EXIT, code);
  
        validate_creds_for_do_exit(tsk);
  
        exit_signals(tsk);  /* sets PF_EXITING */
  
-       if (unlikely(in_atomic())) {
-               pr_info("note: %s[%d] exited with preempt_count %d\n",
-                       current->comm, task_pid_nr(current),
-                       preempt_count());
-               preempt_count_set(PREEMPT_ENABLED);
-       }
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
                sync_mm_rss(tsk->mm);
@@@ -1558,7 -1565,7 +1565,7 @@@ SYSCALL_DEFINE5(waitid, int, which, pid
        if (!infop)
                return err;
  
 -      if (!user_access_begin(infop, sizeof(*infop)))
 +      if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;
  
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
 -      user_access_end();
 +      user_write_access_end();
        return err;
  Efault:
 -      user_access_end();
 +      user_write_access_end();
        return -EFAULT;
  }
  
@@@ -1685,7 -1692,7 +1692,7 @@@ COMPAT_SYSCALL_DEFINE5(waitid
        if (!infop)
                return err;
  
 -      if (!user_access_begin(infop, sizeof(*infop)))
 +      if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;
  
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
 -      user_access_end();
 +      user_write_access_end();
        return err;
  Efault:
 -      user_access_end();
 +      user_write_access_end();
        return -EFAULT;
  }
  #endif
diff --combined kernel/sched/core.c
@@@ -11,7 -11,6 +11,7 @@@
  #include <linux/nospec.h>
  
  #include <linux/kcov.h>
 +#include <linux/scs.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@@ -21,6 -20,7 +21,7 @@@
  #include "../smpboot.h"
  
  #include "pelt.h"
+ #include "smp.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@@ -220,6 -220,13 +221,13 @@@ void update_rq_clock(struct rq *rq
        update_rq_clock_task(rq, delta);
  }
  
+ static inline void
+ rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+ {
+       csd->flags = 0;
+       csd->func = func;
+       csd->info = rq;
+ }
  
  #ifdef CONFIG_SCHED_HRTICK
  /*
@@@ -315,16 -322,14 +323,14 @@@ void hrtick_start(struct rq *rq, u64 de
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
  static void hrtick_rq_init(struct rq *rq)
  {
  #ifdef CONFIG_SMP
-       rq->hrtick_csd.flags = 0;
-       rq->hrtick_csd.func = __hrtick_start;
-       rq->hrtick_csd.info = rq;
+       rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
  #endif
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
@@@ -633,29 -638,23 +639,23 @@@ void wake_up_nohz_cpu(int cpu
                wake_up_idle_cpu(cpu);
  }
  
- static inline bool got_nohz_idle_kick(void)
+ static void nohz_csd_func(void *info)
  {
-       int cpu = smp_processor_id();
-       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
-               return false;
-       if (idle_cpu(cpu) && !need_resched())
-               return true;
+       struct rq *rq = info;
+       int cpu = cpu_of(rq);
+       unsigned int flags;
  
        /*
-        * We can't run Idle Load Balance on this CPU for this time so we
-        * cancel it and clear NOHZ_BALANCE_KICK
+        * Release the rq::nohz_csd.
         */
-       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
-       return false;
- }
+       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+       WARN_ON(!(flags & NOHZ_KICK_MASK));
  
- #else /* CONFIG_NO_HZ_COMMON */
- static inline bool got_nohz_idle_kick(void)
- {
-       return false;
+       rq->idle_balance = idle_cpu(cpu);
+       if (rq->idle_balance && !need_resched()) {
+               rq->nohz_idle_balance = flags;
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+       }
  }
  
  #endif /* CONFIG_NO_HZ_COMMON */
@@@ -1540,7 -1539,7 +1540,7 @@@ static int migration_cpu_stop(void *dat
         * __migrate_task() such that we will not miss enforcing cpus_ptr
         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
         */
-       sched_ttwu_pending();
+       flush_smp_call_function_from_idle();
  
        raw_spin_lock(&p->pi_lock);
        rq_lock(rq, &rf);
@@@ -2274,16 -2273,23 +2274,23 @@@ static int ttwu_remote(struct task_stru
  }
  
  #ifdef CONFIG_SMP
- void sched_ttwu_pending(void)
+ void sched_ttwu_pending(void *arg)
  {
+       struct llist_node *llist = arg;
        struct rq *rq = this_rq();
-       struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p, *t;
        struct rq_flags rf;
  
        if (!llist)
                return;
  
+       /*
+        * rq::ttwu_pending is a racy indication of outstanding wakeups.
+        * Races are such that false negatives are possible, since they
+        * are shorter-lived than false positives would be.
+        */
+       WRITE_ONCE(rq->ttwu_pending, 0);
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
  
        rq_unlock_irqrestore(rq, &rf);
  }
  
- void scheduler_ipi(void)
+ void send_call_function_single_ipi(int cpu)
  {
-       /*
-        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-        * TIF_NEED_RESCHED remotely (for the first time) will also send
-        * this IPI.
-        */
-       preempt_fold_need_resched();
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-               return;
-       /*
-        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-        * traditionally all their work was done from the interrupt return
-        * path. Now that we actually do some work, we need to make sure
-        * we do call them.
-        *
-        * Some archs already do call them, luckily irq_enter/exit nest
-        * properly.
-        *
-        * Arguably we should visit all archs and update all handlers,
-        * however a fair share of IPIs are still resched only so this would
-        * somewhat pessimize the simple resched case.
-        */
-       irq_enter();
-       sched_ttwu_pending();
+       struct rq *rq = cpu_rq(cpu);
  
-       /*
-        * Check if someone kicked us for doing the nohz idle load balance.
-        */
-       if (unlikely(got_nohz_idle_kick())) {
-               this_rq()->idle_balance = 1;
-               raise_softirq_irqoff(SCHED_SOFTIRQ);
-       }
-       irq_exit();
+       if (!set_nr_if_polling(rq->idle))
+               arch_send_call_function_single_ipi(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
  }
  
- static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+ /*
+  * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+  * necessary. The wakee CPU on receipt of the IPI will queue the task
+  * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+  * of the wakeup instead of the waker.
+  */
+ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
  {
        struct rq *rq = cpu_rq(cpu);
  
        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
  
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-               if (!set_nr_if_polling(rq->idle))
-                       smp_send_reschedule(cpu);
-               else
-                       trace_sched_wake_idle_without_ipi(cpu);
-       }
+       WRITE_ONCE(rq->ttwu_pending, 1);
+       __smp_call_single_queue(cpu, &p->wake_entry);
  }
  
  void wake_up_if_idle(int cpu)
@@@ -2373,6 -2353,38 +2354,38 @@@ bool cpus_share_cache(int this_cpu, in
  {
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
  }
+ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+ {
+       /*
+        * If the CPU does not share cache, then queue the task on the
+        * remote rqs wakelist to avoid accessing remote data.
+        */
+       if (!cpus_share_cache(smp_processor_id(), cpu))
+               return true;
+       /*
+        * If the task is descheduling and the only running task on the
+        * CPU then use the wakelist to offload the task activation to
+        * the soon-to-be-idle CPU as the current CPU is likely busy.
+        * nr_running is checked to avoid unnecessary task stacking.
+        */
+       if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+               return true;
+       return false;
+ }
+ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+ {
+       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+               __ttwu_queue_wakelist(p, cpu, wake_flags);
+               return true;
+       }
+       return false;
+ }
  #endif /* CONFIG_SMP */
  
  static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        struct rq_flags rf;
  
  #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-               ttwu_queue_remote(p, cpu, wake_flags);
+       if (ttwu_queue_wakelist(p, cpu, wake_flags))
                return;
-       }
  #endif
  
        rq_lock(rq, &rf);
@@@ -2569,7 -2578,15 +2579,15 @@@ try_to_wake_up(struct task_struct *p, u
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto unlock;
  
+       if (p->in_iowait) {
+               delayacct_blkio_end(p);
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
  #ifdef CONFIG_SMP
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
        /*
         * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
         * possible to, falsely, observe p->on_cpu == 0.
  
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
+        * this task as prev, consider queueing p on the remote CPUs wake_list
+        * which potentially sends an IPI instead of spinning on p->on_cpu to
+        * let the waker make forward progress. This is safe because IRQs are
+        * disabled and the IPI will deliver after on_cpu is cleared.
+        */
+       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+               goto unlock;
+       /*
+        * If the owning (remote) CPU is still in the middle of schedule() with
         * this task as prev, wait until its done referencing the task.
         *
         * Pairs with the smp_store_release() in finish_task().
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
  
-       p->sched_contributes_to_load = !!task_contributes_to_load(p);
-       p->state = TASK_WAKING;
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                psi_ttwu_dequeue(p);
                set_task_cpu(p, cpu);
        }
- #else /* CONFIG_SMP */
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
  #endif /* CONFIG_SMP */
  
        ttwu_queue(p, cpu, wake_flags);
@@@ -2751,6 -2762,9 +2763,9 @@@ static void __sched_fork(unsigned long 
        p->capture_control = NULL;
  #endif
        init_numa_balancing(clone_flags, p);
+ #ifdef CONFIG_SMP
+       p->wake_entry_type = CSD_TYPE_TTWU;
+ #endif
  }
  
  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@@ -3926,9 -3940,6 +3941,9 @@@ static inline void schedule_debug(struc
  #ifdef CONFIG_SCHED_STACK_END_CHECK
        if (task_stack_end_corrupted(prev))
                panic("corrupted stack end detected inside scheduler\n");
 +
 +      if (task_scs_end_corrupted(prev))
 +              panic("corrupted shadow stack detected inside scheduler\n");
  #endif
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        schedstat_inc(this_rq()->sched_count);
  }
  
+ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+                                 struct rq_flags *rf)
+ {
+ #ifdef CONFIG_SMP
+       const struct sched_class *class;
+       /*
+        * We must do the balancing pass before put_prev_task(), such
+        * that when we release the rq->lock the task is in the same
+        * state as before we took rq->lock.
+        *
+        * We can terminate the balance pass as soon as we know there is
+        * a runnable task of @class priority or higher.
+        */
+       for_class_range(class, prev->sched_class, &idle_sched_class) {
+               if (class->balance(rq, prev, rf))
+                       break;
+       }
+ #endif
+       put_prev_task(rq, prev);
+ }
  /*
   * Pick up the highest-prio task:
   */
@@@ -3984,22 -4017,7 +4021,7 @@@ pick_next_task(struct rq *rq, struct ta
        }
  
  restart:
- #ifdef CONFIG_SMP
-       /*
-        * We must do the balancing pass before put_next_task(), such
-        * that when we release the rq->lock the task is in the same
-        * state as before we took rq->lock.
-        *
-        * We can terminate the balance pass as soon as we know there is
-        * a runnable task of @class priority or higher.
-        */
-       for_class_range(class, prev->sched_class, &idle_sched_class) {
-               if (class->balance(rq, prev, rf))
-                       break;
-       }
- #endif
-       put_prev_task(rq, prev);
+       put_prev_task_balance(rq, prev, rf);
  
        for_each_class(class) {
                p = class->pick_next_task(rq);
@@@ -4689,7 -4707,7 +4711,7 @@@ int idle_cpu(int cpu
                return 0;
  
  #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                return 0;
  #endif
  
@@@ -6092,7 -6110,6 +6114,7 @@@ void init_idle(struct task_struct *idle
        idle->se.exec_start = sched_clock();
        idle->flags |= PF_IDLE;
  
 +      scs_task_reset(idle);
        kasan_unpoison_task_stack(idle);
  
  #ifdef CONFIG_SMP
@@@ -6243,13 -6260,14 +6265,14 @@@ void idle_task_exit(void
        struct mm_struct *mm = current->active_mm;
  
        BUG_ON(cpu_online(smp_processor_id()));
+       BUG_ON(current != this_rq()->idle);
  
        if (mm != &init_mm) {
                switch_mm(mm, &init_mm, current);
-               current->active_mm = &init_mm;
                finish_arch_post_lock_switch();
        }
-       mmdrop(mm);
+       /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
  }
  
  /*
@@@ -6539,7 -6557,6 +6562,6 @@@ int sched_cpu_dying(unsigned int cpu
        struct rq_flags rf;
  
        /* Handle pending wakeups and then migrate everything off */
-       sched_ttwu_pending();
        sched_tick_stop(cpu);
  
        rq_lock_irqsave(rq, &rf);
@@@ -6642,6 -6659,8 +6664,8 @@@ void __init sched_init(void
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
  
+               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                init_rt_rq(&rq->rt);
                init_dl_rq(&rq->dl);
  #ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                /*
                 * We achieve this by letting root_task_group's tasks sit
                 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
-               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_NO_HZ_COMMON
                rq->last_blocked_load_update_tick = jiffies;
                atomic_set(&rq->nohz_flags, 0);
+               rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
  #endif
  #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
@@@ -7438,6 -7457,8 +7462,8 @@@ static DEFINE_MUTEX(cfs_constraints_mut
  
  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
  static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+ /* More than 203 days if BW_SHIFT equals 20. */
+ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
  
  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
  
@@@ -7466,6 -7487,12 +7492,12 @@@ static int tg_set_cfs_bandwidth(struct 
                return -EINVAL;
  
        /*
+        * Bound quota to defend quota against overflow during bandwidth shift.
+        */
+       if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+               return -EINVAL;
+       /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
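
The max_cfs_runtime bound above is where the "More than 203 days" comment comes from: the user-supplied quota is capped so that the microsecond-denominated value shifted by BW_SHIFT in the schedulability check cannot overflow 64 bits. A quick worked check of the arithmetic, assuming the MAX_BW definition added to kernel/sched/sched.h elsewhere in this series (BW_SHIFT = 20, MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1, in microseconds):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Assumed from kernel/sched/sched.h in this series, not this hunk. */
        const unsigned int BW_SHIFT = 20;
        const uint64_t NSEC_PER_USEC = 1000;
        const uint64_t MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1;  /* microseconds */

        uint64_t max_cfs_runtime = MAX_BW * NSEC_PER_USEC;      /* nanoseconds  */
        double days = (double)MAX_BW / 1e6 / 86400.0;

        printf("max_cfs_runtime = %llu ns, i.e. ~%.1f days\n",
               (unsigned long long)max_cfs_runtime, days);
        /* Prints roughly 203.6 days, matching the comment in the hunk above. */
        return 0;
}
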
diff --combined kernel/smp.c
  #include <linux/hypervisor.h>
  
  #include "smpboot.h"
+ #include "sched/smp.h"
  
- enum {
-       CSD_FLAG_LOCK           = 0x01,
-       CSD_FLAG_SYNCHRONOUS    = 0x02,
- };
+ #define CSD_TYPE(_csd)        ((_csd)->flags & CSD_FLAG_TYPE_MASK)
  
  struct call_function_data {
        call_single_data_t      __percpu *csd;
@@@ -84,6 -82,7 +82,7 @@@ int smpcfd_dying_cpu(unsigned int cpu
         * still pending.
         */
        flush_smp_call_function_queue(false);
+       irq_work_run();
        return 0;
  }
  
@@@ -134,15 -133,33 +133,33 @@@ static __always_inline void csd_unlock(
  
  static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
  
+ void __smp_call_single_queue(int cpu, struct llist_node *node)
+ {
+       /*
+        * The list addition should be visible before sending the IPI
+        * handler locks the list to pull the entry off it because of
+        * normal cache coherency rules implied by spinlocks.
+        *
+        * If IPIs can go out of order to the cache coherency protocol
+        * in an architecture, sufficient synchronisation should be added
+        * to arch code to make it appear to obey cache coherency WRT
+        * locking and barrier primitives. Generic code isn't really
+        * equipped to do the right thing...
+        */
+       if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+               send_call_function_single_ipi(cpu);
+ }
  /*
   * Insert a previously allocated call_single_data_t element
   * for execution on the given CPU. data must already have
   * ->func, ->info, and ->flags set.
   */
- static int generic_exec_single(int cpu, call_single_data_t *csd,
-                              smp_call_func_t func, void *info)
+ static int generic_exec_single(int cpu, call_single_data_t *csd)
  {
        if (cpu == smp_processor_id()) {
+               smp_call_func_t func = csd->func;
+               void *info = csd->info;
                unsigned long flags;
  
                /*
                return 0;
        }
  
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }
  
-       csd->func = func;
-       csd->info = info;
-       /*
-        * The list addition should be visible before sending the IPI
-        * handler locks the list to pull the entry off it because of
-        * normal cache coherency rules implied by spinlocks.
-        *
-        * If IPIs can go out of order to the cache coherency protocol
-        * in an architecture, sufficient synchronisation should be added
-        * to arch code to make it appear to obey cache coherency WRT
-        * locking and barrier primitives. Generic code isn't really
-        * equipped to do the right thing...
-        */
-       if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-               arch_send_call_function_single_ipi(cpu);
+       __smp_call_single_queue(cpu, &csd->llist);
  
        return 0;
  }
@@@ -209,9 -210,9 +210,9 @@@ void generic_smp_call_function_single_i
   */
  static void flush_smp_call_function_queue(bool warn_cpu_offline)
  {
-       struct llist_head *head;
-       struct llist_node *entry;
        call_single_data_t *csd, *csd_next;
+       struct llist_node *entry, *prev;
+       struct llist_head *head;
        static bool warned;
  
        lockdep_assert_irqs_disabled();
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
-               llist_for_each_entry(csd, entry, llist)
-                       pr_warn("IPI callback %pS sent to offline CPU\n",
-                               csd->func);
+               llist_for_each_entry(csd, entry, llist) {
+                       switch (CSD_TYPE(csd)) {
+                       case CSD_TYPE_ASYNC:
+                       case CSD_TYPE_SYNC:
+                       case CSD_TYPE_IRQ_WORK:
+                               pr_warn("IPI callback %pS sent to offline CPU\n",
+                                       csd->func);
+                               break;
+                       case CSD_TYPE_TTWU:
+                               pr_warn("IPI task-wakeup sent to offline CPU\n");
+                               break;
+                       default:
+                               pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+                                       CSD_TYPE(csd));
+                               break;
+                       }
+               }
        }
  
+       /*
+        * First; run all SYNC callbacks, people are waiting for us.
+        */
+       prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-               smp_call_func_t func = csd->func;
-               void *info = csd->info;
                /* Do we wait until *after* callback? */
-               if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+               if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+                       smp_call_func_t func = csd->func;
+                       void *info = csd->info;
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
                        func(info);
                        csd_unlock(csd);
                } else {
-                       csd_unlock(csd);
-                       func(info);
+                       prev = &csd->llist;
                }
        }
  
+       if (!entry)
+               return;
        /*
-        * Handle irq works queued remotely by irq_work_queue_on().
-        * Smp functions above are typically synchronous so they
-        * better run first since some other CPUs may be busy waiting
-        * for them.
+        * Second; run all !SYNC callbacks.
         */
-       irq_work_run();
+       prev = NULL;
+       llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+               int type = CSD_TYPE(csd);
+               if (type != CSD_TYPE_TTWU) {
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
+                       if (type == CSD_TYPE_ASYNC) {
+                               smp_call_func_t func = csd->func;
+                               void *info = csd->info;
+                               csd_unlock(csd);
+                               func(info);
+                       } else if (type == CSD_TYPE_IRQ_WORK) {
+                               irq_work_single(csd);
+                       }
+               } else {
+                       prev = &csd->llist;
+               }
+       }
+       /*
+        * Third; only CSD_TYPE_TTWU is left, issue those.
+        */
+       if (entry)
+               sched_ttwu_pending(entry);
+ }
+ void flush_smp_call_function_from_idle(void)
+ {
+       unsigned long flags;
+       if (llist_empty(this_cpu_ptr(&call_single_queue)))
+               return;
+       local_irq_save(flags);
+       flush_smp_call_function_queue(true);
+       local_irq_restore(flags);
  }
  
  /*
@@@ -271,7 -339,7 +339,7 @@@ int smp_call_function_single(int cpu, s
  {
        call_single_data_t *csd;
        call_single_data_t csd_stack = {
-               .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+               .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
        };
        int this_cpu;
        int err;
                csd_lock(csd);
        }
  
-       err = generic_exec_single(cpu, csd, func, info);
+       csd->func = func;
+       csd->info = info;
+       err = generic_exec_single(cpu, csd);
  
        if (wait)
                csd_lock_wait(csd);
@@@ -351,7 -422,7 +422,7 @@@ int smp_call_function_single_async(int 
        csd->flags = CSD_FLAG_LOCK;
        smp_wmb();
  
-       err = generic_exec_single(cpu, csd, csd->func, csd->info);
+       err = generic_exec_single(cpu, csd);
  
  out:
        preempt_enable();
@@@ -466,7 -537,7 +537,7 @@@ static void smp_call_function_many_cond
  
                csd_lock(csd);
                if (wait)
-                       csd->flags |= CSD_FLAG_SYNCHRONOUS;
+                       csd->flags |= CSD_TYPE_SYNC;
                csd->func = func;
                csd->info = info;
                if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@@ -598,6 -669,24 +669,24 @@@ void __init smp_init(void
  {
        int num_nodes, num_cpus;
  
+       /*
+        * Ensure struct irq_work layout matches so that
+        * flush_smp_call_function_queue() can do horrible things.
+        */
+       BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+                    offsetof(struct __call_single_data, llist));
+       BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+                    offsetof(struct __call_single_data, func));
+       BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+                    offsetof(struct __call_single_data, flags));
+       /*
+        * Assert the CSD_TYPE_TTWU layout is similar enough
+        * for task_struct to be on the @call_single_queue.
+        */
+       BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+                    offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
        idle_threads_init();
        cpuhp_threads_init();
  
   * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
   * of local_irq_disable/enable().
   */
 -void on_each_cpu(void (*func) (void *info), void *info, int wait)
 +void on_each_cpu(smp_call_func_t func, void *info, int wait)
  {
        unsigned long flags;
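
For reference, the reworked flush_smp_call_function_queue() above depends on unlinking entries from a singly linked list while walking it: entries handled in the current pass are spliced out via a prev pointer, the rest stay linked for the next pass (SYNC first, then ASYNC and irq_work, finally the batched TTWU wakeups). A standalone sketch of that splice-while-iterating pattern, using plain pointers instead of llist_node and two passes instead of three:

#include <stdio.h>
#include <stddef.h>

struct entry {
        struct entry *next;
        int type;                       /* stand-in for CSD_TYPE(csd) */
};

/* Handle entries matching @type now; return the list of leftovers. */
static struct entry *run_type(struct entry *head, int type)
{
        struct entry *e = head, *next, *prev = NULL;

        while (e) {
                next = e->next;
                if (e->type == type) {
                        if (prev)
                                prev->next = next;  /* unlink from the middle */
                        else
                                head = next;        /* unlink from the front  */
                        printf("handled entry of type %d\n", e->type);
                } else {
                        prev = e;                   /* keep it for a later pass */
                }
                e = next;
        }
        return head;
}

int main(void)
{
        struct entry c = { NULL, 2 }, b = { &c, 1 }, a = { &b, 1 };
        struct entry *head = &a;

        head = run_type(head, 1);       /* first pass, e.g. the SYNC entries */
        head = run_type(head, 2);       /* later pass for everything else    */
        return 0;
}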