
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve
     scalability and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by removing rq->wake_list and replacing it
     with ->ttwu_pending (see the sketch below)

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first.

   - Misc fixes and enhancements"
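
For orientation, the rq->wake_list removal mentioned above boils down to two cooperating pieces: the waker pushes the task onto the target CPU's generic call_single_queue and sets a cheap rq->ttwu_pending hint, and the wakee drains that queue and activates the tasks locally. The following is only a minimal user-space model of that flow; struct task, struct rq and the two functions are stand-ins that mirror the kernel names, not code from this series.

/*
 * User-space model (not kernel code) of the reworked remote-wakeup path:
 * instead of a dedicated rq->wake_list, the waker pushes the task onto the
 * target CPU's generic queue and sets a cheap rq->ttwu_pending flag that
 * idle_cpu() can test without touching the list itself.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task {                                   /* stand-in for task_struct */
	const char *comm;
	struct task *wake_next;                 /* stand-in for p->wake_entry */
};

struct rq {                                     /* stand-in for struct rq */
	_Atomic(struct task *) call_single_queue;  /* llist head */
	atomic_int ttwu_pending;                /* racy "wakeups outstanding" hint */
};

/* Waker side: queue @p on @rq and mark the rq as having pending wakeups. */
static void ttwu_queue_wakelist(struct rq *rq, struct task *p)
{
	atomic_store_explicit(&rq->ttwu_pending, 1, memory_order_relaxed);

	/* lock-free llist_add(): push p onto the queue head */
	struct task *old = atomic_load_explicit(&rq->call_single_queue,
						memory_order_relaxed);
	do {
		p->wake_next = old;
	} while (!atomic_compare_exchange_weak(&rq->call_single_queue, &old, p));
	/* the real kernel would send (or elide) the IPI here */
}

/* Wakee side: roughly what sched_ttwu_pending() does when the IPI arrives. */
static void sched_ttwu_pending(struct rq *rq)
{
	/* llist_del_all(): grab the whole list in one exchange */
	struct task *p = atomic_exchange(&rq->call_single_queue, NULL);

	/* clear the hint after taking the list, so new wakeups re-set it */
	atomic_store_explicit(&rq->ttwu_pending, 0, memory_order_relaxed);

	for (; p; p = p->wake_next)
		printf("activating %s locally on the wakee CPU\n", p->comm);
}

int main(void)
{
	struct rq rq = { .call_single_queue = NULL, .ttwu_pending = 0 };
	struct task a = { .comm = "a" }, b = { .comm = "b" };

	ttwu_queue_wakelist(&rq, &a);
	ttwu_queue_wakelist(&rq, &b);
	sched_ttwu_pending(&rq);
	return 0;
}

The hint lets idle_cpu() test a single word (rq->ttwu_pending) instead of peeking at the llist, which is what the kernel/sched/core.c and kernel/sched/fair.c hunks below switch to.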

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if it the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...

21 files changed:
arch/powerpc/platforms/powernv/smp.c
include/linux/irq_work.h
include/linux/sched.h
include/linux/sched/mm.h
include/linux/sched/topology.h
include/linux/smp.h
include/linux/swait.h
kernel/cpu.c
kernel/exit.c
kernel/irq_work.c
kernel/sched/core.c
kernel/sched/cpuacct.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/idle.c
kernel/sched/pelt.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/smp.h [new file with mode: 0644]
kernel/sched/topology.c
kernel/smp.c

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e2516..b2ba3e9 100644
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
        /* Standard hot unplug procedure */
 
        idle_task_exit();
-       current->active_mm = NULL; /* for sanity */
        cpu = smp_processor_id();
        DBG("CPU%d offline\n", cpu);
        generic_set_cpu_dead(cpu);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 3b752e8..2735da5 100644
@@ -13,6 +13,8 @@
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
  */
 
+/* flags share CSD_FLAG_ space */
+
 #define IRQ_WORK_PENDING       BIT(0)
 #define IRQ_WORK_BUSY          BIT(1)
 
 
 #define IRQ_WORK_CLAIMED       (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
 
+/*
+ * structure shares layout with call_single_data_t.
+ */
 struct irq_work {
-       atomic_t flags;
        struct llist_node llnode;
+       atomic_t flags;
        void (*func)(struct irq_work *);
 };
 
@@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work);
 
 void irq_work_run(void);
 bool irq_work_needs_cpu(void);
+void irq_work_single(void *arg);
 #else
 static inline bool irq_work_needs_cpu(void) { return false; }
 static inline void irq_work_run(void) { }
+static inline void irq_work_single(void *arg) { }
 #endif
 
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33bb7c5..12938d4 100644
@@ -654,6 +654,7 @@ struct task_struct {
 
 #ifdef CONFIG_SMP
        struct llist_node               wake_entry;
+       unsigned int                    wake_entry_type;
        int                             on_cpu;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
        /* Current CPU: */
@@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
 })
 
 #ifdef CONFIG_SMP
-void scheduler_ipi(void);
+static __always_inline void scheduler_ipi(void)
+{
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       preempt_fold_need_resched();
+}
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
 static inline void scheduler_ipi(void) { }
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a..a132d87 100644
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
                __mmdrop(mm);
 }
 
+void mmdrop(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 95253ad..fb11091 100644
  */
 #ifdef CONFIG_SMP
 
-#define SD_LOAD_BALANCE                0x0001  /* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE     0x0002  /* Balance when about to become idle */
-#define SD_BALANCE_EXEC                0x0004  /* Balance on exec */
-#define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
-#define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
-#define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY    0x0040  /* Domain members have different CPU capacities */
-#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share CPU pkg resources */
-#define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
-#define SD_ASYM_PACKING                0x0800  /* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
-#define SD_NUMA                        0x4000  /* cross-node balancing */
+#define SD_BALANCE_NEWIDLE     0x0001  /* Balance when about to become idle */
+#define SD_BALANCE_EXEC                0x0002  /* Balance on exec */
+#define SD_BALANCE_FORK                0x0004  /* Balance on fork, clone */
+#define SD_BALANCE_WAKE                0x0008  /* Balance on wakeup */
+#define SD_WAKE_AFFINE         0x0010  /* Wake task to waking CPU */
+#define SD_ASYM_CPUCAPACITY    0x0020  /* Domain members have different CPU capacities */
+#define SD_SHARE_CPUCAPACITY   0x0040  /* Domain members share CPU capacity */
+#define SD_SHARE_POWERDOMAIN   0x0080  /* Domain members share power domain */
+#define SD_SHARE_PKG_RESOURCES 0x0100  /* Domain members share CPU pkg resources */
+#define SD_SERIALIZE           0x0200  /* Only a single load balancing instance */
+#define SD_ASYM_PACKING                0x0400  /* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING      0x0800  /* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP             0x1000  /* sched_domains of this level overlap */
+#define SD_NUMA                        0x2000  /* cross-node balancing */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 0401987..7ee202a 100644
 
 typedef void (*smp_call_func_t)(void *info);
 typedef bool (*smp_cond_func_t)(int cpu, void *info);
+
+enum {
+       CSD_FLAG_LOCK           = 0x01,
+
+       /* IRQ_WORK_flags */
+
+       CSD_TYPE_ASYNC          = 0x00,
+       CSD_TYPE_SYNC           = 0x10,
+       CSD_TYPE_IRQ_WORK       = 0x20,
+       CSD_TYPE_TTWU           = 0x30,
+       CSD_FLAG_TYPE_MASK      = 0xF0,
+};
+
+/*
+ * structure shares (partial) layout with struct irq_work
+ */
 struct __call_single_data {
        struct llist_node llist;
+       unsigned int flags;
        smp_call_func_t func;
        void *info;
-       unsigned int flags;
 };
 
 /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
 typedef struct __call_single_data call_single_data_t
        __aligned(sizeof(struct __call_single_data));
 
+/*
+ * Enqueue a llist_node on the call_single_queue; be very careful, read
+ * flush_smp_call_function_queue() in detail.
+ */
+extern void __smp_call_single_queue(int cpu, struct llist_node *node);
+
 /* total number of cpus in this system (may exceed NR_CPUS) */
 extern unsigned int total_cpus;
 
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 73e06e9..6a8c22b 100644
@@ -9,23 +9,10 @@
 #include <asm/current.h>
 
 /*
- * BROKEN wait-queues.
- *
- * These "simple" wait-queues are broken garbage, and should never be
- * used. The comments below claim that they are "similar" to regular
- * wait-queues, but the semantics are actually completely different, and
- * every single user we have ever had has been buggy (or pointless).
- *
- * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
- * "wake_up()" does, and has led to problems. In other cases, it has
- * been fine, because there's only ever one waiter (kvm), but in that
- * case gthe whole "simple" wait-queue is just pointless to begin with,
- * since there is no "queue". Use "wake_up_process()" with a direct
- * pointer instead.
- *
- * While these are very similar to regular wait queues (wait.h) the most
- * important difference is that the simple waitqueue allows for deterministic
- * behaviour -- IOW it has strictly bounded IRQ and lock hold times.
+ * Simple waitqueues are semantically very different to regular wait queues
+ * (wait.h). The most important difference is that the simple waitqueue allows
+ * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
  *
  * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all
  * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
@@ -39,7 +26,7 @@
  *    sleeper state.
  *
  *  - the !exclusive mode; because that leads to O(n) wakeups, everything is
- *    exclusive.
+ *    exclusive. As such swake_up_one will only ever awake _one_ waiter.
  *
  *  - custom wake callback functions; because you cannot give any guarantees
  *    about random code. This also allows swait to be used in RT, such that
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9f89214..6ff2578 100644
@@ -3,6 +3,7 @@
  *
  * This code is licenced under the GPL.
  */
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/smp.h>
 #include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
        return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+       struct task_struct *idle = idle_thread_get(cpu);
+       struct mm_struct *mm = idle->active_mm;
+
+       /*
+        * idle_task_exit() will have switched to &init_mm, now
+        * clean up any remaining active_mm state.
+        */
+       if (mm != &init_mm)
+               idle->active_mm = &init_mm;
+       mmdrop(mm);
+       return 0;
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_BRINGUP_CPU] = {
                .name                   = "cpu:bringup",
                .startup.single         = bringup_cpu,
-               .teardown.single        = NULL,
+               .teardown.single        = finish_cpu,
                .cant_stop              = true,
        },
        /* Final state before CPU kills itself */
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b772f2..c81805a 100644
@@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
        struct task_struct *tsk = current;
        int group_dead;
 
-       profile_task_exit(tsk);
-       kcov_task_exit(tsk);
+       /*
+        * We can get here from a kernel oops, sometimes with preemption off.
+        * Start by checking for critical errors.
+        * Then fix up important state like USER_DS and preemption.
+        * Then do everything else.
+        */
 
        WARN_ON(blk_needs_flush_plug(tsk));
 
@@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
         */
        set_fs(USER_DS);
 
+       if (unlikely(in_atomic())) {
+               pr_info("note: %s[%d] exited with preempt_count %d\n",
+                       current->comm, task_pid_nr(current),
+                       preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
+
+       profile_task_exit(tsk);
+       kcov_task_exit(tsk);
+
        ptrace_event(PTRACE_EVENT_EXIT, code);
 
        validate_creds_for_do_exit(tsk);
@@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
 
        exit_signals(tsk);  /* sets PF_EXITING */
 
-       if (unlikely(in_atomic())) {
-               pr_info("note: %s[%d] exited with preempt_count %d\n",
-                       current->comm, task_pid_nr(current),
-                       preempt_count());
-               preempt_count_set(PREEMPT_ENABLED);
-       }
-
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
                sync_mm_rss(tsk->mm);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 48b5d1b..eca8396 100644
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
 {
        int oflags;
 
-       oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
+       oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
        /*
         * If the work is already pending, no need to raise the IPI.
         * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
        if (cpu != smp_processor_id()) {
                /* Arch remote IPI send/receive backend aren't NMI safe */
                WARN_ON_ONCE(in_nmi());
-               if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
-                       arch_send_call_function_single_ipi(cpu);
+               __smp_call_single_queue(cpu, &work->llnode);
        } else {
                __irq_work_queue_local(work);
        }
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
        return true;
 }
 
+void irq_work_single(void *arg)
+{
+       struct irq_work *work = arg;
+       int flags;
+
+       /*
+        * Clear the PENDING bit, after this point the @work
+        * can be re-used.
+        * Make it immediately visible so that other CPUs trying
+        * to claim that work don't rely on us to handle their data
+        * while we are in the middle of the func.
+        */
+       flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+
+       lockdep_irq_work_enter(work);
+       work->func(work);
+       lockdep_irq_work_exit(work);
+       /*
+        * Clear the BUSY bit and return to the free state if
+        * no-one else claimed it meanwhile.
+        */
+       flags &= ~IRQ_WORK_PENDING;
+       (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
 static void irq_work_run_list(struct llist_head *list)
 {
        struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
                return;
 
        llnode = llist_del_all(list);
-       llist_for_each_entry_safe(work, tmp, llnode, llnode) {
-               int flags;
-               /*
-                * Clear the PENDING bit, after this point the @work
-                * can be re-used.
-                * Make it immediately visible so that other CPUs trying
-                * to claim that work don't rely on us to handle their data
-                * while we are in the middle of the func.
-                */
-               flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
-
-               lockdep_irq_work_enter(work);
-               work->func(work);
-               lockdep_irq_work_exit(work);
-               /*
-                * Clear the BUSY bit and return to the free state if
-                * no-one else claimed it meanwhile.
-                */
-               flags &= ~IRQ_WORK_PENDING;
-               (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
-       }
+       llist_for_each_entry_safe(work, tmp, llnode, llnode)
+               irq_work_single(work);
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0ae29fd..d766902 100644
@@ -21,6 +21,7 @@
 #include "../smpboot.h"
 
 #include "pelt.h"
+#include "smp.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
        update_rq_clock_task(rq, delta);
 }
 
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+       csd->flags = 0;
+       csd->func = func;
+       csd->info = rq;
+}
 
 #ifdef CONFIG_SCHED_HRTICK
 /*
@@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
 }
+
 #endif /* CONFIG_SMP */
 
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
-       rq->hrtick_csd.flags = 0;
-       rq->hrtick_csd.func = __hrtick_start;
-       rq->hrtick_csd.info = rq;
+       rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
 #endif
-
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
 }
@@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
                wake_up_idle_cpu(cpu);
 }
 
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
 {
-       int cpu = smp_processor_id();
-
-       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
-               return false;
-
-       if (idle_cpu(cpu) && !need_resched())
-               return true;
+       struct rq *rq = info;
+       int cpu = cpu_of(rq);
+       unsigned int flags;
 
        /*
-        * We can't run Idle Load Balance on this CPU for this time so we
-        * cancel it and clear NOHZ_BALANCE_KICK
+        * Release the rq::nohz_csd.
         */
-       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
-       return false;
-}
+       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+       WARN_ON(!(flags & NOHZ_KICK_MASK));
 
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
-       return false;
+       rq->idle_balance = idle_cpu(cpu);
+       if (rq->idle_balance && !need_resched()) {
+               rq->nohz_idle_balance = flags;
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+       }
 }
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -1540,7 +1540,7 @@ static int migration_cpu_stop(void *data)
         * __migrate_task() such that we will not miss enforcing cpus_ptr
         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
         */
-       sched_ttwu_pending();
+       flush_smp_call_function_from_idle();
 
        raw_spin_lock(&p->pi_lock);
        rq_lock(rq, &rf);
@@ -2274,16 +2274,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
 {
+       struct llist_node *llist = arg;
        struct rq *rq = this_rq();
-       struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p, *t;
        struct rq_flags rf;
 
        if (!llist)
                return;
 
+       /*
+        * rq::ttwu_pending is a racy indication of outstanding wakeups.
+        * Races such that false-negatives are possible, since they
+        * are shorter lived than false-positives would be.
+        */
+       WRITE_ONCE(rq->ttwu_pending, 0);
+
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 
@@ -2293,56 +2300,30 @@ void sched_ttwu_pending(void)
        rq_unlock_irqrestore(rq, &rf);
 }
 
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
 {
-       /*
-        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-        * TIF_NEED_RESCHED remotely (for the first time) will also send
-        * this IPI.
-        */
-       preempt_fold_need_resched();
-
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-               return;
-
-       /*
-        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-        * traditionally all their work was done from the interrupt return
-        * path. Now that we actually do some work, we need to make sure
-        * we do call them.
-        *
-        * Some archs already do call them, luckily irq_enter/exit nest
-        * properly.
-        *
-        * Arguably we should visit all archs and update all handlers,
-        * however a fair share of IPIs are still resched only so this would
-        * somewhat pessimize the simple resched case.
-        */
-       irq_enter();
-       sched_ttwu_pending();
+       struct rq *rq = cpu_rq(cpu);
 
-       /*
-        * Check if someone kicked us for doing the nohz idle load balance.
-        */
-       if (unlikely(got_nohz_idle_kick())) {
-               this_rq()->idle_balance = 1;
-               raise_softirq_irqoff(SCHED_SOFTIRQ);
-       }
-       irq_exit();
+       if (!set_nr_if_polling(rq->idle))
+               arch_send_call_function_single_ipi(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU, on receipt of the IPI, will queue the task
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
        struct rq *rq = cpu_rq(cpu);
 
        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
 
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-               if (!set_nr_if_polling(rq->idle))
-                       smp_send_reschedule(cpu);
-               else
-                       trace_sched_wake_idle_without_ipi(cpu);
-       }
+       WRITE_ONCE(rq->ttwu_pending, 1);
+       __smp_call_single_queue(cpu, &p->wake_entry);
 }
 
 void wake_up_if_idle(int cpu)
@@ -2373,6 +2354,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 {
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+       /*
+        * If the CPU does not share cache, then queue the task on the
+        * remote rq's wakelist to avoid accessing remote data.
+        */
+       if (!cpus_share_cache(smp_processor_id(), cpu))
+               return true;
+
+       /*
+        * If the task is descheduling and is the only running task on the
+        * CPU, then use the wakelist to offload the task activation to
+        * the soon-to-be-idle CPU, as the current CPU is likely busy.
+        * nr_running is checked to avoid unnecessary task stacking.
+        */
+       if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+               return true;
+
+       return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+               __ttwu_queue_wakelist(p, cpu, wake_flags);
+               return true;
+       }
+
+       return false;
+}
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2381,11 +2394,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        struct rq_flags rf;
 
 #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-               ttwu_queue_remote(p, cpu, wake_flags);
+       if (ttwu_queue_wakelist(p, cpu, wake_flags))
                return;
-       }
 #endif
 
        rq_lock(rq, &rf);
@@ -2569,7 +2579,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto unlock;
 
+       if (p->in_iowait) {
+               delayacct_blkio_end(p);
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
 #ifdef CONFIG_SMP
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
+
        /*
         * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
         * possible to, falsely, observe p->on_cpu == 0.
@@ -2593,6 +2611,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
+        * this task as prev, consider queueing p on the remote CPU's wake_list
+        * which potentially sends an IPI instead of spinning on p->on_cpu to
+        * let the waker make forward progress. This is safe because IRQs are
+        * disabled and the IPI will deliver after on_cpu is cleared.
+        */
+       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+               goto unlock;
+
+       /*
+        * If the owning (remote) CPU is still in the middle of schedule() with
         * this task as prev, wait until its done referencing the task.
         *
         * Pairs with the smp_store_release() in finish_task().
@@ -2602,28 +2630,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
 
-       p->sched_contributes_to_load = !!task_contributes_to_load(p);
-       p->state = TASK_WAKING;
-
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
-
        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                psi_ttwu_dequeue(p);
                set_task_cpu(p, cpu);
        }
-
-#else /* CONFIG_SMP */
-
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
-
 #endif /* CONFIG_SMP */
 
        ttwu_queue(p, cpu, wake_flags);
@@ -2751,6 +2763,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->capture_control = NULL;
 #endif
        init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+       p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -3951,6 +3966,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
        schedstat_inc(this_rq()->sched_count);
 }
 
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+                                 struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+       const struct sched_class *class;
+       /*
+        * We must do the balancing pass before put_prev_task(), such
+        * that when we release the rq->lock the task is in the same
+        * state as before we took rq->lock.
+        *
+        * We can terminate the balance pass as soon as we know there is
+        * a runnable task of @class priority or higher.
+        */
+       for_class_range(class, prev->sched_class, &idle_sched_class) {
+               if (class->balance(rq, prev, rf))
+                       break;
+       }
+#endif
+
+       put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
@@ -3984,22 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        }
 
 restart:
-#ifdef CONFIG_SMP
-       /*
-        * We must do the balancing pass before put_next_task(), such
-        * that when we release the rq->lock the task is in the same
-        * state as before we took rq->lock.
-        *
-        * We can terminate the balance pass as soon as we know there is
-        * a runnable task of @class priority or higher.
-        */
-       for_class_range(class, prev->sched_class, &idle_sched_class) {
-               if (class->balance(rq, prev, rf))
-                       break;
-       }
-#endif
-
-       put_prev_task(rq, prev);
+       put_prev_task_balance(rq, prev, rf);
 
        for_each_class(class) {
                p = class->pick_next_task(rq);
@@ -4689,7 +4711,7 @@ int idle_cpu(int cpu)
                return 0;
 
 #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                return 0;
 #endif
 
@@ -6243,13 +6265,14 @@ void idle_task_exit(void)
        struct mm_struct *mm = current->active_mm;
 
        BUG_ON(cpu_online(smp_processor_id()));
+       BUG_ON(current != this_rq()->idle);
 
        if (mm != &init_mm) {
                switch_mm(mm, &init_mm, current);
-               current->active_mm = &init_mm;
                finish_arch_post_lock_switch();
        }
-       mmdrop(mm);
+
+       /* finish_cpu(), as run on the BP, will clean up the active_mm state */
 }
 
 /*
@@ -6539,7 +6562,6 @@ int sched_cpu_dying(unsigned int cpu)
        struct rq_flags rf;
 
        /* Handle pending wakeups and then migrate everything off */
-       sched_ttwu_pending();
        sched_tick_stop(cpu);
 
        rq_lock_irqsave(rq, &rf);
@@ -6642,6 +6664,8 @@ void __init sched_init(void)
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
+               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6694,7 +6718,6 @@ void __init sched_init(void)
                init_rt_rq(&rq->rt);
                init_dl_rq(&rq->dl);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                /*
@@ -6716,7 +6739,6 @@ void __init sched_init(void)
                 * We achieve this by letting root_task_group's tasks sit
                 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
-               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -6744,6 +6766,8 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ_COMMON
                rq->last_blocked_load_update_tick = jiffies;
                atomic_set(&rq->nohz_flags, 0);
+
+               rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
 #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
@@ -7438,6 +7462,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
 
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
@@ -7466,6 +7492,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
                return -EINVAL;
 
        /*
+        * Bound quota to defend against overflow during bandwidth shift.
+        */
+       if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+               return -EINVAL;
+
+       /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
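
About the new max_cfs_runtime bound at the end of the kernel/sched/core.c hunks (and the matching max_rt_runtime bound in kernel/sched/rt.c further down): the "203 days" and "4 hours" figures in the comments work out if MAX_BW is (1ULL << (64 - BW_SHIFT)) - 1 with BW_SHIFT == 20 — an assumption here, since that definition lives in kernel/sched/sched.h, which is in the changed-file list but not shown on this page. A quick stand-alone check:

/*
 * Arithmetic check of the "More than 203 days" (cfs) and "More than 4 hours"
 * (rt) comments; assumes MAX_BW == (1ULL << (64 - BW_SHIFT)) - 1, which is
 * not visible in the hunks above.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned BW_SHIFT = 20;
	const uint64_t MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1;  /* assumed */

	/* cfs: quota bound is MAX_BW * NSEC_PER_USEC ns, i.e. MAX_BW us */
	double cfs_days = (double)MAX_BW / 1e6 / 3600 / 24;
	/* rt: runtime bound is MAX_BW ns */
	double rt_hours = (double)MAX_BW / 1e9 / 3600;

	printf("max cfs quota   ~ %.1f days\n", cfs_days);   /* ~203.6 days */
	printf("max rt runtime  ~ %.1f hours\n", rt_hours);  /* ~4.9 hours  */
	return 0;
}

2^44 microseconds is roughly 203.7 days and 2^44 nanoseconds roughly 4.9 hours, matching both comments.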
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb103..941c28c 100644
@@ -5,6 +5,7 @@
  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  * (balbir@in.ibm.com).
  */
+#include <asm/irq_regs.h>
 #include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
        int index = CPUACCT_STAT_SYSTEM;
-       struct pt_regs *regs = task_pt_regs(tsk);
+       struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
 
        if (regs && user_mode(regs))
                index = CPUACCT_STAT_USER;
@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        rcu_read_lock();
 
        for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-               this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+               __this_cpu_add(ca->cpuusage->usages[index], cputime);
 
        rcu_read_unlock();
 }
@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 
        rcu_read_lock();
        for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
-               this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+               __this_cpu_add(ca->cpustat->cpustat[index], val);
        rcu_read_unlock();
 }
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 239970b..36c5426 100644
@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
        set_table_entry(&table[2], "busy_factor",         &sd->busy_factor,         sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[3], "imbalance_pct",       &sd->imbalance_pct,       sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[4], "cache_nice_tries",    &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0644, proc_dointvec_minmax);
+       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, proc_dointvec_minmax);
        set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
        set_table_entry(&table[7], "name",                sd->name,            CORENAME_MAX_SIZE, 0444, proc_dostring);
        /* &table[8] is terminator */
@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        else
                SEQ_printf(m, " %c", task_state_to_char(p));
 
-       SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+       SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
                p->comm, task_pid_nr(p),
                SPLIT_NS(p->se.vruntime),
                (long long)(p->nvcsw + p->nivcsw),
@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
        SEQ_printf(m, "\n");
        SEQ_printf(m, "runnable tasks:\n");
-       SEQ_printf(m, " S           task   PID         tree-key  switches  prio"
+       SEQ_printf(m, " S            task   PID         tree-key  switches  prio"
                   "     wait-time             sum-exec        sum-sleep\n");
        SEQ_printf(m, "-------------------------------------------------------"
-                  "----------------------------------------------------\n");
+                  "------------------------------------------------------\n");
 
        rcu_read_lock();
        for_each_process_thread(g, p) {
@@ -638,7 +638,6 @@ do {                                                                        \
 
        P(nr_running);
        P(nr_switches);
-       P(nr_load_updates);
        P(nr_uninterruptible);
        PN(next_balance);
        SEQ_printf(m, "  .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b5..0ed04d2 100644
@@ -191,7 +191,7 @@ static void update_sysctl(void)
 #undef SET_SYSCTL
 }
 
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
 {
        update_sysctl();
 }
@@ -1094,7 +1094,7 @@ struct numa_group {
         * more by CPU use than by memory faults.
         */
        unsigned long *faults_cpu;
-       unsigned long faults[0];
+       unsigned long faults[];
 };
 
 /*
@@ -3441,52 +3441,46 @@ static inline void
 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
        /* Nothing to update */
        if (!delta)
                return;
 
-       /*
-        * The relation between sum and avg is:
-        *
-        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
-        *
-        * however, the PELT windows are not aligned between grq and gse.
-        */
-
        /* Set new sched_entity's utilization */
        se->avg.util_avg = gcfs_rq->avg.util_avg;
-       se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+       se->avg.util_sum = se->avg.util_avg * divider;
 
        /* Update parent cfs_rq utilization */
        add_positive(&cfs_rq->avg.util_avg, delta);
-       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
 }
 
 static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
        long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
        /* Nothing to update */
        if (!delta)
                return;
 
-       /*
-        * The relation between sum and avg is:
-        *
-        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
-        *
-        * however, the PELT windows are not aligned between grq and gse.
-        */
-
        /* Set new sched_entity's runnable */
        se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
-       se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+       se->avg.runnable_sum = se->avg.runnable_avg * divider;
 
        /* Update parent cfs_rq runnable */
        add_positive(&cfs_rq->avg.runnable_avg, delta);
-       cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+       cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
 }
 
 static inline void
@@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
        unsigned long load_avg;
        u64 load_sum = 0;
        s64 delta_sum;
+       u32 divider;
 
        if (!runnable_sum)
                return;
 
        gcfs_rq->prop_runnable_sum = 0;
 
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
        if (runnable_sum >= 0) {
                /*
                 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
                 * the CPU is saturated running == runnable.
                 */
                runnable_sum += se->avg.load_sum;
-               runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+               runnable_sum = min_t(long, runnable_sum, divider);
        } else {
                /*
                 * Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
        runnable_sum = max(runnable_sum, running_sum);
 
        load_sum = (s64)se_weight(se) * runnable_sum;
-       load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+       load_avg = div_s64(load_sum, divider);
 
        delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
        delta_avg = load_avg - se->avg.load_avg;
@@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
        u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
        /*
@@ -3873,6 +3878,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
        return cfs_rq->avg.load_avg;
 }
 
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
 static inline unsigned long task_util(struct task_struct *p)
 {
        return READ_ONCE(p->se.avg.util_avg);
@@ -4054,7 +4061,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
 {
        return 0;
 }
@@ -4588,16 +4595,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 }
 
 /* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+                                  struct cfs_rq *cfs_rq, u64 target_runtime)
 {
-       struct task_group *tg = cfs_rq->tg;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount;
+       u64 min_amount, amount = 0;
+
+       lockdep_assert_held(&cfs_b->lock);
 
        /* note: this is a positive sum as runtime_remaining <= 0 */
-       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+       min_amount = target_runtime - cfs_rq->runtime_remaining;
 
-       raw_spin_lock(&cfs_b->lock);
        if (cfs_b->quota == RUNTIME_INF)
                amount = min_amount;
        else {
@@ -4609,13 +4616,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                        cfs_b->idle = 0;
                }
        }
-       raw_spin_unlock(&cfs_b->lock);
 
        cfs_rq->runtime_remaining += amount;
 
        return cfs_rq->runtime_remaining > 0;
 }
 
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       int ret;
+
+       raw_spin_lock(&cfs_b->lock);
+       ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+       raw_spin_unlock(&cfs_b->lock);
+
+       return ret;
+}
+
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        /* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4723,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
        return 0;
 }
 
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
        struct rq *rq = rq_of(cfs_rq);
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        long task_delta, idle_task_delta, dequeue = 1;
-       bool empty;
+
+       raw_spin_lock(&cfs_b->lock);
+       /* This will start the period timer if necessary */
+       if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+               /*
+                * We have raced with bandwidth becoming available, and if we
+                * actually throttled the timer might not unthrottle us for an
+                * entire period. We additionally needed to make sure that any
+                * subsequent check_cfs_rq_runtime calls agree not to throttle
+                * us, as we may commit to do cfs put_prev+pick_next, so we ask
+                * for 1ns of runtime rather than just check cfs_b.
+                */
+               dequeue = 0;
+       } else {
+               list_add_tail_rcu(&cfs_rq->throttled_list,
+                                 &cfs_b->throttled_cfs_rq);
+       }
+       raw_spin_unlock(&cfs_b->lock);
+
+       if (!dequeue)
+               return false;  /* Throttle no longer required. */
 
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -4744,29 +4783,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        if (!se)
                sub_nr_running(rq, task_delta);
 
-       cfs_rq->throttled = 1;
-       cfs_rq->throttled_clock = rq_clock(rq);
-       raw_spin_lock(&cfs_b->lock);
-       empty = list_empty(&cfs_b->throttled_cfs_rq);
-
        /*
-        * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
-        * not running add to the tail so that later runqueues don't get starved.
+        * Note: distribution will already see us throttled via the
+        * throttled-list.  rq->lock protects completion.
         */
-       if (cfs_b->distribute_running)
-               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       else
-               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
-       /*
-        * If we're the first throttled task, make sure the bandwidth
-        * timer is running.
-        */
-       if (empty)
-               start_cfs_bandwidth(cfs_b);
-
-       raw_spin_unlock(&cfs_b->lock);
+       cfs_rq->throttled = 1;
+       cfs_rq->throttled_clock = rq_clock(rq);
+       return true;
 }
 
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4933,14 +4956,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
        /*
         * This check is repeated as we release cfs_b->lock while we unthrottle.
         */
-       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
-               cfs_b->distribute_running = 1;
+       while (throttled && cfs_b->runtime > 0) {
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
                distribute_cfs_runtime(cfs_b);
                raw_spin_lock_irqsave(&cfs_b->lock, flags);
 
-               cfs_b->distribute_running = 0;
                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        }
 
@@ -5054,10 +5075,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        /* confirm we're still not at a refresh boundary */
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
        cfs_b->slack_started = false;
-       if (cfs_b->distribute_running) {
-               raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
-               return;
-       }
 
        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -5067,9 +5084,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                runtime = cfs_b->runtime;
 
-       if (runtime)
-               cfs_b->distribute_running = 1;
-
        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 
        if (!runtime)
@@ -5078,7 +5092,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        distribute_cfs_runtime(cfs_b);
 
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       cfs_b->distribute_running = 0;
        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }
 
@@ -5139,8 +5152,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        if (cfs_rq_throttled(cfs_rq))
                return true;
 
-       throttle_cfs_rq(cfs_rq);
-       return true;
+       return throttle_cfs_rq(cfs_rq);
 }
 
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -5170,6 +5182,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                if (!overrun)
                        break;
 
+               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
                if (++count > 3) {
                        u64 new, old = ktime_to_ns(cfs_b->period);
 
@@ -5199,8 +5213,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                        /* reset count so we don't come right back in here */
                        count = 0;
                }
-
-               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
        }
        if (idle)
                cfs_b->period_active = 0;
@@ -5221,7 +5233,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        cfs_b->period_timer.function = sched_cfs_period_timer;
        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        cfs_b->slack_timer.function = sched_cfs_slack_timer;
-       cfs_b->distribute_running = 0;
        cfs_b->slack_started = false;
 }
 
@@ -5506,28 +5517,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                        list_add_leaf_cfs_rq(cfs_rq);
        }
 
-enqueue_throttle:
-       if (!se) {
-               add_nr_running(rq, 1);
-               /*
-                * Since new tasks are assigned an initial util_avg equal to
-                * half of the spare capacity of their CPU, tiny tasks have the
-                * ability to cross the overutilized threshold, which will
-                * result in the load balancer ruining all the task placement
-                * done by EAS. As a way to mitigate that effect, do not account
-                * for the first enqueue operation of new tasks during the
-                * overutilized flag detection.
-                *
-                * A better way of solving this problem would be to wait for
-                * the PELT signals of tasks to converge before taking them
-                * into account, but that is not straightforward to implement,
-                * and the following generally works well enough in practice.
-                */
-               if (flags & ENQUEUE_WAKEUP)
-                       update_overutilized_status(rq);
+       /* At this point se is NULL and we are at root level */
+       add_nr_running(rq, 1);
 
-       }
+       /*
+        * Since new tasks are assigned an initial util_avg equal to
+        * half of the spare capacity of their CPU, tiny tasks have the
+        * ability to cross the overutilized threshold, which will
+        * result in the load balancer ruining all the task placement
+        * done by EAS. As a way to mitigate that effect, do not account
+        * for the first enqueue operation of new tasks during the
+        * overutilized flag detection.
+        *
+        * A better way of solving this problem would be to wait for
+        * the PELT signals of tasks to converge before taking them
+        * into account, but that is not straightforward to implement,
+        * and the following generally works well enough in practice.
+        */
+       if (flags & ENQUEUE_WAKEUP)
+               update_overutilized_status(rq);
 
+enqueue_throttle:
        if (cfs_bandwidth_used()) {
                /*
                 * When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5737,7 +5747,7 @@ static int wake_wide(struct task_struct *p)
 {
        unsigned int master = current->wakee_flips;
        unsigned int slave = p->wakee_flips;
-       int factor = this_cpu_read(sd_llc_size);
+       int factor = __this_cpu_read(sd_llc_size);
 
        if (master < slave)
                swap(master, slave);
@@ -5846,8 +5856,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 }
 
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag);
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
 
 /*
  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5930,7 +5939,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                        continue;
                }
 
-               group = find_idlest_group(sd, p, cpu, sd_flag);
+               group = find_idlest_group(sd, p, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
@@ -6671,9 +6680,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
-               if (!(tmp->flags & SD_LOAD_BALANCE))
-                       break;
-
                /*
                 * If both 'cpu' and 'prev_cpu' are part of this domain,
                 * cpu is a valid SD_WAKE_AFFINE target.
@@ -8584,7 +8590,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
         */
 
 #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                return 0;
 #endif
 
@@ -8702,8 +8708,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * Assumes p is allowed on at least one CPU in sd.
  */
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 {
        struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
        struct sg_lb_stats local_sgs, tmp_sgs;
@@ -9434,7 +9439,7 @@ static int active_load_balance_cpu_stop(void *data);
 static int should_we_balance(struct lb_env *env)
 {
        struct sched_group *sg = env->sd->groups;
-       int cpu, balance_cpu = -1;
+       int cpu;
 
        /*
         * Ensure the balancing environment is consistent; can happen
@@ -9455,18 +9460,12 @@ static int should_we_balance(struct lb_env *env)
                if (!idle_cpu(cpu))
                        continue;
 
-               balance_cpu = cpu;
-               break;
+               /* Are we the first idle CPU? */
+               return cpu == env->dst_cpu;
        }
 
-       if (balance_cpu == -1)
-               balance_cpu = group_balance_cpu(sg);
-
-       /*
-        * First idle CPU or the first CPU(busiest) in this sched group
-        * is eligible for doing load balancing at this and above domains.
-        */
-       return balance_cpu == env->dst_cpu;
+       /* Are we the first CPU of this group? */
+       return group_balance_cpu(sg) == env->dst_cpu;
 }
 
 /*
@@ -9819,9 +9818,8 @@ static int active_load_balance_cpu_stop(void *data)
        /* Search for an sd spanning us and the target CPU. */
        rcu_read_lock();
        for_each_domain(target_cpu, sd) {
-               if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
-                               break;
+               if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                       break;
        }
 
        if (likely(sd)) {
@@ -9910,9 +9908,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                }
                max_cost += sd->max_newidle_lb_cost;
 
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
                /*
                 * Stop the load balance at this level. There is another
                 * CPU in our sched group which is doing load balancing more
@@ -10029,17 +10024,20 @@ static void kick_ilb(unsigned int flags)
        if (ilb_cpu >= nr_cpu_ids)
                return;
 
+       /*
+        * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+        * the first flag owns it; cleared by nohz_csd_func().
+        */
        flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
        if (flags & NOHZ_KICK_MASK)
                return;
 
        /*
-        * Use smp_send_reschedule() instead of resched_cpu().
-        * This way we generate a sched IPI on the target CPU which
+        * This way we generate an IPI on the target CPU which
         * is idle. And the softirq performing nohz idle load balance
         * will be run before returning from the IPI.
         */
-       smp_send_reschedule(ilb_cpu);
+       smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
 }
 
 /*
@@ -10377,20 +10375,14 @@ abort:
  */
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-       int this_cpu = this_rq->cpu;
-       unsigned int flags;
+       unsigned int flags = this_rq->nohz_idle_balance;
 
-       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+       if (!flags)
                return false;
 
-       if (idle != CPU_IDLE) {
-               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-               return false;
-       }
+       this_rq->nohz_idle_balance = 0;
 
-       /* could be _relaxed() */
-       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-       if (!(flags & NOHZ_KICK_MASK))
+       if (idle != CPU_IDLE)
                return false;
 
        _nohz_idle_balance(this_rq, flags, idle);
@@ -10450,7 +10442,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
  *     0 - failed, no new tasks
  *   > 0 - success, new (fair) tasks present
  */
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
        unsigned long next_balance = jiffies + HZ;
        int this_cpu = this_rq->cpu;
@@ -10501,9 +10493,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
                int continue_balancing = 1;
                u64 t0, domain_cost;
 
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
                        update_next_balance(sd, &next_balance);
                        break;
index b743bf3..05deb81 100644 (file)
@@ -289,7 +289,11 @@ static void do_idle(void)
         */
        smp_mb__after_atomic();
 
-       sched_ttwu_pending();
+       /*
+        * RCU relies on this call to be done outside of an RCU read-side
+        * critical section.
+        */
+       flush_smp_call_function_from_idle();
        schedule_idle();
 
        if (unlikely(klp_patch_pending(current)))
index b647d04..b4b1ff9 100644 (file)
@@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
        return 1;
 }
 
+/*
+ * When syncing *_avg with *_sum, we must take into account the current
+ * position in the PELT segment; otherwise the remaining part of the segment
+ * will be accounted as idle time even though it has not yet elapsed, which
+ * generates unwanted oscillation in the range [1002..1024[.
+ *
+ * The max value of *_sum varies with the position in the time segment and
+ * equals:
+ *
+ *   LOAD_AVG_MAX*y + sa->period_contrib
+ *
+ * which can be simplified into:
+ *
+ *   LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * because LOAD_AVG_MAX*y == LOAD_AVG_MAX - 1024
+ *
+ * The same care must be taken when a sched entity is added to, updated in,
+ * or removed from a cfs_rq and we need to update its sched_avg. Scheduler
+ * entities and the cfs_rq to which they are attached share the same position
+ * in the time segment because they use the same clock. This means we can use
+ * the period_contrib of the cfs_rq when updating the sched_avg of a
+ * sched_entity, if that is more convenient.
+ */
 static __always_inline void
 ___update_load_avg(struct sched_avg *sa, unsigned long load)
 {
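
In code, the bound derived above becomes the divisor used when folding a *_sum into a *_avg. A small illustrative helper with a hypothetical name (the kernel open-codes the same expression in the paths that update *_avg):

/*
 * Maximum value a *_sum can have reached at the current position inside the
 * 1024us segment; dividing by this instead of LOAD_AVG_MAX avoids treating
 * the not-yet-elapsed remainder of the segment as idle time.
 */
static inline u32 pelt_divider(const struct sched_avg *sa)
{
	return LOAD_AVG_MAX - 1024 + sa->period_contrib;
}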
index df11d88..6d60ba2 100644 (file)
@@ -9,6 +9,8 @@
 
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
        if (rt_period == 0)
                return -EINVAL;
 
+       /*
+        * Bound the quota to defend against overflow during the bandwidth shift.
+        */
+       if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+               return -EINVAL;
+
        mutex_lock(&rt_constraints_mutex);
        err = __rt_schedulable(tg, rt_period, rt_runtime);
        if (err)
@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
                return -EINVAL;
 
        if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
-               (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+               ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+                ((u64)sysctl_sched_rt_runtime *
+                       NSEC_PER_USEC > max_rt_runtime)))
                return -EINVAL;
 
        return 0;
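
The "more than 4 hours" note next to max_rt_runtime above is straightforward to check: with BW_SHIFT == 20, MAX_BW == 2^(64-20) - 1 ns, roughly 1.76 * 10^13 ns, 17,592 s, or about 4.9 hours, the largest runtime whose << BW_SHIFT scaling still fits in 64 bits. The same bound guards tg_set_rt_bandwidth(), and sched_rt_global_validate() applies it after converting the sysctl value from microseconds to nanoseconds. A throwaway userspace check of the arithmetic (not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned int bw_shift = 20;			/* BW_SHIFT      */
	uint64_t max_bw = (1ULL << (64 - bw_shift)) - 1;	/* MAX_BW, in ns */

	printf("%llu ns = %llu s = %.2f h\n",
	       (unsigned long long)max_bw,
	       (unsigned long long)(max_bw / 1000000000ULL),
	       (double)max_bw / 3.6e12);
	return 0;
}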
index db3a576..1d4e94c 100644 (file)
@@ -349,7 +349,6 @@ struct cfs_bandwidth {
 
        u8                      idle;
        u8                      period_active;
-       u8                      distribute_running;
        u8                      slack_started;
        struct hrtimer          period_timer;
        struct hrtimer          slack_timer;
@@ -890,12 +889,15 @@ struct rq {
 #ifdef CONFIG_SMP
        unsigned long           last_blocked_load_update_tick;
        unsigned int            has_blocked_load;
+       call_single_data_t      nohz_csd;
 #endif /* CONFIG_SMP */
        unsigned int            nohz_tick_stopped;
-       atomic_t nohz_flags;
+       atomic_t                nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 
-       unsigned long           nr_load_updates;
+#ifdef CONFIG_SMP
+       unsigned int            ttwu_pending;
+#endif
        u64                     nr_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
@@ -951,6 +953,7 @@ struct rq {
 
        struct callback_head    *balance_callback;
 
+       unsigned char           nohz_idle_balance;
        unsigned char           idle_balance;
 
        unsigned long           misfit_task_load;
@@ -979,7 +982,7 @@ struct rq {
 
        /* This is used to determine avg_idle's max value */
        u64                     max_idle_balance_cost;
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64                     prev_irq_time;
@@ -1020,10 +1023,6 @@ struct rq {
        unsigned int            ttwu_local;
 #endif
 
-#ifdef CONFIG_SMP
-       struct llist_head       wake_list;
-#endif
-
 #ifdef CONFIG_CPU_IDLE
        /* Must be inspected within an RCU lock section */
        struct cpuidle_state    *idle_state;
@@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
        rq->balance_callback = head;
 }
 
-extern void sched_ttwu_pending(void);
-
 #define rcu_dereference_check_sched_domain(p) \
        rcu_dereference_check((p), \
                              lockdep_is_held(&sched_domains_mutex))
@@ -1461,7 +1458,7 @@ struct sched_group {
         * by attaching extra space to the end of the structure,
         * depending on how many CPUs the kernel has booted up with)
         */
-       unsigned long           cpumask[0];
+       unsigned long           cpumask[];
 };
 
 static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void)
 }
 #endif
 
-extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
-
-#else
-
-static inline void sched_ttwu_pending(void) { }
+extern void flush_smp_call_function_from_idle(void);
 
-static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
-
-#endif /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
 
 #include "stats.h"
 #include "autogroup.h"
@@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
  */
 #define WF_SYNC                        0x01            /* Waker goes to sleep after wakeup */
 #define WF_FORK                        0x02            /* Child wakeup after fork */
-#define WF_MIGRATED            0x4             /* Internal use, task got migrated */
+#define WF_MIGRATED            0x04            /* Internal use, task got migrated */
+#define WF_ON_RQ               0x08            /* Wakee is on_rq */
 
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
 #define BW_SHIFT               20
 #define BW_UNIT                        (1 << BW_SHIFT)
 #define RATIO_SHIFT            8
+#define MAX_BW_BITS            (64 - BW_SHIFT)
+#define MAX_BW                 ((1ULL << MAX_BW_BITS) - 1)
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
new file mode 100644 (file)
index 0000000..9620e32
--- /dev/null
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler-internal SMP callback types and methods shared between the
+ * scheduler and other internal parts of the core kernel:
+ */
+
+extern void sched_ttwu_pending(void *arg);
+
+extern void send_call_function_single_ipi(int cpu);
index 8344757..1d7b446 100644 (file)
@@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
        cpumask_clear(groupmask);
 
        printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-       if (!(sd->flags & SD_LOAD_BALANCE)) {
-               printk("does not load-balance\n");
-               if (sd->parent)
-                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-               return -1;
-       }
-
        printk(KERN_CONT "span=%*pbl level=%s\n",
               cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
@@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd)
                return 1;
 
        /* Following flags need at least 2 groups */
-       if (sd->flags & (SD_LOAD_BALANCE |
-                        SD_BALANCE_NEWIDLE |
+       if (sd->flags & (SD_BALANCE_NEWIDLE |
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
                         SD_SHARE_CPUCAPACITY |
@@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
        /* Flags needing groups don't count if only 1 group in parent */
        if (parent->groups == parent->groups->next) {
-               pflags &= ~(SD_LOAD_BALANCE |
-                               SD_BALANCE_NEWIDLE |
-                               SD_BALANCE_FORK |
-                               SD_BALANCE_EXEC |
-                               SD_ASYM_CPUCAPACITY |
-                               SD_SHARE_CPUCAPACITY |
-                               SD_SHARE_PKG_RESOURCES |
-                               SD_PREFER_SIBLING |
-                               SD_SHARE_POWERDOMAIN);
+               pflags &= ~(SD_BALANCE_NEWIDLE |
+                           SD_BALANCE_FORK |
+                           SD_BALANCE_EXEC |
+                           SD_ASYM_CPUCAPACITY |
+                           SD_SHARE_CPUCAPACITY |
+                           SD_SHARE_PKG_RESOURCES |
+                           SD_PREFER_SIBLING |
+                           SD_SHARE_POWERDOMAIN);
                if (nr_node_ids == 1)
                        pflags &= ~SD_SERIALIZE;
        }
@@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl,
 
                .cache_nice_tries       = 0,
 
-               .flags                  = 1*SD_LOAD_BALANCE
-                                       | 1*SD_BALANCE_NEWIDLE
+               .flags                  = 1*SD_BALANCE_NEWIDLE
                                        | 1*SD_BALANCE_EXEC
                                        | 1*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
index 8430319..472c2b2 100644 (file)
 #include <linux/hypervisor.h>
 
 #include "smpboot.h"
+#include "sched/smp.h"
 
-enum {
-       CSD_FLAG_LOCK           = 0x01,
-       CSD_FLAG_SYNCHRONOUS    = 0x02,
-};
+#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
 
 struct call_function_data {
        call_single_data_t      __percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
         * still pending.
         */
        flush_smp_call_function_queue(false);
+       irq_work_run();
        return 0;
 }
 
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+       /*
+        * The list addition should be visible before sending the IPI
+        * handler locks the list to pull the entry off it because of
+        * normal cache coherency rules implied by spinlocks.
+        *
+        * If IPIs can go out of order with respect to the cache coherency
+        * protocol on an architecture, sufficient synchronisation should be added
+        * to arch code to make it appear to obey cache coherency WRT
+        * locking and barrier primitives. Generic code isn't really
+        * equipped to do the right thing...
+        */
+       if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+               send_call_function_single_ipi(cpu);
+}
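
A detail the comment above takes for granted: llist_add() returns true only when the list was empty before the insertion, so at most one IPI is raised per batch of queued entries; later producers find a non-empty list and simply rely on the interrupt already on its way. A stripped-down sketch of the pattern (hypothetical helper, not the kernel function):

/* First-enqueuer-kicks: only the producer that finds the queue empty pays
 * for the IPI; every later producer piggy-backs on that same interrupt. */
static void queue_and_maybe_kick(struct llist_head *queue,
				 struct llist_node *node, int cpu)
{
	if (llist_add(node, queue))		/* true iff queue was empty */
		send_call_function_single_ipi(cpu);
}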
+
 /*
  * Insert a previously allocated call_single_data_t element
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, call_single_data_t *csd,
-                              smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
        if (cpu == smp_processor_id()) {
+               smp_call_func_t func = csd->func;
+               void *info = csd->info;
                unsigned long flags;
 
                /*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
                return 0;
        }
 
-
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }
 
-       csd->func = func;
-       csd->info = info;
-
-       /*
-        * The list addition should be visible before sending the IPI
-        * handler locks the list to pull the entry off it because of
-        * normal cache coherency rules implied by spinlocks.
-        *
-        * If IPIs can go out of order to the cache coherency protocol
-        * in an architecture, sufficient synchronisation should be added
-        * to arch code to make it appear to obey cache coherency WRT
-        * locking and barrier primitives. Generic code isn't really
-        * equipped to do the right thing...
-        */
-       if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-               arch_send_call_function_single_ipi(cpu);
+       __smp_call_single_queue(cpu, &csd->llist);
 
        return 0;
 }
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
  */
 static void flush_smp_call_function_queue(bool warn_cpu_offline)
 {
-       struct llist_head *head;
-       struct llist_node *entry;
        call_single_data_t *csd, *csd_next;
+       struct llist_node *entry, *prev;
+       struct llist_head *head;
        static bool warned;
 
        lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
-               llist_for_each_entry(csd, entry, llist)
-                       pr_warn("IPI callback %pS sent to offline CPU\n",
-                               csd->func);
+               llist_for_each_entry(csd, entry, llist) {
+                       switch (CSD_TYPE(csd)) {
+                       case CSD_TYPE_ASYNC:
+                       case CSD_TYPE_SYNC:
+                       case CSD_TYPE_IRQ_WORK:
+                               pr_warn("IPI callback %pS sent to offline CPU\n",
+                                       csd->func);
+                               break;
+
+                       case CSD_TYPE_TTWU:
+                               pr_warn("IPI task-wakeup sent to offline CPU\n");
+                               break;
+
+                       default:
+                               pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+                                       CSD_TYPE(csd));
+                               break;
+                       }
+               }
        }
 
+       /*
+        * First; run all SYNC callbacks, people are waiting for us.
+        */
+       prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-               smp_call_func_t func = csd->func;
-               void *info = csd->info;
-
                /* Do we wait until *after* callback? */
-               if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+               if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+                       smp_call_func_t func = csd->func;
+                       void *info = csd->info;
+
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
+
                        func(info);
                        csd_unlock(csd);
                } else {
-                       csd_unlock(csd);
-                       func(info);
+                       prev = &csd->llist;
                }
        }
 
+       if (!entry)
+               return;
+
        /*
-        * Handle irq works queued remotely by irq_work_queue_on().
-        * Smp functions above are typically synchronous so they
-        * better run first since some other CPUs may be busy waiting
-        * for them.
+        * Second; run all !SYNC callbacks.
         */
-       irq_work_run();
+       prev = NULL;
+       llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+               int type = CSD_TYPE(csd);
+
+               if (type != CSD_TYPE_TTWU) {
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
+
+                       if (type == CSD_TYPE_ASYNC) {
+                               smp_call_func_t func = csd->func;
+                               void *info = csd->info;
+
+                               csd_unlock(csd);
+                               func(info);
+                       } else if (type == CSD_TYPE_IRQ_WORK) {
+                               irq_work_single(csd);
+                       }
+
+               } else {
+                       prev = &csd->llist;
+               }
+       }
+
+       /*
+        * Third; only CSD_TYPE_TTWU is left, issue those.
+        */
+       if (entry)
+               sched_ttwu_pending(entry);
+}
+
+void flush_smp_call_function_from_idle(void)
+{
+       unsigned long flags;
+
+       if (llist_empty(this_cpu_ptr(&call_single_queue)))
+               return;
+
+       local_irq_save(flags);
+       flush_smp_call_function_queue(true);
+       local_irq_restore(flags);
 }
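
The list surgery in the two filtering passes of flush_smp_call_function_queue() above is easy to misread: entries handled in the current pass are unlinked by threading prev->next around them, while entries left for a later pass hold the remaining list together. A stand-alone sketch of that unlink-while-iterating idiom, using a hypothetical node type rather than the kernel's llist:

#include <stddef.h>

struct node {
	struct node *next;
	int type;
};

/*
 * Handle and unlink every node of @type; keep the rest chained for the next
 * pass. Returns the new head of the remaining list.
 */
struct node *run_pass(struct node *head, int type,
		      void (*handle)(struct node *))
{
	struct node *n, *next, *prev = NULL;

	for (n = head; n; n = next) {
		next = n->next;			/* handle() may free @n */
		if (n->type == type) {
			if (prev)
				prev->next = next;	/* unlink @n mid-list */
			else
				head = next;		/* @n was the head */
			handle(n);
		} else {
			prev = n;			/* keep @n for a later pass */
		}
	}
	return head;
}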
 
 /*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 {
        call_single_data_t *csd;
        call_single_data_t csd_stack = {
-               .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+               .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
        };
        int this_cpu;
        int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                csd_lock(csd);
        }
 
-       err = generic_exec_single(cpu, csd, func, info);
+       csd->func = func;
+       csd->info = info;
+
+       err = generic_exec_single(cpu, csd);
 
        if (wait)
                csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
        csd->flags = CSD_FLAG_LOCK;
        smp_wmb();
 
-       err = generic_exec_single(cpu, csd, csd->func, csd->info);
+       err = generic_exec_single(cpu, csd);
 
 out:
        preempt_enable();
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 
                csd_lock(csd);
                if (wait)
-                       csd->flags |= CSD_FLAG_SYNCHRONOUS;
+                       csd->flags |= CSD_TYPE_SYNC;
                csd->func = func;
                csd->info = info;
                if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -598,6 +669,24 @@ void __init smp_init(void)
 {
        int num_nodes, num_cpus;
 
+       /*
+        * Ensure struct irq_work's layout matches struct __call_single_data
+        * so that flush_smp_call_function_queue() can do horrible things.
+        */
+       BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+                    offsetof(struct __call_single_data, llist));
+       BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+                    offsetof(struct __call_single_data, func));
+       BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+                    offsetof(struct __call_single_data, flags));
+
+       /*
+        * Assert the CSD_TYPE_TTWU layout is similar enough
+        * for task_struct to be on the @call_single_queue.
+        */
+       BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+                    offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
+
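
These compile-time checks are what let flush_smp_call_function_queue() above classify irq_work items and task-wakeup entries through the same ->flags word: the structures are kept prefix-compatible, and any future field reshuffling breaks the build instead of silently corrupting the queue. The same pattern, reduced to a stand-alone check with hypothetical struct names:

#include <stddef.h>

/* Two otherwise unrelated types that must stay prefix-compatible. */
struct queued_a { void *link;  unsigned int flags; };
struct queued_b { void *entry; unsigned int type;  };

/* A layout divergence becomes a build error, not a runtime mystery. */
_Static_assert(offsetof(struct queued_a, link) == offsetof(struct queued_b, entry),
	       "list linkage must sit at the same offset");
_Static_assert(offsetof(struct queued_a, flags) == offsetof(struct queued_b, type),
	       "type/flags word must sit at the same offset");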
        idle_threads_init();
        cpuhp_threads_init();