
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jun 2020 20:06:42 +0000 (13:06 -0700)
Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve
     scalability and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by removing rq->wake_list and replacing it
     with ->ttwu_pending (see the sketch below)

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first.

   - Misc fixes and enhancements"
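
For orientation, the rq->wake_list removal mentioned above boils down to two cooperating pieces: the waker pushes the task onto the target CPU's generic call_single_queue and sets a cheap rq->ttwu_pending hint, and the wakee drains that queue and activates the tasks locally. The following is only a minimal user-space model of that flow; struct task, struct rq and the two functions are stand-ins that mirror the kernel names, not code from this series.

/*
 * User-space model (not kernel code) of the reworked remote-wakeup path:
 * instead of a dedicated rq->wake_list, the waker pushes the task onto the
 * target CPU's generic queue and sets a cheap rq->ttwu_pending flag that
 * idle_cpu() can test without touching the list itself.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task {                                   /* stand-in for task_struct */
	const char *comm;
	struct task *wake_next;                 /* stand-in for p->wake_entry */
};

struct rq {                                     /* stand-in for struct rq */
	_Atomic(struct task *) call_single_queue;  /* llist head */
	atomic_int ttwu_pending;                /* racy "wakeups outstanding" hint */
};

/* Waker side: queue @p on @rq and mark the rq as having pending wakeups. */
static void ttwu_queue_wakelist(struct rq *rq, struct task *p)
{
	atomic_store_explicit(&rq->ttwu_pending, 1, memory_order_relaxed);

	/* lock-free llist_add(): push p onto the queue head */
	struct task *old = atomic_load_explicit(&rq->call_single_queue,
						memory_order_relaxed);
	do {
		p->wake_next = old;
	} while (!atomic_compare_exchange_weak(&rq->call_single_queue, &old, p));
	/* the real kernel would send (or elide) the IPI here */
}

/* Wakee side: roughly what sched_ttwu_pending() does when the IPI arrives. */
static void sched_ttwu_pending(struct rq *rq)
{
	/* llist_del_all(): grab the whole list in one exchange */
	struct task *p = atomic_exchange(&rq->call_single_queue, NULL);

	/* clear the hint after taking the list, so new wakeups re-set it */
	atomic_store_explicit(&rq->ttwu_pending, 0, memory_order_relaxed);

	for (; p; p = p->wake_next)
		printf("activating %s locally on the wakee CPU\n", p->comm);
}

int main(void)
{
	struct rq rq = { .call_single_queue = NULL, .ttwu_pending = 0 };
	struct task a = { .comm = "a" }, b = { .comm = "b" };

	ttwu_queue_wakelist(&rq, &a);
	ttwu_queue_wakelist(&rq, &b);
	sched_ttwu_pending(&rq);
	return 0;
}

The hint lets idle_cpu() test a single word (rq->ttwu_pending) instead of peeking at the llist, which is what the kernel/sched/core.c and kernel/sched/fair.c hunks below switch to.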

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if it the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...

21 files changed:
arch/powerpc/platforms/powernv/smp.c
include/linux/irq_work.h
include/linux/sched.h
include/linux/sched/mm.h
include/linux/sched/topology.h
include/linux/smp.h
include/linux/swait.h
kernel/cpu.c
kernel/exit.c
kernel/irq_work.c
kernel/sched/core.c
kernel/sched/cpuacct.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/idle.c
kernel/sched/pelt.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/smp.h [new file with mode: 0644]
kernel/sched/topology.c
kernel/smp.c

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e2516..b2ba3e9 100644
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
        /* Standard hot unplug procedure */
 
        idle_task_exit();
-       current->active_mm = NULL; /* for sanity */
        cpu = smp_processor_id();
        DBG("CPU%d offline\n", cpu);
        generic_set_cpu_dead(cpu);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 3b752e8..2735da5 100644
@@ -13,6 +13,8 @@
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
  */
 
+/* flags share CSD_FLAG_ space */
+
 #define IRQ_WORK_PENDING       BIT(0)
 #define IRQ_WORK_BUSY          BIT(1)
 
 
 #define IRQ_WORK_CLAIMED       (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
 
+/*
+ * structure shares layout with call_single_data_t.
+ */
 struct irq_work {
-       atomic_t flags;
        struct llist_node llnode;
+       atomic_t flags;
        void (*func)(struct irq_work *);
 };
 
@@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work);
 
 void irq_work_run(void);
 bool irq_work_needs_cpu(void);
+void irq_work_single(void *arg);
 #else
 static inline bool irq_work_needs_cpu(void) { return false; }
 static inline void irq_work_run(void) { }
+static inline void irq_work_single(void *arg) { }
 #endif
 
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33bb7c5..12938d4 100644
@@ -654,6 +654,7 @@ struct task_struct {
 
 #ifdef CONFIG_SMP
        struct llist_node               wake_entry;
+       unsigned int                    wake_entry_type;
        int                             on_cpu;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
        /* Current CPU: */
@@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
 })
 
 #ifdef CONFIG_SMP
-void scheduler_ipi(void);
+static __always_inline void scheduler_ipi(void)
+{
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       preempt_fold_need_resched();
+}
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
 static inline void scheduler_ipi(void) { }
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a..a132d87 100644
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
                __mmdrop(mm);
 }
 
+void mmdrop(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 95253ad..fb11091 100644
  */
 #ifdef CONFIG_SMP
 
-#define SD_LOAD_BALANCE                0x0001  /* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE     0x0002  /* Balance when about to become idle */
-#define SD_BALANCE_EXEC                0x0004  /* Balance on exec */
-#define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
-#define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
-#define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY    0x0040  /* Domain members have different CPU capacities */
-#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share CPU pkg resources */
-#define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
-#define SD_ASYM_PACKING                0x0800  /* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
-#define SD_NUMA                        0x4000  /* cross-node balancing */
+#define SD_BALANCE_NEWIDLE     0x0001  /* Balance when about to become idle */
+#define SD_BALANCE_EXEC                0x0002  /* Balance on exec */
+#define SD_BALANCE_FORK                0x0004  /* Balance on fork, clone */
+#define SD_BALANCE_WAKE                0x0008  /* Balance on wakeup */
+#define SD_WAKE_AFFINE         0x0010  /* Wake task to waking CPU */
+#define SD_ASYM_CPUCAPACITY    0x0020  /* Domain members have different CPU capacities */
+#define SD_SHARE_CPUCAPACITY   0x0040  /* Domain members share CPU capacity */
+#define SD_SHARE_POWERDOMAIN   0x0080  /* Domain members share power domain */
+#define SD_SHARE_PKG_RESOURCES 0x0100  /* Domain members share CPU pkg resources */
+#define SD_SERIALIZE           0x0200  /* Only a single load balancing instance */
+#define SD_ASYM_PACKING                0x0400  /* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING      0x0800  /* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP             0x1000  /* sched_domains of this level overlap */
+#define SD_NUMA                        0x2000  /* cross-node balancing */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 0401987..7ee202a 100644
 
 typedef void (*smp_call_func_t)(void *info);
 typedef bool (*smp_cond_func_t)(int cpu, void *info);
+
+enum {
+       CSD_FLAG_LOCK           = 0x01,
+
+       /* IRQ_WORK_flags */
+
+       CSD_TYPE_ASYNC          = 0x00,
+       CSD_TYPE_SYNC           = 0x10,
+       CSD_TYPE_IRQ_WORK       = 0x20,
+       CSD_TYPE_TTWU           = 0x30,
+       CSD_FLAG_TYPE_MASK      = 0xF0,
+};
+
+/*
+ * structure shares (partial) layout with struct irq_work
+ */
 struct __call_single_data {
        struct llist_node llist;
+       unsigned int flags;
        smp_call_func_t func;
        void *info;
-       unsigned int flags;
 };
 
 /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
 typedef struct __call_single_data call_single_data_t
        __aligned(sizeof(struct __call_single_data));
 
+/*
+ * Enqueue a llist_node on the call_single_queue; be very careful, read
+ * flush_smp_call_function_queue() in detail.
+ */
+extern void __smp_call_single_queue(int cpu, struct llist_node *node);
+
 /* total number of cpus in this system (may exceed NR_CPUS) */
 extern unsigned int total_cpus;
 
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 73e06e9..6a8c22b 100644
@@ -9,23 +9,10 @@
 #include <asm/current.h>
 
 /*
- * BROKEN wait-queues.
- *
- * These "simple" wait-queues are broken garbage, and should never be
- * used. The comments below claim that they are "similar" to regular
- * wait-queues, but the semantics are actually completely different, and
- * every single user we have ever had has been buggy (or pointless).
- *
- * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
- * "wake_up()" does, and has led to problems. In other cases, it has
- * been fine, because there's only ever one waiter (kvm), but in that
- * case gthe whole "simple" wait-queue is just pointless to begin with,
- * since there is no "queue". Use "wake_up_process()" with a direct
- * pointer instead.
- *
- * While these are very similar to regular wait queues (wait.h) the most
- * important difference is that the simple waitqueue allows for deterministic
- * behaviour -- IOW it has strictly bounded IRQ and lock hold times.
+ * Simple waitqueues are semantically very different to regular wait queues
+ * (wait.h). The most important difference is that the simple waitqueue allows
+ * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
  *
  * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all
  * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
@@ -39,7 +26,7 @@
  *    sleeper state.
  *
  *  - the !exclusive mode; because that leads to O(n) wakeups, everything is
- *    exclusive.
+ *    exclusive. As such swake_up_one will only ever awake _one_ waiter.
  *
  *  - custom wake callback functions; because you cannot give any guarantees
  *    about random code. This also allows swait to be used in RT, such that
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9f89214..6ff2578 100644
@@ -3,6 +3,7 @@
  *
  * This code is licenced under the GPL.
  */
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/smp.h>
 #include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
        return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+       struct task_struct *idle = idle_thread_get(cpu);
+       struct mm_struct *mm = idle->active_mm;
+
+       /*
+        * idle_task_exit() will have switched to &init_mm, now
+        * clean up any remaining active_mm state.
+        */
+       if (mm != &init_mm)
+               idle->active_mm = &init_mm;
+       mmdrop(mm);
+       return 0;
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_BRINGUP_CPU] = {
                .name                   = "cpu:bringup",
                .startup.single         = bringup_cpu,
-               .teardown.single        = NULL,
+               .teardown.single        = finish_cpu,
                .cant_stop              = true,
        },
        /* Final state before CPU kills itself */
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b772f2..c81805a 100644
@@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
        struct task_struct *tsk = current;
        int group_dead;
 
-       profile_task_exit(tsk);
-       kcov_task_exit(tsk);
+       /*
+        * We can get here from a kernel oops, sometimes with preemption off.
+        * Start by checking for critical errors.
+        * Then fix up important state like USER_DS and preemption.
+        * Then do everything else.
+        */
 
        WARN_ON(blk_needs_flush_plug(tsk));
 
@@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
         */
        set_fs(USER_DS);
 
+       if (unlikely(in_atomic())) {
+               pr_info("note: %s[%d] exited with preempt_count %d\n",
+                       current->comm, task_pid_nr(current),
+                       preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
+
+       profile_task_exit(tsk);
+       kcov_task_exit(tsk);
+
        ptrace_event(PTRACE_EVENT_EXIT, code);
 
        validate_creds_for_do_exit(tsk);
@@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
 
        exit_signals(tsk);  /* sets PF_EXITING */
 
-       if (unlikely(in_atomic())) {
-               pr_info("note: %s[%d] exited with preempt_count %d\n",
-                       current->comm, task_pid_nr(current),
-                       preempt_count());
-               preempt_count_set(PREEMPT_ENABLED);
-       }
-
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
                sync_mm_rss(tsk->mm);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 48b5d1b..eca8396 100644
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
 {
        int oflags;
 
-       oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
+       oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
        /*
         * If the work is already pending, no need to raise the IPI.
         * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
        if (cpu != smp_processor_id()) {
                /* Arch remote IPI send/receive backend aren't NMI safe */
                WARN_ON_ONCE(in_nmi());
-               if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
-                       arch_send_call_function_single_ipi(cpu);
+               __smp_call_single_queue(cpu, &work->llnode);
        } else {
                __irq_work_queue_local(work);
        }
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
        return true;
 }
 
+void irq_work_single(void *arg)
+{
+       struct irq_work *work = arg;
+       int flags;
+
+       /*
+        * Clear the PENDING bit, after this point the @work
+        * can be re-used.
+        * Make it immediately visible so that other CPUs trying
+        * to claim that work don't rely on us to handle their data
+        * while we are in the middle of the func.
+        */
+       flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+
+       lockdep_irq_work_enter(work);
+       work->func(work);
+       lockdep_irq_work_exit(work);
+       /*
+        * Clear the BUSY bit and return to the free state if
+        * no-one else claimed it meanwhile.
+        */
+       flags &= ~IRQ_WORK_PENDING;
+       (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
 static void irq_work_run_list(struct llist_head *list)
 {
        struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
                return;
 
        llnode = llist_del_all(list);
-       llist_for_each_entry_safe(work, tmp, llnode, llnode) {
-               int flags;
-               /*
-                * Clear the PENDING bit, after this point the @work
-                * can be re-used.
-                * Make it immediately visible so that other CPUs trying
-                * to claim that work don't rely on us to handle their data
-                * while we are in the middle of the func.
-                */
-               flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
-
-               lockdep_irq_work_enter(work);
-               work->func(work);
-               lockdep_irq_work_exit(work);
-               /*
-                * Clear the BUSY bit and return to the free state if
-                * no-one else claimed it meanwhile.
-                */
-               flags &= ~IRQ_WORK_PENDING;
-               (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
-       }
+       llist_for_each_entry_safe(work, tmp, llnode, llnode)
+               irq_work_single(work);
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0ae29fd..d766902 100644
@@ -21,6 +21,7 @@
 #include "../smpboot.h"
 
 #include "pelt.h"
+#include "smp.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
        update_rq_clock_task(rq, delta);
 }
 
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+       csd->flags = 0;
+       csd->func = func;
+       csd->info = rq;
+}
 
 #ifdef CONFIG_SCHED_HRTICK
 /*
@@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
 }
+
 #endif /* CONFIG_SMP */
 
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
-       rq->hrtick_csd.flags = 0;
-       rq->hrtick_csd.func = __hrtick_start;
-       rq->hrtick_csd.info = rq;
+       rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
 #endif
-
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
 }
@@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
                wake_up_idle_cpu(cpu);
 }
 
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
 {
-       int cpu = smp_processor_id();
-
-       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
-               return false;
-
-       if (idle_cpu(cpu) && !need_resched())
-               return true;
+       struct rq *rq = info;
+       int cpu = cpu_of(rq);
+       unsigned int flags;
 
        /*
-        * We can't run Idle Load Balance on this CPU for this time so we
-        * cancel it and clear NOHZ_BALANCE_KICK
+        * Release the rq::nohz_csd.
         */
-       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
-       return false;
-}
+       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+       WARN_ON(!(flags & NOHZ_KICK_MASK));
 
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
-       return false;
+       rq->idle_balance = idle_cpu(cpu);
+       if (rq->idle_balance && !need_resched()) {
+               rq->nohz_idle_balance = flags;
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+       }
 }
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -1540,7 +1540,7 @@ static int migration_cpu_stop(void *data)
         * __migrate_task() such that we will not miss enforcing cpus_ptr
         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
         */
-       sched_ttwu_pending();
+       flush_smp_call_function_from_idle();
 
        raw_spin_lock(&p->pi_lock);
        rq_lock(rq, &rf);
@@ -2274,16 +2274,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
 {
+       struct llist_node *llist = arg;
        struct rq *rq = this_rq();
-       struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p, *t;
        struct rq_flags rf;
 
        if (!llist)
                return;
 
+       /*
+        * rq::ttwu_pending is a racy indication of outstanding wakeups.
+        * Races such that false-negatives are possible, since they
+        * are shorter lived than false-positives would be.
+        */
+       WRITE_ONCE(rq->ttwu_pending, 0);
+
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 
@@ -2293,56 +2300,30 @@ void sched_ttwu_pending(void)
        rq_unlock_irqrestore(rq, &rf);
 }
 
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
 {
-       /*
-        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-        * TIF_NEED_RESCHED remotely (for the first time) will also send
-        * this IPI.
-        */
-       preempt_fold_need_resched();
-
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-               return;
-
-       /*
-        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-        * traditionally all their work was done from the interrupt return
-        * path. Now that we actually do some work, we need to make sure
-        * we do call them.
-        *
-        * Some archs already do call them, luckily irq_enter/exit nest
-        * properly.
-        *
-        * Arguably we should visit all archs and update all handlers,
-        * however a fair share of IPIs are still resched only so this would
-        * somewhat pessimize the simple resched case.
-        */
-       irq_enter();
-       sched_ttwu_pending();
+       struct rq *rq = cpu_rq(cpu);
 
-       /*
-        * Check if someone kicked us for doing the nohz idle load balance.
-        */
-       if (unlikely(got_nohz_idle_kick())) {
-               this_rq()->idle_balance = 1;
-               raise_softirq_irqoff(SCHED_SOFTIRQ);
-       }
-       irq_exit();
+       if (!set_nr_if_polling(rq->idle))
+               arch_send_call_function_single_ipi(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU, on receipt of the IPI, will queue the task
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
        struct rq *rq = cpu_rq(cpu);
 
        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
 
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-               if (!set_nr_if_polling(rq->idle))
-                       smp_send_reschedule(cpu);
-               else
-                       trace_sched_wake_idle_without_ipi(cpu);
-       }
+       WRITE_ONCE(rq->ttwu_pending, 1);
+       __smp_call_single_queue(cpu, &p->wake_entry);
 }
 
 void wake_up_if_idle(int cpu)
@@ -2373,6 +2354,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 {
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+       /*
+        * If the CPU does not share cache, then queue the task on the
+        * remote rq's wakelist to avoid accessing remote data.
+        */
+       if (!cpus_share_cache(smp_processor_id(), cpu))
+               return true;
+
+       /*
+        * If the task is descheduling and is the only running task on the
+        * CPU, then use the wakelist to offload the task activation to
+        * the soon-to-be-idle CPU, as the current CPU is likely busy.
+        * nr_running is checked to avoid unnecessary task stacking.
+        */
+       if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+               return true;
+
+       return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+               __ttwu_queue_wakelist(p, cpu, wake_flags);
+               return true;
+       }
+
+       return false;
+}
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2381,11 +2394,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        struct rq_flags rf;
 
 #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-               ttwu_queue_remote(p, cpu, wake_flags);
+       if (ttwu_queue_wakelist(p, cpu, wake_flags))
                return;
-       }
 #endif
 
        rq_lock(rq, &rf);
@@ -2569,7 +2579,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto unlock;
 
+       if (p->in_iowait) {
+               delayacct_blkio_end(p);
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
 #ifdef CONFIG_SMP
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
+
        /*
         * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
         * possible to, falsely, observe p->on_cpu == 0.
@@ -2593,6 +2611,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
+        * this task as prev, consider queueing p on the remote CPU's wake_list
+        * which potentially sends an IPI instead of spinning on p->on_cpu to
+        * let the waker make forward progress. This is safe because IRQs are
+        * disabled and the IPI will deliver after on_cpu is cleared.
+        */
+       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+               goto unlock;
+
+       /*
+        * If the owning (remote) CPU is still in the middle of schedule() with
         * this task as prev, wait until its done referencing the task.
         *
         * Pairs with the smp_store_release() in finish_task().
@@ -2602,28 +2630,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
 
-       p->sched_contributes_to_load = !!task_contributes_to_load(p);
-       p->state = TASK_WAKING;
-
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
-
        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                psi_ttwu_dequeue(p);
                set_task_cpu(p, cpu);
        }
-
-#else /* CONFIG_SMP */
-
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
-
 #endif /* CONFIG_SMP */
 
        ttwu_queue(p, cpu, wake_flags);
@@ -2751,6 +2763,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->capture_control = NULL;
 #endif
        init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+       p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -3951,6 +3966,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
        schedstat_inc(this_rq()->sched_count);
 }
 
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+                                 struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+       const struct sched_class *class;
+       /*
+        * We must do the balancing pass before put_prev_task(), such
+        * that when we release the rq->lock the task is in the same
+        * state as before we took rq->lock.
+        *
+        * We can terminate the balance pass as soon as we know there is
+        * a runnable task of @class priority or higher.
+        */
+       for_class_range(class, prev->sched_class, &idle_sched_class) {
+               if (class->balance(rq, prev, rf))
+                       break;
+       }
+#endif
+
+       put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
@@ -3984,22 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        }
 
 restart:
-#ifdef CONFIG_SMP
-       /*
-        * We must do the balancing pass before put_next_task(), such
-        * that when we release the rq->lock the task is in the same
-        * state as before we took rq->lock.
-        *
-        * We can terminate the balance pass as soon as we know there is
-        * a runnable task of @class priority or higher.
-        */
-       for_class_range(class, prev->sched_class, &idle_sched_class) {
-               if (class->balance(rq, prev, rf))
-                       break;
-       }
-#endif
-
-       put_prev_task(rq, prev);
+       put_prev_task_balance(rq, prev, rf);
 
        for_each_class(class) {
                p = class->pick_next_task(rq);
@@ -4689,7 +4711,7 @@ int idle_cpu(int cpu)
                return 0;
 
 #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                return 0;
 #endif
 
@@ -6243,13 +6265,14 @@ void idle_task_exit(void)
        struct mm_struct *mm = current->active_mm;
 
        BUG_ON(cpu_online(smp_processor_id()));
+       BUG_ON(current != this_rq()->idle);
 
        if (mm != &init_mm) {
                switch_mm(mm, &init_mm, current);
-               current->active_mm = &init_mm;
                finish_arch_post_lock_switch();
        }
-       mmdrop(mm);
+
+       /* finish_cpu(), as run on the BP, will clean up the active_mm state */
 }
 
 /*
@@ -6539,7 +6562,6 @@ int sched_cpu_dying(unsigned int cpu)
        struct rq_flags rf;
 
        /* Handle pending wakeups and then migrate everything off */
-       sched_ttwu_pending();
        sched_tick_stop(cpu);
 
        rq_lock_irqsave(rq, &rf);
@@ -6642,6 +6664,8 @@ void __init sched_init(void)
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
+               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6694,7 +6718,6 @@ void __init sched_init(void)
                init_rt_rq(&rq->rt);
                init_dl_rq(&rq->dl);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                /*
@@ -6716,7 +6739,6 @@ void __init sched_init(void)
                 * We achieve this by letting root_task_group's tasks sit
                 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
-               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -6744,6 +6766,8 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ_COMMON
                rq->last_blocked_load_update_tick = jiffies;
                atomic_set(&rq->nohz_flags, 0);
+
+               rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
 #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
@@ -7438,6 +7462,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
 
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
@@ -7466,6 +7492,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
                return -EINVAL;
 
        /*
+        * Bound quota to defend against overflow during bandwidth shift.
+        */
+       if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+               return -EINVAL;
+
+       /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
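
About the new max_cfs_runtime bound at the end of the kernel/sched/core.c hunks (and the matching max_rt_runtime bound in kernel/sched/rt.c further down): the "203 days" and "4 hours" figures in the comments work out if MAX_BW is (1ULL << (64 - BW_SHIFT)) - 1 with BW_SHIFT == 20 — an assumption here, since that definition lives in kernel/sched/sched.h, which is in the changed-file list but not shown on this page. A quick stand-alone check:

/*
 * Arithmetic check of the "More than 203 days" (cfs) and "More than 4 hours"
 * (rt) comments; assumes MAX_BW == (1ULL << (64 - BW_SHIFT)) - 1, which is
 * not visible in the hunks above.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned BW_SHIFT = 20;
	const uint64_t MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1;  /* assumed */

	/* cfs: quota bound is MAX_BW * NSEC_PER_USEC ns, i.e. MAX_BW us */
	double cfs_days = (double)MAX_BW / 1e6 / 3600 / 24;
	/* rt: runtime bound is MAX_BW ns */
	double rt_hours = (double)MAX_BW / 1e9 / 3600;

	printf("max cfs quota   ~ %.1f days\n", cfs_days);   /* ~203.6 days */
	printf("max rt runtime  ~ %.1f hours\n", rt_hours);  /* ~4.9 hours  */
	return 0;
}

2^44 microseconds is roughly 203.7 days and 2^44 nanoseconds roughly 4.9 hours, matching both comments.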
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb103..941c28c 100644
@@ -5,6 +5,7 @@
  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  * (balbir@in.ibm.com).
  */
+#include <asm/irq_regs.h>
 #include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
        int index = CPUACCT_STAT_SYSTEM;
-       struct pt_regs *regs = task_pt_regs(tsk);
+       struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
 
        if (regs && user_mode(regs))
                index = CPUACCT_STAT_USER;
@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        rcu_read_lock();
 
        for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-               this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+               __this_cpu_add(ca->cpuusage->usages[index], cputime);
 
        rcu_read_unlock();
 }
@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 
        rcu_read_lock();
        for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
-               this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+               __this_cpu_add(ca->cpustat->cpustat[index], val);
        rcu_read_unlock();
 }
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 239970b..36c5426 100644
@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
        set_table_entry(&table[2], "busy_factor",         &sd->busy_factor,         sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[3], "imbalance_pct",       &sd->imbalance_pct,       sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[4], "cache_nice_tries",    &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0644, proc_dointvec_minmax);
+       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, proc_dointvec_minmax);
        set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
        set_table_entry(&table[7], "name",                sd->name,            CORENAME_MAX_SIZE, 0444, proc_dostring);
        /* &table[8] is terminator */
@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        else
                SEQ_printf(m, " %c", task_state_to_char(p));
 
-       SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+       SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
                p->comm, task_pid_nr(p),
                SPLIT_NS(p->se.vruntime),
                (long long)(p->nvcsw + p->nivcsw),
@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
        SEQ_printf(m, "\n");
        SEQ_printf(m, "runnable tasks:\n");
-       SEQ_printf(m, " S           task   PID         tree-key  switches  prio"
+       SEQ_printf(m, " S            task   PID         tree-key  switches  prio"
                   "     wait-time             sum-exec        sum-sleep\n");
        SEQ_printf(m, "-------------------------------------------------------"
-                  "----------------------------------------------------\n");
+                  "------------------------------------------------------\n");
 
        rcu_read_lock();
        for_each_process_thread(g, p) {
@@ -638,7 +638,6 @@ do {                                                                        \
 
        P(nr_running);
        P(nr_switches);
-       P(nr_load_updates);
        P(nr_uninterruptible);
        PN(next_balance);
        SEQ_printf(m, "  .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b5..0ed04d2 100644
@@ -191,7 +191,7 @@ static void update_sysctl(void)
 #undef SET_SYSCTL
 }
 
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
 {
        update_sysctl();
 }
@@ -1094,7 +1094,7 @@ struct numa_group {
         * more by CPU use than by memory faults.
         */
        unsigned long *faults_cpu;
-       unsigned long faults[0];
+       unsigned long faults[];
 };
 
 /*
@@ -3441,52 +3441,46 @@ static inline void
 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
        /* Nothing to update */
        if (!delta)
                return;
 
-       /*
-        * The relation between sum and avg is:
-        *
-        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
-        *
-        * however, the PELT windows are not aligned between grq and gse.
-        */
-
        /* Set new sched_entity's utilization */
        se->avg.util_avg = gcfs_rq->avg.util_avg;
-       se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+       se->avg.util_sum = se->avg.util_avg * divider;
 
        /* Update parent cfs_rq utilization */
        add_positive(&cfs_rq->avg.util_avg, delta);
-       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
 }
 
 static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
        long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
        /* Nothing to update */
        if (!delta)
                return;
 
-       /*
-        * The relation between sum and avg is:
-        *
-        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
-        *
-        * however, the PELT windows are not aligned between grq and gse.
-        */
-
        /* Set new sched_entity's runnable */
        se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
-       se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+       se->avg.runnable_sum = se->avg.runnable_avg * divider;
 
        /* Update parent cfs_rq runnable */
        add_positive(&cfs_rq->avg.runnable_avg, delta);
-       cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+       cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
 }
 
 static inline void
@@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
        unsigned long load_avg;
        u64 load_sum = 0;
        s64 delta_sum;
+       u32 divider;
 
        if (!runnable_sum)
                return;
 
        gcfs_rq->prop_runnable_sum = 0;
 
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
        if (runnable_sum >= 0) {
                /*
                 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
                 * the CPU is saturated running == runnable.
                 */
                runnable_sum += se->avg.load_sum;
-               runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+               runnable_sum = min_t(long, runnable_sum, divider);
        } else {
                /*
                 * Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
        runnable_sum = max(runnable_sum, running_sum);
 
        load_sum = (s64)se_weight(se) * runnable_sum;
-       load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+       load_avg = div_s64(load_sum, divider);
 
        delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
        delta_avg = load_avg - se->avg.load_avg;
@@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
        u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
        /*
@@ -3873,6 +3878,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
        return cfs_rq->avg.load_avg;
 }
 
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
 static inline unsigned long task_util(struct task_struct *p)
 {
        return READ_ONCE(p->se.avg.util_avg);
@@ -4054,7 +4061,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
 {
        return 0;
 }
@@ -4588,16 +4595,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 }
 
 /* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+                                  struct cfs_rq *cfs_rq, u64 target_runtime)
 {
-       struct task_group *tg = cfs_rq->tg;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount;
+       u64 min_amount, amount = 0;
+
+       lockdep_assert_held(&cfs_b->lock);
 
        /* note: this is a positive sum as runtime_remaining <= 0 */
-       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+       min_amount = target_runtime - cfs_rq->runtime_remaining;
 
-       raw_spin_lock(&cfs_b->lock);
        if (cfs_b->quota == RUNTIME_INF)
                amount = min_amount;
        else {
@@ -4609,13 +4616,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                        cfs_b->idle = 0;
                }
        }
-       raw_spin_unlock(&cfs_b->lock);
 
        cfs_rq->runtime_remaining += amount;
 
        return cfs_rq->runtime_remaining > 0;
 }
 
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       int ret;
+
+       raw_spin_lock(&cfs_b->lock);
+       ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+       raw_spin_unlock(&cfs_b->lock);
+
+       return ret;
+}
+
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        /* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4723,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
        return 0;
 }
 
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
        struct rq *rq = rq_of(cfs_rq);
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        long task_delta, idle_task_delta, dequeue = 1;
-       bool empty;
+
+       raw_spin_lock(&cfs_b->lock);
+       /* This will start the period timer if necessary */
+       if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+               /*
+                * We have raced with bandwidth becoming available, and if we
+                * actually throttled the timer might not unthrottle us for an
+                * entire period. We additionally needed to make sure that any
+                * subsequent check_cfs_rq_runtime calls agree not to throttle
+                * us, as we may commit to do cfs put_prev+pick_next, so we ask
+                * for 1ns of runtime rather than just check cfs_b.
+                */
+               dequeue = 0;
+       } else {
+               list_add_tail_rcu(&cfs_rq->throttled_list,
+                                 &cfs_b->throttled_cfs_rq);
+       }
+       raw_spin_unlock(&cfs_b->lock);
+
+       if (!dequeue)
+               return false;  /* Throttle no longer required. */
 
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -4744,29 +4783,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        if (!se)
                sub_nr_running(rq, task_delta);
 
-       cfs_rq->throttled = 1;
-       cfs_rq->throttled_clock = rq_clock(rq);
-       raw_spin_lock(&cfs_b->lock);
-       empty = list_empty(&cfs_b->throttled_cfs_rq);
-
        /*
-        * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
-        * not running add to the tail so that later runqueues don't get starved.
+        * Note: distribution will already see us throttled via the
+        * throttled-list.  rq->lock protects completion.
         */
-       if (cfs_b->distribute_running)
-               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       else
-               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
-       /*
-        * If we're the first throttled task, make sure the bandwidth
-        * timer is running.
-        */
-       if (empty)
-               start_cfs_bandwidth(cfs_b);
-
-       raw_spin_unlock(&cfs_b->lock);
+       cfs_rq->throttled = 1;
+       cfs_rq->throttled_clock = rq_clock(rq);
+       return true;
 }
 
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4933,14 +4956,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
        /*
         * This check is repeated as we release cfs_b->lock while we unthrottle.
         */
-       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
-               cfs_b->distribute_running = 1;
+       while (throttled && cfs_b->runtime > 0) {
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
                distribute_cfs_runtime(cfs_b);
                raw_spin_lock_irqsave(&cfs_b->lock, flags);
 
-               cfs_b->distribute_running = 0;
                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        }
 
@@ -5054,10 +5075,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        /* confirm we're still not at a refresh boundary */
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
        cfs_b->slack_started = false;
-       if (cfs_b->distribute_running) {
-               raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
-               return;
-       }
 
        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -5067,9 +5084,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                runtime = cfs_b->runtime;
 
-       if (runtime)
-               cfs_b->distribute_running = 1;
-
        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 
        if (!runtime)
@@ -5078,7 +5092,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        distribute_cfs_runtime(cfs_b);
 
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       cfs_b->distribute_running = 0;
        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }
 
@@ -5139,8 +5152,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        if (cfs_rq_throttled(cfs_rq))
                return true;
 
-       throttle_cfs_rq(cfs_rq);
-       return true;
+       return throttle_cfs_rq(cfs_rq);
 }
 
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -5170,6 +5182,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                if (!overrun)
                        break;
 
+               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
                if (++count > 3) {
                        u64 new, old = ktime_to_ns(cfs_b->period);
 
@@ -5199,8 +5213,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                        /* reset count so we don't come right back in here */
                        count = 0;
                }
-
-               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
        }
        if (idle)
                cfs_b->period_active = 0;
@@ -5221,7 +5233,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        cfs_b->period_timer.function = sched_cfs_period_timer;
        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        cfs_b->slack_timer.function = sched_cfs_slack_timer;
-       cfs_b->distribute_running = 0;
        cfs_b->slack_started = false;
 }
 
@@ -5506,28 +5517,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                        list_add_leaf_cfs_rq(cfs_rq);
        }
 
-enqueue_throttle:
-       if (!se) {
-               add_nr_running(rq, 1);
-               /*
-                * Since new tasks are assigned an initial util_avg equal to
-                * half of the spare capacity of their CPU, tiny tasks have the
-                * ability to cross the overutilized threshold, which will
-                * result in the load balancer ruining all the task placement
-                * done by EAS. As a way to mitigate that effect, do not account
-                * for the first enqueue operation of new tasks during the
-                * overutilized flag detection.
-                *
-                * A better way of solving this problem would be to wait for
-                * the PELT signals of tasks to converge before taking them
-                * into account, but that is not straightforward to implement,
-                * and the following generally works well enough in practice.
-                */
-               if (flags & ENQUEUE_WAKEUP)
-                       update_overutilized_status(rq);
+       /* At this point se is NULL and we are at root level */
+       add_nr_running(rq, 1);
 
-       }
+       /*
+        * Since new tasks are assigned an initial util_avg equal to
+        * half of the spare capacity of their CPU, tiny tasks have the
+        * ability to cross the overutilized threshold, which will
+        * result in the load balancer ruining all the task placement
+        * done by EAS. As a way to mitigate that effect, do not account
+        * for the first enqueue operation of new tasks during the
+        * overutilized flag detection.
+        *
+        * A better way of solving this problem would be to wait for
+        * the PELT signals of tasks to converge before taking them
+        * into account, but that is not straightforward to implement,
+        * and the following generally works well enough in practice.
+        */
+       if (flags & ENQUEUE_WAKEUP)
+               update_overutilized_status(rq);
 
+enqueue_throttle:
        if (cfs_bandwidth_used()) {
                /*
                 * When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5737,7 +5747,7 @@ static int wake_wide(struct task_struct *p)
 {
        unsigned int master = current->wakee_flips;
        unsigned int slave = p->wakee_flips;
-       int factor = this_cpu_read(sd_llc_size);
+       int factor = __this_cpu_read(sd_llc_size);
 
        if (master < slave)
                swap(master, slave);
@@ -5846,8 +5856,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 }
 
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag);
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
 
 /*
  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5930,7 +5939,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                        continue;
                }
 
-               group = find_idlest_group(sd, p, cpu, sd_flag);
+               group = find_idlest_group(sd, p, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
@@ -6671,9 +6680,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
-               if (!(tmp->flags & SD_LOAD_BALANCE))
-                       break;
-
                /*
                 * If both 'cpu' and 'prev_cpu' are part of this domain,
                 * cpu is a valid SD_WAKE_AFFINE target.
@@ -8584,7 +8590,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
         */
 
 #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                return 0;
 #endif
 
@@ -8702,8 +8708,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * Assumes p is allowed on at least one CPU in sd.
  */
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 {
        struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
        struct sg_lb_stats local_sgs, tmp_sgs;
@@ -9434,7 +9439,7 @@ static int active_load_balance_cpu_stop(void *data);
 static int should_we_balance(struct lb_env *env)
 {
        struct sched_group *sg = env->sd->groups;
-       int cpu, balance_cpu = -1;
+       int cpu;
 
        /*
         * Ensure the balancing environment is consistent; can happen
@@ -9455,18 +9460,12 @@ static int should_we_balance(struct lb_env *env)
                if (!idle_cpu(cpu))
                        continue;
 
-               balance_cpu = cpu;
-               break;
+               /* Are we the first idle CPU? */
+               return cpu == env->dst_cpu;
        }
 
-       if (balance_cpu == -1)
-               balance_cpu = group_balance_cpu(sg);
-
-       /*
-        * First idle CPU or the first CPU(busiest) in this sched group
-        * is eligible for doing load balancing at this and above domains.
-        */
-       return balance_cpu == env->dst_cpu;
+       /* Are we the first CPU of this group? */
+       return group_balance_cpu(sg) == env->dst_cpu;
 }
 
 /*
@@ -9819,9 +9818,8 @@ static int active_load_balance_cpu_stop(void *data)
        /* Search for an sd spanning us and the target CPU. */
        rcu_read_lock();
        for_each_domain(target_cpu, sd) {
-               if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
-                               break;
+               if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                       break;
        }
 
        if (likely(sd)) {
@@ -9910,9 +9908,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                }
                max_cost += sd->max_newidle_lb_cost;
 
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
                /*
                 * Stop the load balance at this level. There is another
                 * CPU in our sched group which is doing load balancing more
@@ -10029,17 +10024,20 @@ static void kick_ilb(unsigned int flags)
        if (ilb_cpu >= nr_cpu_ids)
                return;
 
+       /*
+        * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+        * the first flag owns it; cleared by nohz_csd_func().
+        */
        flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
        if (flags & NOHZ_KICK_MASK)
                return;
 
        /*
-        * Use smp_send_reschedule() instead of resched_cpu().
-        * This way we generate a sched IPI on the target CPU which
+        * This way we generate an IPI on the target CPU which
         * is idle. And the softirq performing nohz idle load balance
         * will be run before returning from the IPI.
         */
-       smp_send_reschedule(ilb_cpu);
+       smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
 }
 
 /*
@@ -10377,20 +10375,14 @@ abort:
  */
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-       int this_cpu = this_rq->cpu;
-       unsigned int flags;
+       unsigned int flags = this_rq->nohz_idle_balance;
 
-       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+       if (!flags)
                return false;
 
-       if (idle != CPU_IDLE) {
-               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-               return false;
-       }
+       this_rq->nohz_idle_balance = 0;
 
-       /* could be _relaxed() */
-       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-       if (!(flags & NOHZ_KICK_MASK))
+       if (idle != CPU_IDLE)
                return false;
 
        _nohz_idle_balance(this_rq, flags, idle);
@@ -10450,7 +10442,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
  *     0 - failed, no new tasks
  *   > 0 - success, new (fair) tasks present
  */
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
        unsigned long next_balance = jiffies + HZ;
        int this_cpu = this_rq->cpu;
@@ -10501,9 +10493,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
                int continue_balancing = 1;
                u64 t0, domain_cost;
 
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
                        update_next_balance(sd, &next_balance);
                        break;
index b743bf3..05deb81 100644 (file)
@@ -289,7 +289,11 @@ static void do_idle(void)
         */
        smp_mb__after_atomic();
 
-       sched_ttwu_pending();
+       /*
+        * RCU relies on this call to be done outside of an RCU read-side
+        * critical section.
+        */
+       flush_smp_call_function_from_idle();
        schedule_idle();
 
        if (unlikely(klp_patch_pending(current)))
index b647d04..b4b1ff9 100644 (file)
@@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
        return 1;
 }
 
+/*
+ * When syncing *_avg with *_sum, we must take into account the current
+ * position in the PELT segment; otherwise the remaining part of the segment
+ * will be accounted as idle time even though it has not yet elapsed, which
+ * generates unwanted oscillation in the range [1002..1024[.
+ *
+ * The max value of *_sum varies with the position in the time segment and
+ * equals:
+ *
+ *   LOAD_AVG_MAX*y + sa->period_contrib
+ *
+ * which can be simplified into:
+ *
+ *   LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * because LOAD_AVG_MAX*y == LOAD_AVG_MAX - 1024
+ *
+ * The same care must be taken when a sched entity is added to, updated in,
+ * or removed from a cfs_rq and we need to update its sched_avg. Scheduler
+ * entities and the cfs_rq to which they are attached share the same position
+ * in the time segment because they use the same clock. This means we can use
+ * the period_contrib of the cfs_rq when updating the sched_avg of a
+ * sched_entity, if that is more convenient.
+ */
 static __always_inline void
 ___update_load_avg(struct sched_avg *sa, unsigned long load)
 {
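
In code, the bound derived above becomes the divisor used when folding a *_sum into a *_avg. A small illustrative helper with a hypothetical name (the kernel open-codes the same expression in the paths that update *_avg):

/*
 * Maximum value a *_sum can have reached at the current position inside the
 * 1024us segment; dividing by this instead of LOAD_AVG_MAX avoids treating
 * the not-yet-elapsed remainder of the segment as idle time.
 */
static inline u32 pelt_divider(const struct sched_avg *sa)
{
	return LOAD_AVG_MAX - 1024 + sa->period_contrib;
}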
index df11d88..6d60ba2 100644 (file)
@@ -9,6 +9,8 @@
 
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
        if (rt_period == 0)
                return -EINVAL;
 
+       /*
+        * Bound the quota to defend against overflow during the bandwidth shift.
+        */
+       if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+               return -EINVAL;
+
        mutex_lock(&rt_constraints_mutex);
        err = __rt_schedulable(tg, rt_period, rt_runtime);
        if (err)
@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
                return -EINVAL;
 
        if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
-               (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+               ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+                ((u64)sysctl_sched_rt_runtime *
+                       NSEC_PER_USEC > max_rt_runtime)))
                return -EINVAL;
 
        return 0;
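
The "more than 4 hours" note next to max_rt_runtime above is straightforward to check: with BW_SHIFT == 20, MAX_BW == 2^(64-20) - 1 ns, roughly 1.76 * 10^13 ns, 17,592 s, or about 4.9 hours, the largest runtime whose << BW_SHIFT scaling still fits in 64 bits. The same bound guards tg_set_rt_bandwidth(), and sched_rt_global_validate() applies it after converting the sysctl value from microseconds to nanoseconds. A throwaway userspace check of the arithmetic (not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned int bw_shift = 20;			/* BW_SHIFT      */
	uint64_t max_bw = (1ULL << (64 - bw_shift)) - 1;	/* MAX_BW, in ns */

	printf("%llu ns = %llu s = %.2f h\n",
	       (unsigned long long)max_bw,
	       (unsigned long long)(max_bw / 1000000000ULL),
	       (double)max_bw / 3.6e12);
	return 0;
}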
index db3a576..1d4e94c 100644 (file)
@@ -349,7 +349,6 @@ struct cfs_bandwidth {
 
        u8                      idle;
        u8                      period_active;
-       u8                      distribute_running;
        u8                      slack_started;
        struct hrtimer          period_timer;
        struct hrtimer          slack_timer;
@@ -890,12 +889,15 @@ struct rq {
 #ifdef CONFIG_SMP
        unsigned long           last_blocked_load_update_tick;
        unsigned int            has_blocked_load;
+       call_single_data_t      nohz_csd;
 #endif /* CONFIG_SMP */
        unsigned int            nohz_tick_stopped;
-       atomic_t nohz_flags;
+       atomic_t                nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 
-       unsigned long           nr_load_updates;
+#ifdef CONFIG_SMP
+       unsigned int            ttwu_pending;
+#endif
        u64                     nr_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
@@ -951,6 +953,7 @@ struct rq {
 
        struct callback_head    *balance_callback;
 
+       unsigned char           nohz_idle_balance;
        unsigned char           idle_balance;
 
        unsigned long           misfit_task_load;
@@ -979,7 +982,7 @@ struct rq {
 
        /* This is used to determine avg_idle's max value */
        u64                     max_idle_balance_cost;
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64                     prev_irq_time;
@@ -1020,10 +1023,6 @@ struct rq {
        unsigned int            ttwu_local;
 #endif
 
-#ifdef CONFIG_SMP
-       struct llist_head       wake_list;
-#endif
-
 #ifdef CONFIG_CPU_IDLE
        /* Must be inspected within an RCU lock section */
        struct cpuidle_state    *idle_state;
@@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
        rq->balance_callback = head;
 }
 
-extern void sched_ttwu_pending(void);
-
 #define rcu_dereference_check_sched_domain(p) \
        rcu_dereference_check((p), \
                              lockdep_is_held(&sched_domains_mutex))
@@ -1461,7 +1458,7 @@ struct sched_group {
         * by attaching extra space to the end of the structure,
         * depending on how many CPUs the kernel has booted up with)
         */
-       unsigned long           cpumask[0];
+       unsigned long           cpumask[];
 };
 
 static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void)
 }
 #endif
 
-extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
-
-#else
-
-static inline void sched_ttwu_pending(void) { }
+extern void flush_smp_call_function_from_idle(void);
 
-static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
-
-#endif /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
 
 #include "stats.h"
 #include "autogroup.h"
@@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
  */
 #define WF_SYNC                        0x01            /* Waker goes to sleep after wakeup */
 #define WF_FORK                        0x02            /* Child wakeup after fork */
-#define WF_MIGRATED            0x4             /* Internal use, task got migrated */
+#define WF_MIGRATED            0x04            /* Internal use, task got migrated */
+#define WF_ON_RQ               0x08            /* Wakee is on_rq */
 
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
 #define BW_SHIFT               20
 #define BW_UNIT                        (1 << BW_SHIFT)
 #define RATIO_SHIFT            8
+#define MAX_BW_BITS            (64 - BW_SHIFT)
+#define MAX_BW                 ((1ULL << MAX_BW_BITS) - 1)
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
new file mode 100644 (file)
index 0000000..9620e32
--- /dev/null
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler-internal SMP callback types and methods shared between the
+ * scheduler and other internal parts of the core kernel:
+ */
+
+extern void sched_ttwu_pending(void *arg);
+
+extern void send_call_function_single_ipi(int cpu);
index 8344757..1d7b446 100644 (file)
@@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
        cpumask_clear(groupmask);
 
        printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-       if (!(sd->flags & SD_LOAD_BALANCE)) {
-               printk("does not load-balance\n");
-               if (sd->parent)
-                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-               return -1;
-       }
-
        printk(KERN_CONT "span=%*pbl level=%s\n",
               cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
@@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd)
                return 1;
 
        /* Following flags need at least 2 groups */
-       if (sd->flags & (SD_LOAD_BALANCE |
-                        SD_BALANCE_NEWIDLE |
+       if (sd->flags & (SD_BALANCE_NEWIDLE |
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
                         SD_SHARE_CPUCAPACITY |
@@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
        /* Flags needing groups don't count if only 1 group in parent */
        if (parent->groups == parent->groups->next) {
-               pflags &= ~(SD_LOAD_BALANCE |
-                               SD_BALANCE_NEWIDLE |
-                               SD_BALANCE_FORK |
-                               SD_BALANCE_EXEC |
-                               SD_ASYM_CPUCAPACITY |
-                               SD_SHARE_CPUCAPACITY |
-                               SD_SHARE_PKG_RESOURCES |
-                               SD_PREFER_SIBLING |
-                               SD_SHARE_POWERDOMAIN);
+               pflags &= ~(SD_BALANCE_NEWIDLE |
+                           SD_BALANCE_FORK |
+                           SD_BALANCE_EXEC |
+                           SD_ASYM_CPUCAPACITY |
+                           SD_SHARE_CPUCAPACITY |
+                           SD_SHARE_PKG_RESOURCES |
+                           SD_PREFER_SIBLING |
+                           SD_SHARE_POWERDOMAIN);
                if (nr_node_ids == 1)
                        pflags &= ~SD_SERIALIZE;
        }
@@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl,
 
                .cache_nice_tries       = 0,
 
-               .flags                  = 1*SD_LOAD_BALANCE
-                                       | 1*SD_BALANCE_NEWIDLE
+               .flags                  = 1*SD_BALANCE_NEWIDLE
                                        | 1*SD_BALANCE_EXEC
                                        | 1*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
index 8430319..472c2b2 100644 (file)
 #include <linux/hypervisor.h>
 
 #include "smpboot.h"
+#include "sched/smp.h"
 
-enum {
-       CSD_FLAG_LOCK           = 0x01,
-       CSD_FLAG_SYNCHRONOUS    = 0x02,
-};
+#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
 
 struct call_function_data {
        call_single_data_t      __percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
         * still pending.
         */
        flush_smp_call_function_queue(false);
+       irq_work_run();
        return 0;
 }
 
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+       /*
+        * The list addition should be visible before sending the IPI
+        * handler locks the list to pull the entry off it because of
+        * normal cache coherency rules implied by spinlocks.
+        *
+        * If IPIs can go out of order with respect to the cache coherency
+        * protocol on an architecture, sufficient synchronisation should be added
+        * to arch code to make it appear to obey cache coherency WRT
+        * locking and barrier primitives. Generic code isn't really
+        * equipped to do the right thing...
+        */
+       if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+               send_call_function_single_ipi(cpu);
+}
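
A detail the comment above takes for granted: llist_add() returns true only when the list was empty before the insertion, so at most one IPI is raised per batch of queued entries; later producers find a non-empty list and simply rely on the interrupt already on its way. A stripped-down sketch of the pattern (hypothetical helper, not the kernel function):

/* First-enqueuer-kicks: only the producer that finds the queue empty pays
 * for the IPI; every later producer piggy-backs on that same interrupt. */
static void queue_and_maybe_kick(struct llist_head *queue,
				 struct llist_node *node, int cpu)
{
	if (llist_add(node, queue))		/* true iff queue was empty */
		send_call_function_single_ipi(cpu);
}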
+
 /*
  * Insert a previously allocated call_single_data_t element
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, call_single_data_t *csd,
-                              smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
        if (cpu == smp_processor_id()) {
+               smp_call_func_t func = csd->func;
+               void *info = csd->info;
                unsigned long flags;
 
                /*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
                return 0;
        }
 
-
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }
 
-       csd->func = func;
-       csd->info = info;
-
-       /*
-        * The list addition should be visible before sending the IPI
-        * handler locks the list to pull the entry off it because of
-        * normal cache coherency rules implied by spinlocks.
-        *
-        * If IPIs can go out of order to the cache coherency protocol
-        * in an architecture, sufficient synchronisation should be added
-        * to arch code to make it appear to obey cache coherency WRT
-        * locking and barrier primitives. Generic code isn't really
-        * equipped to do the right thing...
-        */
-       if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-               arch_send_call_function_single_ipi(cpu);
+       __smp_call_single_queue(cpu, &csd->llist);
 
        return 0;
 }
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
  */
 static void flush_smp_call_function_queue(bool warn_cpu_offline)
 {
-       struct llist_head *head;
-       struct llist_node *entry;
        call_single_data_t *csd, *csd_next;
+       struct llist_node *entry, *prev;
+       struct llist_head *head;
        static bool warned;
 
        lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
-               llist_for_each_entry(csd, entry, llist)
-                       pr_warn("IPI callback %pS sent to offline CPU\n",
-                               csd->func);
+               llist_for_each_entry(csd, entry, llist) {
+                       switch (CSD_TYPE(csd)) {
+                       case CSD_TYPE_ASYNC:
+                       case CSD_TYPE_SYNC:
+                       case CSD_TYPE_IRQ_WORK:
+                               pr_warn("IPI callback %pS sent to offline CPU\n",
+                                       csd->func);
+                               break;
+
+                       case CSD_TYPE_TTWU:
+                               pr_warn("IPI task-wakeup sent to offline CPU\n");
+                               break;
+
+                       default:
+                               pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+                                       CSD_TYPE(csd));
+                               break;
+                       }
+               }
        }
 
+       /*
+        * First; run all SYNC callbacks, people are waiting for us.
+        */
+       prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-               smp_call_func_t func = csd->func;
-               void *info = csd->info;
-
                /* Do we wait until *after* callback? */
-               if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+               if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+                       smp_call_func_t func = csd->func;
+                       void *info = csd->info;
+
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
+
                        func(info);
                        csd_unlock(csd);
                } else {
-                       csd_unlock(csd);
-                       func(info);
+                       prev = &csd->llist;
                }
        }
 
+       if (!entry)
+               return;
+
        /*
-        * Handle irq works queued remotely by irq_work_queue_on().
-        * Smp functions above are typically synchronous so they
-        * better run first since some other CPUs may be busy waiting
-        * for them.
+        * Second; run all !SYNC callbacks.
         */
-       irq_work_run();
+       prev = NULL;
+       llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+               int type = CSD_TYPE(csd);
+
+               if (type != CSD_TYPE_TTWU) {
+                       if (prev) {
+                               prev->next = &csd_next->llist;
+                       } else {
+                               entry = &csd_next->llist;
+                       }
+
+                       if (type == CSD_TYPE_ASYNC) {
+                               smp_call_func_t func = csd->func;
+                               void *info = csd->info;
+
+                               csd_unlock(csd);
+                               func(info);
+                       } else if (type == CSD_TYPE_IRQ_WORK) {
+                               irq_work_single(csd);
+                       }
+
+               } else {
+                       prev = &csd->llist;
+               }
+       }
+
+       /*
+        * Third; only CSD_TYPE_TTWU is left, issue those.
+        */
+       if (entry)
+               sched_ttwu_pending(entry);
+}
+
+void flush_smp_call_function_from_idle(void)
+{
+       unsigned long flags;
+
+       if (llist_empty(this_cpu_ptr(&call_single_queue)))
+               return;
+
+       local_irq_save(flags);
+       flush_smp_call_function_queue(true);
+       local_irq_restore(flags);
 }
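
The list surgery in the two filtering passes of flush_smp_call_function_queue() above is easy to misread: entries handled in the current pass are unlinked by threading prev->next around them, while entries left for a later pass hold the remaining list together. A stand-alone sketch of that unlink-while-iterating idiom, using a hypothetical node type rather than the kernel's llist:

#include <stddef.h>

struct node {
	struct node *next;
	int type;
};

/*
 * Handle and unlink every node of @type; keep the rest chained for the next
 * pass. Returns the new head of the remaining list.
 */
struct node *run_pass(struct node *head, int type,
		      void (*handle)(struct node *))
{
	struct node *n, *next, *prev = NULL;

	for (n = head; n; n = next) {
		next = n->next;			/* handle() may free @n */
		if (n->type == type) {
			if (prev)
				prev->next = next;	/* unlink @n mid-list */
			else
				head = next;		/* @n was the head */
			handle(n);
		} else {
			prev = n;			/* keep @n for a later pass */
		}
	}
	return head;
}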
 
 /*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 {
        call_single_data_t *csd;
        call_single_data_t csd_stack = {
-               .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+               .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
        };
        int this_cpu;
        int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                csd_lock(csd);
        }
 
-       err = generic_exec_single(cpu, csd, func, info);
+       csd->func = func;
+       csd->info = info;
+
+       err = generic_exec_single(cpu, csd);
 
        if (wait)
                csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
        csd->flags = CSD_FLAG_LOCK;
        smp_wmb();
 
-       err = generic_exec_single(cpu, csd, csd->func, csd->info);
+       err = generic_exec_single(cpu, csd);
 
 out:
        preempt_enable();
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 
                csd_lock(csd);
                if (wait)
-                       csd->flags |= CSD_FLAG_SYNCHRONOUS;
+                       csd->flags |= CSD_TYPE_SYNC;
                csd->func = func;
                csd->info = info;
                if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -598,6 +669,24 @@ void __init smp_init(void)
 {
        int num_nodes, num_cpus;
 
+       /*
+        * Ensure struct irq_work's layout matches struct __call_single_data
+        * so that flush_smp_call_function_queue() can do horrible things.
+        */
+       BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+                    offsetof(struct __call_single_data, llist));
+       BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+                    offsetof(struct __call_single_data, func));
+       BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+                    offsetof(struct __call_single_data, flags));
+
+       /*
+        * Assert the CSD_TYPE_TTWU layout is similar enough
+        * for task_struct to be on the @call_single_queue.
+        */
+       BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+                    offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
+
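
These compile-time checks are what let flush_smp_call_function_queue() above classify irq_work items and task-wakeup entries through the same ->flags word: the structures are kept prefix-compatible, and any future field reshuffling breaks the build instead of silently corrupting the queue. The same pattern, reduced to a stand-alone check with hypothetical struct names:

#include <stddef.h>

/* Two otherwise unrelated types that must stay prefix-compatible. */
struct queued_a { void *link;  unsigned int flags; };
struct queued_b { void *entry; unsigned int type;  };

/* A layout divergence becomes a build error, not a runtime mystery. */
_Static_assert(offsetof(struct queued_a, link) == offsetof(struct queued_b, entry),
	       "list linkage must sit at the same offset");
_Static_assert(offsetof(struct queued_a, flags) == offsetof(struct queued_b, type),
	       "type/flags word must sit at the same offset");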
        idle_threads_init();
        cpuhp_threads_init();