Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git...

[tomoyo/tomoyo-test1.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 0ae29fd..d766902 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -21,6 +21,7 @@
  #include "../smpboot.h"
  
  #include "pelt.h"
+#include "smp.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
         update_rq_clock_task(rq, delta);
  }
  
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+       csd->flags = 0;
+       csd->func = func;
+       csd->info = rq;
+}
  
  #ifdef CONFIG_SCHED_HRTICK
  /*
@@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                       HRTIMER_MODE_REL_PINNED_HARD);
  }
+
  #endif /* CONFIG_SMP */
  
  static void hrtick_rq_init(struct rq *rq)
  {
  #ifdef CONFIG_SMP
-       rq->hrtick_csd.flags = 0;
-       rq->hrtick_csd.func = __hrtick_start;
-       rq->hrtick_csd.info = rq;
+       rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
  #endif
-
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
         rq->hrtick_timer.function = hrtick;
  }
@@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
                 wake_up_idle_cpu(cpu);
  }
  
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
  {
-       int cpu = smp_processor_id();
-
-       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
-               return false;
-
-       if (idle_cpu(cpu) && !need_resched())
-               return true;
+       struct rq *rq = info;
+       int cpu = cpu_of(rq);
+       unsigned int flags;
  
         /*
-        * We can't run Idle Load Balance on this CPU for this time so we
-        * cancel it and clear NOHZ_BALANCE_KICK
+        * Release the rq::nohz_csd.
          */
-       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
-       return false;
-}
+       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+       WARN_ON(!(flags & NOHZ_KICK_MASK));
  
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
-       return false;
+       rq->idle_balance = idle_cpu(cpu);
+       if (rq->idle_balance && !need_resched()) {
+               rq->nohz_idle_balance = flags;
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+       }
  }
  
  #endif /* CONFIG_NO_HZ_COMMON */
@@ -1540,7 +1540,7 @@ static int migration_cpu_stop(void *data)
          * __migrate_task() such that we will not miss enforcing cpus_ptr
          * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
          */
-       sched_ttwu_pending();
+       flush_smp_call_function_from_idle();
  
         raw_spin_lock(&p->pi_lock);
         rq_lock(rq, &rf);
@@ -2274,16 +2274,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
  }
  
  #ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
  {
+       struct llist_node *llist = arg;
         struct rq *rq = this_rq();
-       struct llist_node *llist = llist_del_all(&rq->wake_list);
         struct task_struct *p, *t;
         struct rq_flags rf;
  
         if (!llist)
                 return;
  
+       /*
+        * rq::ttwu_pending is a racy indication of outstanding wakeups.
+        * Races such that false-negatives are possible, since they
+        * are shorter lived than false-positives would be.
+        */
+       WRITE_ONCE(rq->ttwu_pending, 0);
+
         rq_lock_irqsave(rq, &rf);
         update_rq_clock(rq);
  
@@ -2293,56 +2300,30 @@ void sched_ttwu_pending(void)
         rq_unlock_irqrestore(rq, &rf);
  }
  
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
  {
-       /*
-        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-        * TIF_NEED_RESCHED remotely (for the first time) will also send
-        * this IPI.
-        */
-       preempt_fold_need_resched();
-
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-               return;
-
-       /*
-        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-        * traditionally all their work was done from the interrupt return
-        * path. Now that we actually do some work, we need to make sure
-        * we do call them.
-        *
-        * Some archs already do call them, luckily irq_enter/exit nest
-        * properly.
-        *
-        * Arguably we should visit all archs and update all handlers,
-        * however a fair share of IPIs are still resched only so this would
-        * somewhat pessimize the simple resched case.
-        */
-       irq_enter();
-       sched_ttwu_pending();
+       struct rq *rq = cpu_rq(cpu);
  
-       /*
-        * Check if someone kicked us for doing the nohz idle load balance.
-        */
-       if (unlikely(got_nohz_idle_kick())) {
-               this_rq()->idle_balance = 1;
-               raise_softirq_irqoff(SCHED_SOFTIRQ);
-       }
-       irq_exit();
+       if (!set_nr_if_polling(rq->idle))
+               arch_send_call_function_single_ipi(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
  }
  
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
  {
         struct rq *rq = cpu_rq(cpu);
  
         p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
  
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-               if (!set_nr_if_polling(rq->idle))
-                       smp_send_reschedule(cpu);
-               else
-                       trace_sched_wake_idle_without_ipi(cpu);
-       }
+       WRITE_ONCE(rq->ttwu_pending, 1);
+       __smp_call_single_queue(cpu, &p->wake_entry);
  }
  
  void wake_up_if_idle(int cpu)
@@ -2373,6 +2354,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
  {
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
  }
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+       /*
+        * If the CPU does not share cache, then queue the task on the
+        * remote rq's wakelist to avoid accessing remote data.
+        */
+       if (!cpus_share_cache(smp_processor_id(), cpu))
+               return true;
+
+       /*
+        * If the task is descheduling and is the only running task on the
+        * CPU, then use the wakelist to offload the task activation to
+        * the soon-to-be-idle CPU as the current CPU is likely busy.
+        * nr_running is checked to avoid unnecessary task stacking.
+        */
+       if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+               return true;
+
+       return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+               __ttwu_queue_wakelist(p, cpu, wake_flags);
+               return true;
+       }
+
+       return false;
+}
  #endif /* CONFIG_SMP */
  
  static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2381,11 +2394,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
         struct rq_flags rf;
  
  #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-               ttwu_queue_remote(p, cpu, wake_flags);
+       if (ttwu_queue_wakelist(p, cpu, wake_flags))
                 return;
-       }
  #endif
  
         rq_lock(rq, &rf);
@@ -2569,7 +2579,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         if (p->on_rq && ttwu_remote(p, wake_flags))
                 goto unlock;
  
+       if (p->in_iowait) {
+               delayacct_blkio_end(p);
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
  #ifdef CONFIG_SMP
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
+
         /*
          * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
          * possible to, falsely, observe p->on_cpu == 0.
@@ -2593,6 +2611,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
  
         /*
          * If the owning (remote) CPU is still in the middle of schedule() with
+        * this task as prev, consider queueing p on the remote CPU's wake_list
+        * which potentially sends an IPI instead of spinning on p->on_cpu to
+        * let the waker make forward progress. This is safe because IRQs are
+        * disabled and the IPI will deliver after on_cpu is cleared.
+        */
+       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+               goto unlock;
+
+       /*
+        * If the owning (remote) CPU is still in the middle of schedule() with
          * this task as prev, wait until its done referencing the task.
          *
          * Pairs with the smp_store_release() in finish_task().
@@ -2602,28 +2630,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          */
         smp_cond_load_acquire(&p->on_cpu, !VAL);
  
-       p->sched_contributes_to_load = !!task_contributes_to_load(p);
-       p->state = TASK_WAKING;
-
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
-
         cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
         if (task_cpu(p) != cpu) {
                 wake_flags |= WF_MIGRATED;
                 psi_ttwu_dequeue(p);
                 set_task_cpu(p, cpu);
         }
-
-#else /* CONFIG_SMP */
-
-       if (p->in_iowait) {
-               delayacct_blkio_end(p);
-               atomic_dec(&task_rq(p)->nr_iowait);
-       }
-
  #endif /* CONFIG_SMP */
  
         ttwu_queue(p, cpu, wake_flags);
@@ -2751,6 +2763,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->capture_control = NULL;
  #endif
         init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+       p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
  }
  
  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -3951,6 +3966,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
         schedstat_inc(this_rq()->sched_count);
  }
  
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+                                 struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+       const struct sched_class *class;
+       /*
+        * We must do the balancing pass before put_prev_task(), such
+        * that when we release the rq->lock the task is in the same
+        * state as before we took rq->lock.
+        *
+        * We can terminate the balance pass as soon as we know there is
+        * a runnable task of @class priority or higher.
+        */
+       for_class_range(class, prev->sched_class, &idle_sched_class) {
+               if (class->balance(rq, prev, rf))
+                       break;
+       }
+#endif
+
+       put_prev_task(rq, prev);
+}
+
  /*
   * Pick up the highest-prio task:
   */
@@ -3984,22 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         }
  
  restart:
-#ifdef CONFIG_SMP
-       /*
-        * We must do the balancing pass before put_next_task(), such
-        * that when we release the rq->lock the task is in the same
-        * state as before we took rq->lock.
-        *
-        * We can terminate the balance pass as soon as we know there is
-        * a runnable task of @class priority or higher.
-        */
-       for_class_range(class, prev->sched_class, &idle_sched_class) {
-               if (class->balance(rq, prev, rf))
-                       break;
-       }
-#endif
-
-       put_prev_task(rq, prev);
+       put_prev_task_balance(rq, prev, rf);
  
         for_each_class(class) {
                 p = class->pick_next_task(rq);
@@ -4689,7 +4711,7 @@ int idle_cpu(int cpu)
                 return 0;
  
  #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                 return 0;
  #endif
  
@@ -6243,13 +6265,14 @@ void idle_task_exit(void)
         struct mm_struct *mm = current->active_mm;
  
         BUG_ON(cpu_online(smp_processor_id()));
+       BUG_ON(current != this_rq()->idle);
  
         if (mm != &init_mm) {
                 switch_mm(mm, &init_mm, current);
-               current->active_mm = &init_mm;
                 finish_arch_post_lock_switch();
         }
-       mmdrop(mm);
+
+       /* finish_cpu(), as run on the BP, will clean up the active_mm state */
  }
  
  /*
@@ -6539,7 +6562,6 @@ int sched_cpu_dying(unsigned int cpu)
         struct rq_flags rf;
  
         /* Handle pending wakeups and then migrate everything off */
-       sched_ttwu_pending();
         sched_tick_stop(cpu);
  
         rq_lock_irqsave(rq, &rf);
@@ -6642,6 +6664,8 @@ void __init sched_init(void)
                 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
+               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
                 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6694,7 +6718,6 @@ void __init sched_init(void)
                 init_rt_rq(&rq->rt);
                 init_dl_rq(&rq->dl);
  #ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                 /*
@@ -6716,7 +6739,6 @@ void __init sched_init(void)
                  * We achieve this by letting root_task_group's tasks sit
                  * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                  */
-               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
@@ -6744,6 +6766,8 @@ void __init sched_init(void)
  #ifdef CONFIG_NO_HZ_COMMON
                 rq->last_blocked_load_update_tick = jiffies;
                 atomic_set(&rq->nohz_flags, 0);
+
+               rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
  #endif
  #endif /* CONFIG_SMP */
                 hrtick_rq_init(rq);
@@ -7438,6 +7462,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
  
  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
  static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
  
  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
  
@@ -7466,6 +7492,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
                 return -EINVAL;
  
         /*
+        * Bound quota to defend against overflow during bandwidth shift.
+        */
+       if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+               return -EINVAL;
+
+       /*
          * Prevent race between setting of cfs_rq->runtime_enabled and
          * unthrottle_offline_cfs_rqs().
          */