kernel/sched/core.c

   1 /*
   2  *  kernel/sched/core.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  20  *              fair scheduling design by Con Kolivas.
  21  *  2007-05-05  Load balancing (smp-nice) and other improvements
  22  *              by Peter Williams
  23  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  24  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  25  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  26  *              Thomas Gleixner, Mike Kravetz
  27  */
  28
  29 #include <linux/mm.h>
  30 #include <linux/module.h>
  31 #include <linux/nmi.h>
  32 #include <linux/init.h>
  33 #include <linux/uaccess.h>
  34 #include <linux/highmem.h>
  35 #include <asm/mmu_context.h>
  36 #include <linux/interrupt.h>
  37 #include <linux/capability.h>
  38 #include <linux/completion.h>
  39 #include <linux/kernel_stat.h>
  40 #include <linux/debug_locks.h>
  41 #include <linux/perf_event.h>
  42 #include <linux/security.h>
  43 #include <linux/notifier.h>
  44 #include <linux/profile.h>
  45 #include <linux/freezer.h>
  46 #include <linux/vmalloc.h>
  47 #include <linux/blkdev.h>
  48 #include <linux/delay.h>
  49 #include <linux/pid_namespace.h>
  50 #include <linux/smp.h>
  51 #include <linux/threads.h>
  52 #include <linux/timer.h>
  53 #include <linux/rcupdate.h>
  54 #include <linux/cpu.h>
  55 #include <linux/cpuset.h>
  56 #include <linux/percpu.h>
  57 #include <linux/proc_fs.h>
  58 #include <linux/seq_file.h>
  59 #include <linux/sysctl.h>
  60 #include <linux/syscalls.h>
  61 #include <linux/times.h>
  62 #include <linux/tsacct_kern.h>
  63 #include <linux/kprobes.h>
  64 #include <linux/delayacct.h>
  65 #include <linux/unistd.h>
  66 #include <linux/pagemap.h>
  67 #include <linux/hrtimer.h>
  68 #include <linux/tick.h>
  69 #include <linux/debugfs.h>
  70 #include <linux/ctype.h>
  71 #include <linux/ftrace.h>
  72 #include <linux/slab.h>
  73 #include <linux/init_task.h>
  74 #include <linux/binfmts.h>
  75 #include <linux/context_tracking.h>
  76 #include <linux/compiler.h>
  77
  78 #include <asm/switch_to.h>
  79 #include <asm/tlb.h>
  80 #include <asm/irq_regs.h>
  81 #include <asm/mutex.h>
  82 #ifdef CONFIG_PARAVIRT
  83 #include <asm/paravirt.h>
  84 #endif
  85
  86 #include "sched.h"
  87 #include "../workqueue_internal.h"
  88 #include "../smpboot.h"
  89
  90 #define CREATE_TRACE_POINTS
  91 #include <trace/events/sched.h>
  92 #include "walt.h"
  93
/*
 * Mutex defined via DEFINE_MUTEX().
 * NOTE(review): by its name it serialises sched-domain updates; no user is
 * visible in this part of the file -- confirm.
 */
DEFINE_MUTEX(sched_domains_mutex);
/* Per-CPU storage for the runqueues: one struct rq per CPU. */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/* Forward declaration; defined further down, called by update_rq_clock(). */
static void update_rq_clock_task(struct rq *rq, s64 delta);
  98
  99 void update_rq_clock(struct rq *rq)
 100 {
 101         s64 delta;
 102
 103         lockdep_assert_held(&rq->lock);
 104
 105         if (rq->clock_skip_update & RQCF_ACT_SKIP)
 106                 return;
 107
 108         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 109         if (delta < 0)
 110                 return;
 111         rq->clock += delta;
 112         update_rq_clock_task(rq, delta);
 113 }
 114
/*
 * Debugging: various feature bits
 *
 * While features.h is included below, every SCHED_FEAT(name, enabled) entry
 * expands to "(1UL << __SCHED_FEAT_name) * enabled |". The initializer of
 * sysctl_sched_features is therefore the OR of one bit per feature, each
 * multiplied by its "enabled" value; the trailing 0 closes the chain of
 * "|" operators.
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT
 127
 128 #ifdef CONFIG_SCHED_DEBUG
/*
 * Second pass over features.h: SCHED_FEAT() now stringifies the feature
 * name, so sched_feat_names[] holds one string per entry, in the order the
 * entries appear in features.h.
 * NOTE(review): the users below index this table with the feature bit
 * number, which presumes the __SCHED_FEAT_* enum follows the same order --
 * confirm in sched.h.
 */
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT
 137
 138 static int sched_feat_show(struct seq_file *m, void *v)
 139 {
 140         int i;
 141
 142         for (i = 0; i < __SCHED_FEAT_NR; i++) {
 143                 if (!(sysctl_sched_features & (1UL << i)))
 144                         seq_puts(m, "NO_");
 145                 seq_printf(m, "%s ", sched_feat_names[i]);
 146         }
 147         seq_puts(m, "\n");
 148
 149         return 0;
 150 }
 151
 152 #ifdef HAVE_JUMP_LABEL
 153
 154 #define jump_label_key__true  STATIC_KEY_INIT_TRUE
 155 #define jump_label_key__false STATIC_KEY_INIT_FALSE
 156
 157 #define SCHED_FEAT(name, enabled)       \
 158         jump_label_key__##enabled ,
 159
 160 struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 161 #include "features.h"
 162 };
 163
 164 #undef SCHED_FEAT
 165
 166 static void sched_feat_disable(int i)
 167 {
 168         static_key_disable(&sched_feat_keys[i]);
 169 }
 170
 171 static void sched_feat_enable(int i)
 172 {
 173         static_key_enable(&sched_feat_keys[i]);
 174 }
 175 #else
 176 static void sched_feat_disable(int i) { };
 177 static void sched_feat_enable(int i) { };
 178 #endif /* HAVE_JUMP_LABEL */
 179
 180 static int sched_feat_set(char *cmp)
 181 {
 182         int i;
 183         int neg = 0;
 184
 185         if (strncmp(cmp, "NO_", 3) == 0) {
 186                 neg = 1;
 187                 cmp += 3;
 188         }
 189
 190         for (i = 0; i < __SCHED_FEAT_NR; i++) {
 191                 if (strcmp(cmp, sched_feat_names[i]) == 0) {
 192                         if (neg) {
 193                                 sysctl_sched_features &= ~(1UL << i);
 194                                 sched_feat_disable(i);
 195                         } else {
 196                                 sysctl_sched_features |= (1UL << i);
 197                                 sched_feat_enable(i);
 198                         }
 199                         break;
 200                 }
 201         }
 202
 203         return i;
 204 }
 205
 206 static ssize_t
 207 sched_feat_write(struct file *filp, const char __user *ubuf,
 208                 size_t cnt, loff_t *ppos)
 209 {
 210         char buf[64];
 211         char *cmp;
 212         int i;
 213         struct inode *inode;
 214
 215         if (cnt > 63)
 216                 cnt = 63;
 217
 218         if (copy_from_user(&buf, ubuf, cnt))
 219                 return -EFAULT;
 220
 221         buf[cnt] = 0;
 222         cmp = strstrip(buf);
 223
 224         /* Ensure the static_key remains in a consistent state */
 225         inode = file_inode(filp);
 226         mutex_lock(&inode->i_mutex);
 227         i = sched_feat_set(cmp);
 228         mutex_unlock(&inode->i_mutex);
 229         if (i == __SCHED_FEAT_NR)
 230                 return -EINVAL;
 231
 232         *ppos += cnt;
 233
 234         return cnt;
 235 }
 236
 237 static int sched_feat_open(struct inode *inode, struct file *filp)
 238 {
 239         return single_open(filp, sched_feat_show, NULL);
 240 }
 241
 242 static const struct file_operations sched_feat_fops = {
 243         .open           = sched_feat_open,
 244         .write          = sched_feat_write,
 245         .read           = seq_read,
 246         .llseek         = seq_lseek,
 247         .release        = single_release,
 248 };
 249
 250 static __init int sched_init_debug(void)
 251 {
 252         debugfs_create_file("sched_features", 0644, NULL, NULL,
 253                         &sched_feat_fops);
 254
 255         return 0;
 256 }
 257 late_initcall(sched_init_debug);
 258 #endif /* CONFIG_SCHED_DEBUG */
 259
/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * Period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * Period over which we measure -rt task CPU usage, in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

/*
 * NOTE(review): presumably set once the scheduler is up; it is not written
 * in this part of the file -- confirm against its users.
 */
__read_mostly int scheduler_running;

/*
 * Part of the period during which rt tasks are allowed to run, in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
 290
 291 struct rq *
 292 lock_rq_of(struct task_struct *p, unsigned long *flags)
 293 {
 294         return task_rq_lock(p, flags);
 295 }
 296
 297 void
 298 unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
 299 {
 300         task_rq_unlock(rq, p, flags);
 301 }
 302
 303 /*
 304  * this_rq_lock - lock this runqueue and disable interrupts.
 305  */
 306 static struct rq *this_rq_lock(void)
 307         __acquires(rq->lock)
 308 {
 309         struct rq *rq;
 310
 311         local_irq_disable();
 312         rq = this_rq();
 313         raw_spin_lock(&rq->lock);
 314
 315         return rq;
 316 }
 317
 318 #ifdef CONFIG_SCHED_HRTICK
 319 /*
 320  * Use HR-timers to deliver accurate preemption points.
 321  */
 322
 323 static void hrtick_clear(struct rq *rq)
 324 {
 325         if (hrtimer_active(&rq->hrtick_timer))
 326                 hrtimer_cancel(&rq->hrtick_timer);
 327 }
 328
 329 /*
 330  * High-resolution timer tick.
 331  * Runs from hardirq context with interrupts disabled.
 332  */
 333 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 334 {
 335         struct rq *rq = container_of(timer, struct rq, hrtick_timer);
 336
 337         WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 338
 339         raw_spin_lock(&rq->lock);
 340         update_rq_clock(rq);
 341         rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 342         raw_spin_unlock(&rq->lock);
 343
 344         return HRTIMER_NORESTART;
 345 }
 346
 347 #ifdef CONFIG_SMP
 348
 349 static void __hrtick_restart(struct rq *rq)
 350 {
 351         struct hrtimer *timer = &rq->hrtick_timer;
 352
 353         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 354 }
 355
 356 /*
 357  * called from hardirq (IPI) context
 358  */
 359 static void __hrtick_start(void *arg)
 360 {
 361         struct rq *rq = arg;
 362
 363         raw_spin_lock(&rq->lock);
 364         __hrtick_restart(rq);
 365         rq->hrtick_csd_pending = 0;
 366         raw_spin_unlock(&rq->lock);
 367 }
 368
 369 /*
 370  * Called to set the hrtick timer state.
 371  *
 372  * called with rq->lock held and irqs disabled
 373  */
 374 void hrtick_start(struct rq *rq, u64 delay)
 375 {
 376         struct hrtimer *timer = &rq->hrtick_timer;
 377         ktime_t time;
 378         s64 delta;
 379
 380         /*
 381          * Don't schedule slices shorter than 10000ns, that just
 382          * doesn't make sense and can cause timer DoS.
 383          */
 384         delta = max_t(s64, delay, 10000LL);
 385         time = ktime_add_ns(timer->base->get_time(), delta);
 386
 387         hrtimer_set_expires(timer, time);
 388
 389         if (rq == this_rq()) {
 390                 __hrtick_restart(rq);
 391         } else if (!rq->hrtick_csd_pending) {
 392                 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 393                 rq->hrtick_csd_pending = 1;
 394         }
 395 }
 396
 397 static int
 398 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 399 {
 400         int cpu = (int)(long)hcpu;
 401
 402         switch (action) {
 403         case CPU_UP_CANCELED:
 404         case CPU_UP_CANCELED_FROZEN:
 405         case CPU_DOWN_PREPARE:
 406         case CPU_DOWN_PREPARE_FROZEN:
 407         case CPU_DEAD:
 408         case CPU_DEAD_FROZEN:
 409                 hrtick_clear(cpu_rq(cpu));
 410                 return NOTIFY_OK;
 411         }
 412
 413         return NOTIFY_DONE;
 414 }
 415
 416 static __init void init_hrtick(void)
 417 {
 418         hotcpu_notifier(hotplug_hrtick, 0);
 419 }
 420 #else
 421 /*
 422  * Called to set the hrtick timer state.
 423  *
 424  * called with rq->lock held and irqs disabled
 425  */
 426 void hrtick_start(struct rq *rq, u64 delay)
 427 {
 428         /*
 429          * Don't schedule slices shorter than 10000ns, that just
 430          * doesn't make sense. Rely on vruntime for fairness.
 431          */
 432         delay = max_t(u64, delay, 10000LL);
 433         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 434                       HRTIMER_MODE_REL_PINNED);
 435 }
 436
 437 static inline void init_hrtick(void)
 438 {
 439 }
 440 #endif /* CONFIG_SMP */
 441
 442 static void init_rq_hrtick(struct rq *rq)
 443 {
 444 #ifdef CONFIG_SMP
 445         rq->hrtick_csd_pending = 0;
 446
 447         rq->hrtick_csd.flags = 0;
 448         rq->hrtick_csd.func = __hrtick_start;
 449         rq->hrtick_csd.info = rq;
 450 #endif
 451
 452         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 453         rq->hrtick_timer.function = hrtick;
 454 }
 455 #else   /* CONFIG_SCHED_HRTICK */
 456 static inline void hrtick_clear(struct rq *rq)
 457 {
 458 }
 459
 460 static inline void init_rq_hrtick(struct rq *rq)
 461 {
 462 }
 463
 464 static inline void init_hrtick(void)
 465 {
 466 }
 467 #endif  /* CONFIG_SCHED_HRTICK */
 468
 469 /*
 470  * cmpxchg based fetch_or, macro so it works for different integer types
 471  */
 472 #define fetch_or(ptr, val)                                              \
 473 ({      typeof(*(ptr)) __old, __val = *(ptr);                           \
 474         for (;;) {                                                      \
 475                 __old = cmpxchg((ptr), __val, __val | (val));           \
 476                 if (__old == __val)                                     \
 477                         break;                                          \
 478                 __val = __old;                                          \
 479         }                                                               \
 480         __old;                                                          \
 481 })
 482
 483 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 484 /*
 485  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 486  * this avoids any races wrt polling state changes and thereby avoids
 487  * spurious IPIs.
 488  */
 489 static bool set_nr_and_not_polling(struct task_struct *p)
 490 {
 491         struct thread_info *ti = task_thread_info(p);
 492         return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
 493 }
 494
 495 /*
 496  * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 497  *
 498  * If this returns true, then the idle task promises to call
 499  * sched_ttwu_pending() and reschedule soon.
 500  */
 501 static bool set_nr_if_polling(struct task_struct *p)
 502 {
 503         struct thread_info *ti = task_thread_info(p);
 504         typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 505
 506         for (;;) {
 507                 if (!(val & _TIF_POLLING_NRFLAG))
 508                         return false;
 509                 if (val & _TIF_NEED_RESCHED)
 510                         return true;
 511                 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
 512                 if (old == val)
 513                         break;
 514                 val = old;
 515         }
 516         return true;
 517 }
 518
 519 #else
 520 static bool set_nr_and_not_polling(struct task_struct *p)
 521 {
 522         set_tsk_need_resched(p);
 523         return true;
 524 }
 525
 526 #ifdef CONFIG_SMP
 527 static bool set_nr_if_polling(struct task_struct *p)
 528 {
 529         return false;
 530 }
 531 #endif
 532 #endif
 533
 534 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 535 {
 536         struct wake_q_node *node = &task->wake_q;
 537
 538         /*
 539          * Atomically grab the task, if ->wake_q is !nil already it means
 540          * its already queued (either by us or someone else) and will get the
 541          * wakeup due to that.
 542          *
 543          * This cmpxchg() implies a full barrier, which pairs with the write
 544          * barrier implied by the wakeup in wake_up_list().
 545          */
 546         if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 547                 return;
 548
 549         head->count++;
 550
 551         get_task_struct(task);
 552
 553         /*
 554          * The head is context local, there can be no concurrency.
 555          */
 556         *head->lastp = node;
 557         head->lastp = &node->next;
 558 }
 559
 560 static int
 561 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
 562                int sibling_count_hint);
 563
 564 void wake_up_q(struct wake_q_head *head)
 565 {
 566         struct wake_q_node *node = head->first;
 567
 568         while (node != WAKE_Q_TAIL) {
 569                 struct task_struct *task;
 570
 571                 task = container_of(node, struct task_struct, wake_q);
 572                 BUG_ON(!task);
 573                 /* task can safely be re-inserted now */
 574                 node = node->next;
 575                 task->wake_q.next = NULL;
 576
 577                 /*
 578                  * try_to_wake_up() implies a wmb() to pair with the queueing
 579                  * in wake_q_add() so as not to miss wakeups.
 580                  */
 581                 try_to_wake_up(task, TASK_NORMAL, 0, head->count);
 582                 put_task_struct(task);
 583         }
 584 }
 585
 586 /*
 587  * resched_curr - mark rq's current task 'to be rescheduled now'.
 588  *
 589  * On UP this means the setting of the need_resched flag, on SMP it
 590  * might also involve a cross-CPU call to trigger the scheduler on
 591  * the target CPU.
 592  */
 593 void resched_curr(struct rq *rq)
 594 {
 595         struct task_struct *curr = rq->curr;
 596         int cpu;
 597
 598         lockdep_assert_held(&rq->lock);
 599
 600         if (test_tsk_need_resched(curr))
 601                 return;
 602
 603         cpu = cpu_of(rq);
 604
 605         if (cpu == smp_processor_id()) {
 606                 set_tsk_need_resched(curr);
 607                 set_preempt_need_resched();
 608                 return;
 609         }
 610
 611         if (set_nr_and_not_polling(curr))
 612                 smp_send_reschedule(cpu);
 613         else
 614                 trace_sched_wake_idle_without_ipi(cpu);
 615 }
 616
 617 void resched_cpu(int cpu)
 618 {
 619         struct rq *rq = cpu_rq(cpu);
 620         unsigned long flags;
 621
 622         if (!raw_spin_trylock_irqsave(&rq->lock, flags))
 623                 return;
 624         resched_curr(rq);
 625         raw_spin_unlock_irqrestore(&rq->lock, flags);
 626 }
 627
 628 #ifdef CONFIG_SMP
 629 #ifdef CONFIG_NO_HZ_COMMON
 630 /*
 631  * In the semi idle case, use the nearest busy cpu for migrating timers
 632  * from an idle cpu.  This is good for power-savings.
 633  *
 634  * We don't do similar optimization for completely idle system, as
 635  * selecting an idle cpu will add more delays to the timers than intended
 636  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 637  */
 638 int get_nohz_timer_target(void)
 639 {
 640         int i, cpu = smp_processor_id();
 641         struct sched_domain *sd;
 642
 643         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
 644                 return cpu;
 645
 646         rcu_read_lock();
 647         for_each_domain(cpu, sd) {
 648                 for_each_cpu(i, sched_domain_span(sd)) {
 649                         if (cpu == i)
 650                                 continue;
 651
 652                         if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
 653                                 cpu = i;
 654                                 goto unlock;
 655                         }
 656                 }
 657         }
 658
 659         if (!is_housekeeping_cpu(cpu))
 660                 cpu = housekeeping_any_cpu();
 661 unlock:
 662         rcu_read_unlock();
 663         return cpu;
 664 }
 665 /*
 666  * When add_timer_on() enqueues a timer into the timer wheel of an
 667  * idle CPU then this timer might expire before the next timer event
 668  * which is scheduled to wake up that CPU. In case of a completely
 669  * idle system the next event might even be infinite time into the
 670  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 671  * leaves the inner idle loop so the newly added timer is taken into
 672  * account when the CPU goes back to idle and evaluates the timer
 673  * wheel for the next timer event.
 674  */
 675 static void wake_up_idle_cpu(int cpu)
 676 {
 677         struct rq *rq = cpu_rq(cpu);
 678
 679         if (cpu == smp_processor_id())
 680                 return;
 681
 682         if (set_nr_and_not_polling(rq->idle))
 683                 smp_send_reschedule(cpu);
 684         else
 685                 trace_sched_wake_idle_without_ipi(cpu);
 686 }
 687
 688 static bool wake_up_full_nohz_cpu(int cpu)
 689 {
 690         /*
 691          * We just need the target to call irq_exit() and re-evaluate
 692          * the next tick. The nohz full kick at least implies that.
 693          * If needed we can still optimize that later with an
 694          * empty IRQ.
 695          */
 696         if (tick_nohz_full_cpu(cpu)) {
 697                 if (cpu != smp_processor_id() ||
 698                     tick_nohz_tick_stopped())
 699                         tick_nohz_full_kick_cpu(cpu);
 700                 return true;
 701         }
 702
 703         return false;
 704 }
 705
 706 void wake_up_nohz_cpu(int cpu)
 707 {
 708         if (!wake_up_full_nohz_cpu(cpu))
 709                 wake_up_idle_cpu(cpu);
 710 }
 711
 712 static inline bool got_nohz_idle_kick(void)
 713 {
 714         int cpu = smp_processor_id();
 715
 716         if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
 717                 return false;
 718
 719         if (idle_cpu(cpu) && !need_resched())
 720                 return true;
 721
 722         /*
 723          * We can't run Idle Load Balance on this CPU for this time so we
 724          * cancel it and clear NOHZ_BALANCE_KICK
 725          */
 726         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
 727         return false;
 728 }
 729
 730 #else /* CONFIG_NO_HZ_COMMON */
 731
 732 static inline bool got_nohz_idle_kick(void)
 733 {
 734         return false;
 735 }
 736
 737 #endif /* CONFIG_NO_HZ_COMMON */
 738
 739 #ifdef CONFIG_NO_HZ_FULL
 740 bool sched_can_stop_tick(void)
 741 {
 742         /*
 743          * FIFO realtime policy runs the highest priority task. Other runnable
 744          * tasks are of a lower priority. The scheduler tick does nothing.
 745          */
 746         if (current->policy == SCHED_FIFO)
 747                 return true;
 748
 749         /*
 750          * Round-robin realtime tasks time slice with other tasks at the same
 751          * realtime priority. Is this task the only one at this priority?
 752          */
 753         if (current->policy == SCHED_RR) {
 754                 struct sched_rt_entity *rt_se = &current->rt;
 755
 756                 return rt_se->run_list.prev == rt_se->run_list.next;
 757         }
 758
 759         /*
 760          * More than one running task need preemption.
 761          * nr_running update is assumed to be visible
 762          * after IPI is sent from wakers.
 763          */
 764         if (this_rq()->nr_running > 1)
 765                 return false;
 766
 767         return true;
 768 }
 769 #endif /* CONFIG_NO_HZ_FULL */
 770
 771 void sched_avg_update(struct rq *rq)
 772 {
 773         s64 period = sched_avg_period();
 774
 775         while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
 776                 /*
 777                  * Inline assembly required to prevent the compiler
 778                  * optimising this loop into a divmod call.
 779                  * See __iter_div_u64_rem() for another example of this.
 780                  */
 781                 asm("" : "+rm" (rq->age_stamp));
 782                 rq->age_stamp += period;
 783                 rq->rt_avg /= 2;
 784         }
 785 }
 786
 787 #endif /* CONFIG_SMP */
 788
 789 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
 790                         (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 791 /*
 792  * Iterate task_group tree rooted at *from, calling @down when first entering a
 793  * node and @up when leaving it for the final time.
 794  *
 795  * Caller must hold rcu_lock or sufficient equivalent.
 796  */
 797 int walk_tg_tree_from(struct task_group *from,
 798                              tg_visitor down, tg_visitor up, void *data)
 799 {
 800         struct task_group *parent, *child;
 801         int ret;
 802
 803         parent = from;
 804
 805 down:
 806         ret = (*down)(parent, data);
 807         if (ret)
 808                 goto out;
 809         list_for_each_entry_rcu(child, &parent->children, siblings) {
 810                 parent = child;
 811                 goto down;
 812
 813 up:
 814                 continue;
 815         }
 816         ret = (*up)(parent, data);
 817         if (ret || parent == from)
 818                 goto out;
 819
 820         child = parent;
 821         parent = parent->parent;
 822         if (parent)
 823                 goto up;
 824 out:
 825         return ret;
 826 }
 827
 828 int tg_nop(struct task_group *tg, void *data)
 829 {
 830         return 0;
 831 }
 832 #endif
 833
 834 static void set_load_weight(struct task_struct *p)
 835 {
 836         int prio = p->static_prio - MAX_RT_PRIO;
 837         struct load_weight *load = &p->se.load;
 838
 839         /*
 840          * SCHED_IDLE tasks get minimal weight:
 841          */
 842         if (idle_policy(p->policy)) {
 843                 load->weight = scale_load(WEIGHT_IDLEPRIO);
 844                 load->inv_weight = WMULT_IDLEPRIO;
 845                 return;
 846         }
 847
 848         load->weight = scale_load(prio_to_weight[prio]);
 849         load->inv_weight = prio_to_wmult[prio];
 850 }
 851
 852 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 853 {
 854         update_rq_clock(rq);
 855         if (!(flags & ENQUEUE_RESTORE))
 856                 sched_info_queued(rq, p);
 857         p->sched_class->enqueue_task(rq, p, flags);
 858 }
 859
 860 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 861 {
 862         update_rq_clock(rq);
 863         if (!(flags & DEQUEUE_SAVE))
 864                 sched_info_dequeued(rq, p);
 865         p->sched_class->dequeue_task(rq, p, flags);
 866 }
 867
 868 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 869 {
 870         if (task_contributes_to_load(p))
 871                 rq->nr_uninterruptible--;
 872
 873         enqueue_task(rq, p, flags);
 874 }
 875
 876 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 877 {
 878         if (task_contributes_to_load(p))
 879                 rq->nr_uninterruptible++;
 880
 881         dequeue_task(rq, p, flags);
 882 }
 883
 884 static void update_rq_clock_task(struct rq *rq, s64 delta)
 885 {
 886 /*
 887  * In theory, the compile should just see 0 here, and optimize out the call
 888  * to sched_rt_avg_update. But I don't trust it...
 889  */
 890 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 891         s64 steal = 0, irq_delta = 0;
 892 #endif
 893 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 894         irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 895
 896         /*
 897          * Since irq_time is only updated on {soft,}irq_exit, we might run into
 898          * this case when a previous update_rq_clock() happened inside a
 899          * {soft,}irq region.
 900          *
 901          * When this happens, we stop ->clock_task and only update the
 902          * prev_irq_time stamp to account for the part that fit, so that a next
 903          * update will consume the rest. This ensures ->clock_task is
 904          * monotonic.
 905          *
 906          * It does however cause some slight miss-attribution of {soft,}irq
 907          * time, a more accurate solution would be to update the irq_time using
 908          * the current rq->clock timestamp, except that would require using
 909          * atomic ops.
 910          */
 911         if (irq_delta > delta)
 912                 irq_delta = delta;
 913
 914         rq->prev_irq_time += irq_delta;
 915         delta -= irq_delta;
 916 #endif
 917 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 918         if (static_key_false((&paravirt_steal_rq_enabled))) {
 919                 steal = paravirt_steal_clock(cpu_of(rq));
 920                 steal -= rq->prev_steal_time_rq;
 921
 922                 if (unlikely(steal > delta))
 923                         steal = delta;
 924
 925                 rq->prev_steal_time_rq += steal;
 926                 delta -= steal;
 927         }
 928 #endif
 929
 930         rq->clock_task += delta;
 931
 932 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 933         if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 934                 sched_rt_avg_update(rq, irq_delta + steal);
 935 #endif
 936 }
 937
 938 void sched_set_stop_task(int cpu, struct task_struct *stop)
 939 {
 940         struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 941         struct task_struct *old_stop = cpu_rq(cpu)->stop;
 942
 943         if (stop) {
 944                 /*
 945                  * Make it appear like a SCHED_FIFO task, its something
 946                  * userspace knows about and won't get confused about.
 947                  *
 948                  * Also, it will make PI more or less work without too
 949                  * much confusion -- but then, stop work should not
 950                  * rely on PI working anyway.
 951                  */
 952                 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 953
 954                 stop->sched_class = &stop_sched_class;
 955         }
 956
 957         cpu_rq(cpu)->stop = stop;
 958
 959         if (old_stop) {
 960                 /*
 961                  * Reset it back to a normal scheduling class so that
 962                  * it can die in pieces.
 963                  */
 964                 old_stop->sched_class = &rt_sched_class;
 965         }
 966 }
 967
 968 /*
 969  * __normal_prio - return the priority that is based on the static prio
 970  */
 971 static inline int __normal_prio(struct task_struct *p)
 972 {
 973         return p->static_prio;
 974 }
 975
 976 /*
 977  * Calculate the expected normal priority: i.e. priority
 978  * without taking RT-inheritance into account. Might be
 979  * boosted by interactivity modifiers. Changes upon fork,
 980  * setprio syscalls, and whenever the interactivity
 981  * estimator recalculates.
 982  */
 983 static inline int normal_prio(struct task_struct *p)
 984 {
 985         int prio;
 986
 987         if (task_has_dl_policy(p))
 988                 prio = MAX_DL_PRIO-1;
 989         else if (task_has_rt_policy(p))
 990                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 991         else
 992                 prio = __normal_prio(p);
 993         return prio;
 994 }
 995
 996 /*
 997  * Calculate the current priority, i.e. the priority
 998  * taken into account by the scheduler. This value might
 999  * be boosted by RT tasks, or might be boosted by
1000  * interactivity modifiers. Will be RT if the task got
1001  * RT-boosted. If not then it returns p->normal_prio.
1002  */
1003 static int effective_prio(struct task_struct *p)
1004 {
1005         p->normal_prio = normal_prio(p);
1006         /*
1007          * If we are RT tasks or we were boosted to RT priority,
1008          * keep the priority unchanged. Otherwise, update priority
1009          * to the normal priority:
1010          */
1011         if (!rt_prio(p->prio))
1012                 return p->normal_prio;
1013         return p->prio;
1014 }
1015
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Compares @p against the current task of the CPU that @p is assigned to.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
        const struct task_struct *running = cpu_curr(task_cpu(p));

        return running == p;
}
1026
1027 /*
1028  * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
1029  * use the balance_callback list if you want balancing.
1030  *
1031  * this means any call to check_class_changed() must be followed by a call to
1032  * balance_callback().
1033  */
1034 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1035                                        const struct sched_class *prev_class,
1036                                        int oldprio)
1037 {
1038         if (prev_class != p->sched_class) {
1039                 if (prev_class->switched_from)
1040                         prev_class->switched_from(rq, p);
1041
1042                 p->sched_class->switched_to(rq, p);
1043         } else if (oldprio != p->prio || dl_task(p))
1044                 p->sched_class->prio_changed(rq, p, oldprio);
1045 }
1046
1047 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1048 {
1049         const struct sched_class *class;
1050
1051         if (p->sched_class == rq->curr->sched_class) {
1052                 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1053         } else {
1054                 for_each_class(class) {
1055                         if (class == rq->curr->sched_class)
1056                                 break;
1057                         if (class == p->sched_class) {
1058                                 resched_curr(rq);
1059                                 break;
1060                         }
1061                 }
1062         }
1063
1064         /*
1065          * A queue event has occurred, and we're going to schedule.  In
1066          * this case, we can save a useless back to back clock update.
1067          */
1068         if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1069                 rq_clock_skip_update(rq, true);
1070 }
1071
1072 #ifdef CONFIG_SMP
1073 /*
1074  * This is how migration works:
1075  *
1076  * 1) we invoke migration_cpu_stop() on the target CPU using
1077  *    stop_one_cpu().
1078  * 2) stopper starts to run (implicitly forcing the migrated thread
1079  *    off the CPU)
1080  * 3) it checks whether the migrated task is still in the wrong runqueue.
1081  * 4) if it's in the wrong runqueue then the migration thread removes
1082  *    it and puts it into the right queue.
1083  * 5) stopper completes and stop_one_cpu() returns and the migration
1084  *    is done.
1085  */
1086
1087 /*
1088  * move_queued_task - move a queued task to new rq.
1089  *
1090  * Returns (locked) new rq. Old rq's lock is released.
1091  */
1092 static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
1093 {
1094         lockdep_assert_held(&rq->lock);
1095
1096         dequeue_task(rq, p, 0);
1097         p->on_rq = TASK_ON_RQ_MIGRATING;
1098         double_lock_balance(rq, cpu_rq(new_cpu));
1099         set_task_cpu(p, new_cpu);
1100         double_unlock_balance(rq, cpu_rq(new_cpu));
1101         raw_spin_unlock(&rq->lock);
1102
1103         rq = cpu_rq(new_cpu);
1104
1105         raw_spin_lock(&rq->lock);
1106         BUG_ON(task_cpu(p) != new_cpu);
1107         p->on_rq = TASK_ON_RQ_QUEUED;
1108         enqueue_task(rq, p, 0);
1109         check_preempt_curr(rq, p, 0);
1110
1111         return rq;
1112 }
1113
/*
 * Argument handed to migration_cpu_stop().  The field order matters:
 * callers may fill this structure with a positional initializer.
 */
struct migration_arg {
        /* Task to be pushed off its current runqueue. */
        struct task_struct *task;
        /* CPU the task should end up on. */
        int dest_cpu;
};
1118
/*
 * __migrate_task - move a (not current) task off this cpu, onto @dest_cpu.
 * @rq:       runqueue @p is queued on
 * @p:        task to move
 * @dest_cpu: CPU to move it to
 *
 * This is done either because the task can no longer run here
 * (set_cpus_allowed() away from this CPU, or the CPU is going down), or
 * because the task is being rebalanced on exec (sched_exec).
 *
 * This races with normal scheduler movements, which is fine as long as
 * the task ends up off this CPU.
 *
 * Returns the runqueue to continue with: @rq itself when nothing was
 * moved, otherwise whatever move_queued_task() returned.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
        /*
         * Nothing to do if the destination is not active, or if the
         * affinity changed (again) and no longer includes it.
         */
        if (unlikely(!cpu_active(dest_cpu)) ||
            !cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                return rq;

        return move_queued_task(rq, p, dest_cpu);
}
1141
1142 /*
1143  * migration_cpu_stop - this will be executed by a highprio stopper thread
1144  * and performs thread migration by bumping thread off CPU then
1145  * 'pushing' onto another runqueue.
1146  */
1147 static int migration_cpu_stop(void *data)
1148 {
1149         struct migration_arg *arg = data;
1150         struct task_struct *p = arg->task;
1151         struct rq *rq = this_rq();
1152
1153         /*
1154          * The original target cpu might have gone down and we might
1155          * be on another cpu but it doesn't matter.
1156          */
1157         local_irq_disable();
1158         /*
1159          * We need to explicitly wake pending tasks before running
1160          * __migrate_task() such that we will not miss enforcing cpus_allowed
1161          * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1162          */
1163         sched_ttwu_pending();
1164
1165         raw_spin_lock(&p->pi_lock);
1166         raw_spin_lock(&rq->lock);
1167         /*
1168          * If task_rq(p) != rq, it cannot be migrated here, because we're
1169          * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1170          * we're holding p->pi_lock.
1171          */
1172         if (task_rq(p) == rq && task_on_rq_queued(p))
1173                 rq = __migrate_task(rq, p, arg->dest_cpu);
1174         raw_spin_unlock(&rq->lock);
1175         raw_spin_unlock(&p->pi_lock);
1176
1177         local_irq_enable();
1178         return 0;
1179 }
1180
1181 /*
1182  * sched_class::set_cpus_allowed must do the below, but is not required to
1183  * actually call this function.
1184  */
1185 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1186 {
1187         cpumask_copy(&p->cpus_allowed, new_mask);
1188         p->nr_cpus_allowed = cpumask_weight(new_mask);
1189 }
1190
1191 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1192 {
1193         struct rq *rq = task_rq(p);
1194         bool queued, running;
1195
1196         lockdep_assert_held(&p->pi_lock);
1197
1198         queued = task_on_rq_queued(p);
1199         running = task_current(rq, p);
1200
1201         if (queued) {
1202                 /*
1203                  * Because __kthread_bind() calls this on blocked tasks without
1204                  * holding rq->lock.
1205                  */
1206                 lockdep_assert_held(&rq->lock);
1207                 dequeue_task(rq, p, DEQUEUE_SAVE);
1208         }
1209         if (running)
1210                 put_prev_task(rq, p);
1211
1212         p->sched_class->set_cpus_allowed(p, new_mask);
1213
1214         if (running)
1215                 p->sched_class->set_curr_task(rq);
1216         if (queued)
1217                 enqueue_task(rq, p, ENQUEUE_RESTORE);
1218 }
1219
1220 /*
1221  * Change a given task's CPU affinity. Migrate the thread to a
1222  * proper CPU and schedule it away if the CPU it's executing on
1223  * is removed from the allowed bitmask.
1224  *
1225  * NOTE: the caller must have a valid reference to the task, the
1226  * task must not exit() & deallocate itself prematurely. The
1227  * call is not atomic; no spinlocks may be held.
1228  */
1229 static int __set_cpus_allowed_ptr(struct task_struct *p,
1230                                   const struct cpumask *new_mask, bool check)
1231 {
1232         unsigned long flags;
1233         struct rq *rq;
1234         unsigned int dest_cpu;
1235         int ret = 0;
1236
1237         rq = task_rq_lock(p, &flags);
1238
1239         /*
1240          * Must re-check here, to close a race against __kthread_bind(),
1241          * sched_setaffinity() is not guaranteed to observe the flag.
1242          */
1243         if (check && (p->flags & PF_NO_SETAFFINITY)) {
1244                 ret = -EINVAL;
1245                 goto out;
1246         }
1247
1248         if (cpumask_equal(&p->cpus_allowed, new_mask))
1249                 goto out;
1250
1251         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
1252                 ret = -EINVAL;
1253                 goto out;
1254         }
1255
1256         do_set_cpus_allowed(p, new_mask);
1257
1258         /* Can the task run on the task's current CPU? If so, we're done */
1259         if (cpumask_test_cpu(task_cpu(p), new_mask))
1260                 goto out;
1261
1262         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
1263         if (task_running(rq, p) || p->state == TASK_WAKING) {
1264                 struct migration_arg arg = { p, dest_cpu };
1265                 /* Need help from migration thread: drop lock and wait. */
1266                 task_rq_unlock(rq, p, &flags);
1267                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1268                 tlb_migrate_finish(p->mm);
1269                 return 0;
1270         } else if (task_on_rq_queued(p)) {
1271                 /*
1272                  * OK, since we're going to drop the lock immediately
1273                  * afterwards anyway.
1274                  */
1275                 lockdep_unpin_lock(&rq->lock);
1276                 rq = move_queued_task(rq, p, dest_cpu);
1277                 lockdep_pin_lock(&rq->lock);
1278         }
1279 out:
1280         task_rq_unlock(rq, p, &flags);
1281
1282         return ret;
1283 }
1284
1285 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1286 {
1287         return __set_cpus_allowed_ptr(p, new_mask, false);
1288 }
1289 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1290
1291 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1292 {
1293 #ifdef CONFIG_SCHED_DEBUG
1294         /*
1295          * We should never call set_task_cpu() on a blocked task,
1296          * ttwu() will sort out the placement.
1297          */
1298         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1299                         !p->on_rq);
1300
1301 #ifdef CONFIG_LOCKDEP
1302         /*
1303          * The caller should hold either p->pi_lock or rq->lock, when changing
1304          * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1305          *
1306          * sched_move_task() holds both and thus holding either pins the cgroup,
1307          * see task_group().
1308          *
1309          * Furthermore, all task_rq users should acquire both locks, see
1310          * task_rq_lock().
1311          */
1312         WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1313                                       lockdep_is_held(&task_rq(p)->lock)));
1314 #endif
1315 #endif
1316
1317         trace_sched_migrate_task(p, new_cpu);
1318
1319         if (task_cpu(p) != new_cpu) {
1320                 if (p->sched_class->migrate_task_rq)
1321                         p->sched_class->migrate_task_rq(p);
1322                 p->se.nr_migrations++;
1323                 perf_event_task_migrate(p);
1324
1325                 walt_fixup_busy_time(p, new_cpu);
1326         }
1327
1328         __set_task_cpu(p, new_cpu);
1329 }
1330
1331 static void __migrate_swap_task(struct task_struct *p, int cpu)
1332 {
1333         if (task_on_rq_queued(p)) {
1334                 struct rq *src_rq, *dst_rq;
1335
1336                 src_rq = task_rq(p);
1337                 dst_rq = cpu_rq(cpu);
1338
1339                 deactivate_task(src_rq, p, 0);
1340                 p->on_rq = TASK_ON_RQ_MIGRATING;
1341                 set_task_cpu(p, cpu);
1342                 p->on_rq = TASK_ON_RQ_QUEUED;
1343                 activate_task(dst_rq, p, 0);
1344                 check_preempt_curr(dst_rq, p, 0);
1345         } else {
1346                 /*
1347                  * Task isn't running anymore; make it appear like we migrated
1348                  * it before it went to sleep. This means on wakeup we make the
1349                  * previous cpu our targer instead of where it really is.
1350                  */
1351                 p->wake_cpu = cpu;
1352         }
1353 }
1354
/* Argument handed to migrate_swap_stop(): the two tasks and their CPUs. */
struct migration_swap_arg {
        /* Task expected on src_cpu; it is moved to dst_cpu. */
        struct task_struct *src_task;
        /* Task expected on dst_cpu; it is moved to src_cpu. */
        struct task_struct *dst_task;
        int src_cpu;
        int dst_cpu;
};
1359
1360 static int migrate_swap_stop(void *data)
1361 {
1362         struct migration_swap_arg *arg = data;
1363         struct rq *src_rq, *dst_rq;
1364         int ret = -EAGAIN;
1365
1366         if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1367                 return -EAGAIN;
1368
1369         src_rq = cpu_rq(arg->src_cpu);
1370         dst_rq = cpu_rq(arg->dst_cpu);
1371
1372         double_raw_lock(&arg->src_task->pi_lock,
1373                         &arg->dst_task->pi_lock);
1374         double_rq_lock(src_rq, dst_rq);
1375
1376         if (task_cpu(arg->dst_task) != arg->dst_cpu)
1377                 goto unlock;
1378
1379         if (task_cpu(arg->src_task) != arg->src_cpu)
1380                 goto unlock;
1381
1382         if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1383                 goto unlock;
1384
1385         if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1386                 goto unlock;
1387
1388         __migrate_swap_task(arg->src_task, arg->dst_cpu);
1389         __migrate_swap_task(arg->dst_task, arg->src_cpu);
1390
1391         ret = 0;
1392
1393 unlock:
1394         double_rq_unlock(src_rq, dst_rq);
1395         raw_spin_unlock(&arg->dst_task->pi_lock);
1396         raw_spin_unlock(&arg->src_task->pi_lock);
1397
1398         return ret;
1399 }
1400
1401 /*
1402  * Cross migrate two tasks
1403  */
1404 int migrate_swap(struct task_struct *cur, struct task_struct *p)
1405 {
1406         struct migration_swap_arg arg;
1407         int ret = -EINVAL;
1408
1409         arg = (struct migration_swap_arg){
1410                 .src_task = cur,
1411                 .src_cpu = task_cpu(cur),
1412                 .dst_task = p,
1413                 .dst_cpu = task_cpu(p),
1414         };
1415
1416         if (arg.src_cpu == arg.dst_cpu)
1417                 goto out;
1418
1419         /*
1420          * These three tests are all lockless; this is OK since all of them
1421          * will be re-checked with proper locks held further down the line.
1422          */
1423         if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1424                 goto out;
1425
1426         if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1427                 goto out;
1428
1429         if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1430                 goto out;
1431
1432         trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1433         ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1434
1435 out:
1436         return ret;
1437 }
1438
1439 /*
1440  * wait_task_inactive - wait for a thread to unschedule.
1441  *
1442  * If @match_state is nonzero, it's the @p->state value just checked and
1443  * not expected to change.  If it changes, i.e. @p might have woken up,
1444  * then return zero.  When we succeed in waiting for @p to be off its CPU,
1445  * we return a positive number (its total switch count).  If a second call
1446  * a short while later returns the same number, the caller can be sure that
1447  * @p has remained unscheduled the whole time.
1448  *
1449  * The caller must ensure that the task *will* unschedule sometime soon,
1450  * else this function might spin for a *long* time. This function can't
1451  * be called with interrupts off, or it may introduce deadlock with
1452  * smp_call_function() if an IPI is sent by the same process we are
1453  * waiting to become inactive.
1454  */
1455 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1456 {
1457         unsigned long flags;
1458         int running, queued;
1459         unsigned long ncsw;
1460         struct rq *rq;
1461
1462         for (;;) {
1463                 /*
1464                  * We do the initial early heuristics without holding
1465                  * any task-queue locks at all. We'll only try to get
1466                  * the runqueue lock when things look like they will
1467                  * work out!
1468                  */
1469                 rq = task_rq(p);
1470
1471                 /*
1472                  * If the task is actively running on another CPU
1473                  * still, just relax and busy-wait without holding
1474                  * any locks.
1475                  *
1476                  * NOTE! Since we don't hold any locks, it's not
1477                  * even sure that "rq" stays as the right runqueue!
1478                  * But we don't care, since "task_running()" will
1479                  * return false if the runqueue has changed and p
1480                  * is actually now running somewhere else!
1481                  */
1482                 while (task_running(rq, p)) {
1483                         if (match_state && unlikely(p->state != match_state))
1484                                 return 0;
1485                         cpu_relax();
1486                 }
1487
1488                 /*
1489                  * Ok, time to look more closely! We need the rq
1490                  * lock now, to be *sure*. If we're wrong, we'll
1491                  * just go back and repeat.
1492                  */
1493                 rq = task_rq_lock(p, &flags);
1494                 trace_sched_wait_task(p);
1495                 running = task_running(rq, p);
1496                 queued = task_on_rq_queued(p);
1497                 ncsw = 0;
1498                 if (!match_state || p->state == match_state)
1499                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1500                 task_rq_unlock(rq, p, &flags);
1501
1502                 /*
1503                  * If it changed from the expected state, bail out now.
1504                  */
1505                 if (unlikely(!ncsw))
1506                         break;
1507
1508                 /*
1509                  * Was it really running after all now that we
1510                  * checked with the proper locks actually held?
1511                  *
1512                  * Oops. Go back and try again..
1513                  */
1514                 if (unlikely(running)) {
1515                         cpu_relax();
1516                         continue;
1517                 }
1518
1519                 /*
1520                  * It's not enough that it's not actively running,
1521                  * it must be off the runqueue _entirely_, and not
1522                  * preempted!
1523                  *
1524                  * So if it was still runnable (but just not actively
1525                  * running right now), it's preempted, and we should
1526                  * yield - it could be a while.
1527                  */
1528                 if (unlikely(queued)) {
1529                         ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1530
1531                         set_current_state(TASK_UNINTERRUPTIBLE);
1532                         schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1533                         continue;
1534                 }
1535
1536                 /*
1537                  * Ahh, all good. It wasn't running, and it wasn't
1538                  * runnable, which means that it will never become
1539                  * running in the future either. We're all done!
1540                  */
1541                 break;
1542         }
1543
1544         return ncsw;
1545 }
1546
/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
        int cpu;

        preempt_disable();
        cpu = task_cpu(p);
        /* Only a task that is running on some other CPU needs the IPI. */
        if (cpu != smp_processor_id()) {
                if (task_curr(p))
                        smp_send_reschedule(cpu);
        }
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
1571
1572 /*
1573  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1574  */
1575 static int select_fallback_rq(int cpu, struct task_struct *p)
1576 {
1577         int nid = cpu_to_node(cpu);
1578         const struct cpumask *nodemask = NULL;
1579         enum { cpuset, possible, fail } state = cpuset;
1580         int dest_cpu;
1581
1582         /*
1583          * If the node that the cpu is on has been offlined, cpu_to_node()
1584          * will return -1. There is no cpu on the node, and we should
1585          * select the cpu on the other node.
1586          */
1587         if (nid != -1) {
1588                 nodemask = cpumask_of_node(nid);
1589
1590                 /* Look for allowed, online CPU in same node. */
1591                 for_each_cpu(dest_cpu, nodemask) {
1592                         if (!cpu_online(dest_cpu))
1593                                 continue;
1594                         if (!cpu_active(dest_cpu))
1595                                 continue;
1596                         if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1597                                 return dest_cpu;
1598                 }
1599         }
1600
1601         for (;;) {
1602                 /* Any allowed, online CPU? */
1603                 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1604                         if (!cpu_online(dest_cpu))
1605                                 continue;
1606                         if (!cpu_active(dest_cpu))
1607                                 continue;
1608                         goto out;
1609                 }
1610
1611                 /* No more Mr. Nice Guy. */
1612                 switch (state) {
1613                 case cpuset:
1614                         if (IS_ENABLED(CONFIG_CPUSETS)) {
1615                                 cpuset_cpus_allowed_fallback(p);
1616                                 state = possible;
1617                                 break;
1618                         }
1619                         /* fall-through */
1620                 case possible:
1621                         do_set_cpus_allowed(p, cpu_possible_mask);
1622                         state = fail;
1623                         break;
1624
1625                 case fail:
1626                         BUG();
1627                         break;
1628                 }
1629         }
1630
1631 out:
1632         if (state != cpuset) {
1633                 /*
1634                  * Don't tell them about moving exiting tasks or
1635                  * kernel threads (both mm NULL), since they never
1636                  * leave kernel.
1637                  */
1638                 if (p->mm && printk_ratelimit()) {
1639                         printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1640                                         task_pid_nr(p), p->comm, cpu);
1641                 }
1642         }
1643
1644         return dest_cpu;
1645 }
1646
1647 /*
1648  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1649  */
1650 static inline
1651 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
1652                    int sibling_count_hint)
1653 {
1654         lockdep_assert_held(&p->pi_lock);
1655
1656         if (p->nr_cpus_allowed > 1)
1657                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
1658                                                      sibling_count_hint);
1659
1660         /*
1661          * In order not to call set_task_cpu() on a blocking task we need
1662          * to rely on ttwu() to place the task on a valid ->cpus_allowed
1663          * cpu.
1664          *
1665          * Since this is common to all placement strategies, this lives here.
1666          *
1667          * [ this allows ->select_task() to simply return task_cpu(p) and
1668          *   not worry about this generic constraint ]
1669          */
1670         if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1671                      !cpu_online(cpu)))
1672                 cpu = select_fallback_rq(task_cpu(p), p);
1673
1674         return cpu;
1675 }
1676
1677 static void update_avg(u64 *avg, u64 sample)
1678 {
1679         s64 diff = sample - *avg;
1680         *avg += diff >> 3;
1681 }
1682
1683 #else
1684
1685 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1686                                          const struct cpumask *new_mask, bool check)
1687 {
1688         return set_cpus_allowed_ptr(p, new_mask);
1689 }
1690
1691 #endif /* CONFIG_SMP */
1692
/*
 * ttwu_stat - account a wakeup of @p in the schedstats counters.
 * @p:          task that was woken
 * @cpu:        CPU the task was woken on
 * @wake_flags: WF_* flags of the wakeup (WF_MIGRATED and WF_SYNC are
 *              counted)
 *
 * Compiles to an empty function without CONFIG_SCHEDSTATS.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
        struct rq *rq = this_rq();

#ifdef CONFIG_SMP
        int this_cpu = smp_processor_id();

        if (cpu != this_cpu) {
                struct sched_domain *sd;

                schedstat_inc(p, se.statistics.nr_wakeups_remote);

                /* Charge the first domain of this CPU that spans @cpu. */
                rcu_read_lock();
                for_each_domain(this_cpu, sd) {
                        if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
                                continue;

                        schedstat_inc(sd, ttwu_wake_remote);
                        break;
                }
                rcu_read_unlock();
        } else {
                schedstat_inc(rq, ttwu_local);
                schedstat_inc(p, se.statistics.nr_wakeups_local);
        }

        if (wake_flags & WF_MIGRATED)
                schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

        schedstat_inc(rq, ttwu_count);
        schedstat_inc(p, se.statistics.nr_wakeups);

        if (wake_flags & WF_SYNC)
                schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}
1732
1733 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1734 {
1735         activate_task(rq, p, en_flags);
1736         p->on_rq = TASK_ON_RQ_QUEUED;
1737
1738         /* if a worker is waking up, notify workqueue */
1739         if (p->flags & PF_WQ_WORKER)
1740                 wq_worker_waking_up(p, cpu_of(rq));
1741 }
1742
1743 /*
1744  * Mark the task runnable and perform wakeup-preemption.
1745  */
1746 static void
1747 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1748 {
1749         check_preempt_curr(rq, p, wake_flags);
1750         p->state = TASK_RUNNING;
1751         trace_sched_wakeup(p);
1752
1753 #ifdef CONFIG_SMP
1754         if (p->sched_class->task_woken) {
1755                 /*
1756                  * Our task @p is fully woken up and running; so its safe to
1757                  * drop the rq->lock, hereafter rq is only used for statistics.
1758                  */
1759                 lockdep_unpin_lock(&rq->lock);
1760                 p->sched_class->task_woken(rq, p);
1761                 lockdep_pin_lock(&rq->lock);
1762         }
1763
1764         if (rq->idle_stamp) {
1765                 u64 delta = rq_clock(rq) - rq->idle_stamp;
1766                 u64 max = 2*rq->max_idle_balance_cost;
1767
1768                 update_avg(&rq->avg_idle, delta);
1769
1770                 if (rq->avg_idle > max)
1771                         rq->avg_idle = max;
1772
1773                 rq->idle_stamp = 0;
1774         }
1775 #endif
1776 }
1777
1778 static void
1779 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1780 {
1781         lockdep_assert_held(&rq->lock);
1782
1783 #ifdef CONFIG_SMP
1784         if (p->sched_contributes_to_load)
1785                 rq->nr_uninterruptible--;
1786 #endif
1787
1788         ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1789         ttwu_do_wakeup(rq, p, wake_flags);
1790 }
1791
/*
 * ttwu_remote - wake a task that is not fully descheduled yet.
 * @p:          task to wake
 * @wake_flags: passed through to ttwu_do_wakeup()
 *
 * Called in case the task @p isn't fully descheduled from its runqueue;
 * in that case a remote wakeup is needed.  It is a 'light' wakeup
 * though: since the task is still ->on_rq, all that has to happen is
 * flipping p->state to TASK_RUNNING.
 *
 * Returns 1 if the task was queued and has been woken, 0 otherwise.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
        struct rq *rq = __task_rq_lock(p);
        int woken = 0;

        if (task_on_rq_queued(p)) {
                /* check_preempt_curr() may use rq clock */
                update_rq_clock(rq);
                ttwu_do_wakeup(rq, p, wake_flags);
                woken = 1;
        }

        __task_rq_unlock(rq);

        return woken;
}
1814
1815 #ifdef CONFIG_SMP
1816 void sched_ttwu_pending(void)
1817 {
1818         struct rq *rq = this_rq();
1819         struct llist_node *llist = llist_del_all(&rq->wake_list);
1820         struct task_struct *p;
1821         unsigned long flags;
1822
1823         if (!llist)
1824                 return;
1825
1826         raw_spin_lock_irqsave(&rq->lock, flags);
1827         lockdep_pin_lock(&rq->lock);
1828
1829         while (llist) {
1830                 p = llist_entry(llist, struct task_struct, wake_entry);
1831                 llist = llist_next(llist);
1832                 ttwu_do_activate(rq, p, 0);
1833         }
1834
1835         lockdep_unpin_lock(&rq->lock);
1836         raw_spin_unlock_irqrestore(&rq->lock, flags);
1837 }
1838
1839 void scheduler_ipi(void)
1840 {
1841         /*
1842          * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1843          * TIF_NEED_RESCHED remotely (for the first time) will also send
1844          * this IPI.
1845          */
1846         preempt_fold_need_resched();
1847
1848         if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1849                 return;
1850
1851         /*
1852          * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1853          * traditionally all their work was done from the interrupt return
1854          * path. Now that we actually do some work, we need to make sure
1855          * we do call them.
1856          *
1857          * Some archs already do call them, luckily irq_enter/exit nest
1858          * properly.
1859          *
1860          * Arguably we should visit all archs and update all handlers,
1861          * however a fair share of IPIs are still resched only so this would
1862          * somewhat pessimize the simple resched case.
1863          */
1864         irq_enter();
1865         sched_ttwu_pending();
1866
1867         /*
1868          * Check if someone kicked us for doing the nohz idle load balance.
1869          */
1870         if (unlikely(got_nohz_idle_kick())) {
1871                 this_rq()->idle_balance = 1;
1872                 raise_softirq_irqoff(SCHED_SOFTIRQ);
1873         }
1874         irq_exit();
1875 }
1876
1877 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1878 {
1879         struct rq *rq = cpu_rq(cpu);
1880
1881         if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1882                 if (!set_nr_if_polling(rq->idle))
1883                         smp_send_reschedule(cpu);
1884                 else
1885                         trace_sched_wake_idle_without_ipi(cpu);
1886         }
1887 }
1888
1889 void wake_up_if_idle(int cpu)
1890 {
1891         struct rq *rq = cpu_rq(cpu);
1892         unsigned long flags;
1893
1894         rcu_read_lock();
1895
1896         if (!is_idle_task(rcu_dereference(rq->curr)))
1897                 goto out;
1898
1899         if (set_nr_if_polling(rq->idle)) {
1900                 trace_sched_wake_idle_without_ipi(cpu);
1901         } else {
1902                 raw_spin_lock_irqsave(&rq->lock, flags);
1903                 if (is_idle_task(rq->curr))
1904                         smp_send_reschedule(cpu);
1905                 /* Else cpu is not in idle, do nothing here */
1906                 raw_spin_unlock_irqrestore(&rq->lock, flags);
1907         }
1908
1909 out:
1910         rcu_read_unlock();
1911 }
1912
1913 bool cpus_share_cache(int this_cpu, int that_cpu)
1914 {
1915         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1916 }
1917 #endif /* CONFIG_SMP */
1918
1919 static void ttwu_queue(struct task_struct *p, int cpu)
1920 {
1921         struct rq *rq = cpu_rq(cpu);
1922
1923 #if defined(CONFIG_SMP)
1924         if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1925                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1926                 ttwu_queue_remote(p, cpu);
1927                 return;
1928         }
1929 #endif
1930
1931         raw_spin_lock(&rq->lock);
1932         lockdep_pin_lock(&rq->lock);
1933         ttwu_do_activate(rq, p, 0);
1934         lockdep_unpin_lock(&rq->lock);
1935         raw_spin_unlock(&rq->lock);
1936 }
1937
1938 /**
1939  * try_to_wake_up - wake up a thread
1940  * @p: the thread to be awakened
1941  * @state: the mask of task states that can be woken
1942  * @wake_flags: wake modifier flags (WF_*)
1943  * @sibling_count_hint: A hint at the number of threads that are being woken up
1944  *                      in this event. It is passed through to select_task_rq().
1945  *
1946  * Put it on the run-queue if it's not already there. The "current"
1947  * thread is always on the run-queue (except when the actual
1948  * re-schedule is in progress), and as such you're allowed to do
1949  * the simpler "current->state = TASK_RUNNING" to mark yourself
1950  * runnable without the overhead of this.
1951  *
1952  * Return: %true if @p was woken up, %false if it was already running
1953  * or @state didn't match @p's state.
1954  */
1955 static int
1956 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
1957                int sibling_count_hint)
1958 {
1959         unsigned long flags;
1960         int cpu, success = 0;
1961 #ifdef CONFIG_SMP
1962         struct rq *rq;
1963         u64 wallclock;
1964 #endif
1965
1966         /*
1967          * If we are going to wake up a thread waiting for CONDITION we
1968          * need to ensure that CONDITION=1 done by the caller can not be
1969          * reordered with p->state check below. This pairs with mb() in
1970          * set_current_state() the waiting thread does.
1971          */
1972         smp_mb__before_spinlock();
1973         raw_spin_lock_irqsave(&p->pi_lock, flags);
             /* Nothing to do unless @p is in one of the requested states. */
1974         if (!(p->state & state))
1975                 goto out;
1976
1977         trace_sched_waking(p);
1978
1979         success = 1; /* we're going to change ->state */
1980         cpu = task_cpu(p);
1981
1982         /*
1983          * Ensure we load p->on_rq _after_ p->state, otherwise it would
1984          * be possible to, falsely, observe p->on_rq == 0 and get stuck
1985          * in the p->on_cpu wait loop below.
1986          *
1987          * sched_ttwu_pending()                 try_to_wake_up()
1988          *   [S] p->on_rq = 1;                  [L] p->state
1989          *       UNLOCK rq->lock  -----.
1990          *                              \
1991          *                               +---   RMB
1992          * schedule()                   /
1993          *       LOCK rq->lock    -----'
1994          *       UNLOCK rq->lock
1995          *
1996          * [task p]
1997          *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
1998          *
1999          * Pairs with the UNLOCK+LOCK on rq->lock from the
2000          * last wakeup of our task and the schedule that got our task
2001          * current.
2002          */
2003         smp_rmb();
             /* Still on a runqueue: ttwu_remote() handles the wakeup in place. */
2004         if (p->on_rq && ttwu_remote(p, wake_flags))
2005                 goto stat;
2006
2007 #ifdef CONFIG_SMP
2008         /*
2009          * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
2010          * possible to, falsely, observe p->on_cpu == 0.
2011          *
2012          * One must be running (->on_cpu == 1) in order to remove oneself
2013          * from the runqueue.
2014          *
2015          *  [S] ->on_cpu = 1;   [L] ->on_rq
2016          *      UNLOCK rq->lock
2017          *                      RMB
2018          *      LOCK   rq->lock
2019          *  [S] ->on_rq = 0;    [L] ->on_cpu
2020          *
2021          * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
2022          * from the consecutive calls to schedule(); the first switching to our
2023          * task, the second putting it to sleep.
2024          */
2025         smp_rmb();
2026
2027         /*
2028          * If the owning (remote) cpu is still in the middle of schedule() with
2029          * this task as prev, wait until it's done referencing the task.
2030          */
2031         while (p->on_cpu)
2032                 cpu_relax();
2033         /*
2034          * Combined with the control dependency above, we have an effective
2035          * smp_load_acquire() without the need for full barriers.
2036          *
2037          * Pairs with the smp_store_release() in finish_lock_switch().
2038          *
2039          * This ensures that tasks getting woken will be fully ordered against
2040          * their previous state and preserve Program Order.
2041          */
2042         smp_rmb();
2043
2044         rq = cpu_rq(task_cpu(p));
2045
             /*
              * Feed the walt_* accounting hooks under rq->lock: a TASK_UPDATE
              * event for the rq's current task and a TASK_WAKE event for @p,
              * both stamped with the same wallclock value.
              */
2046         raw_spin_lock(&rq->lock);
2047         wallclock = walt_ktime_clock();
2048         walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
2049         walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
2050         raw_spin_unlock(&rq->lock);
2051
2052         p->sched_contributes_to_load = !!task_contributes_to_load(p);
2053         p->state = TASK_WAKING;
2054
2055         if (p->sched_class->task_waking)
2056                 p->sched_class->task_waking(p);
2057
             /* Pick the CPU to wake on; a change of CPU is flagged WF_MIGRATED. */
2058         cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2059                              sibling_count_hint);
2060         if (task_cpu(p) != cpu) {
2061                 wake_flags |= WF_MIGRATED;
2062                 set_task_cpu(p, cpu);
2063         }
2064
2065 #endif /* CONFIG_SMP */
2066
2067         ttwu_queue(p, cpu);
2068 stat:
2069         ttwu_stat(p, cpu, wake_flags);
2070 out:
2071         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2072
2073         return success;
2074 }
2075
2076 /**
2077  * try_to_wake_up_local - try to wake up a local task with rq lock held
2078  * @p: the thread to be awakened
2079  *
2080  * Put @p on the run-queue if it's not already there. The caller must
2081  * ensure that this_rq() is locked, @p is bound to this_rq() and not
2082  * the current task.
2083  */
2084 static void try_to_wake_up_local(struct task_struct *p)
2085 {
2086         struct rq *rq = task_rq(p);
2087
2088         if (WARN_ON_ONCE(rq != this_rq()) ||
2089             WARN_ON_ONCE(p == current))
2090                 return;
2091
2092         lockdep_assert_held(&rq->lock);
2093
2094         if (!raw_spin_trylock(&p->pi_lock)) {
2095                 /*
2096                  * This is OK, because current is on_cpu, which avoids it being
2097                  * picked for load-balance and preemption/IRQs are still
2098                  * disabled avoiding further scheduler activity on it and we've
2099                  * not yet picked a replacement task.
2100                  */
2101                 lockdep_unpin_lock(&rq->lock);
2102                 raw_spin_unlock(&rq->lock);
2103                 raw_spin_lock(&p->pi_lock);
2104                 raw_spin_lock(&rq->lock);
2105                 lockdep_pin_lock(&rq->lock);
2106         }
2107
2108         if (!(p->state & TASK_NORMAL))
2109                 goto out;
2110
2111         trace_sched_waking(p);
2112
2113         if (!task_on_rq_queued(p)) {
2114                 u64 wallclock = walt_ktime_clock();
2115
2116                 walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
2117                 walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
2118                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2119         }
2120
2121         ttwu_do_wakeup(rq, p, 0);
2122         ttwu_stat(p, smp_processor_id(), 0);
2123 out:
2124         raw_spin_unlock(&p->pi_lock);
2125 }
2126
2127 /**
2128  * wake_up_process - Wake up a specific process
2129  * @p: The process to be woken up.
2130  *
2131  * Attempt to wake up the nominated process and move it to the set of runnable
2132  * processes.
2133  *
2134  * Return: 1 if the process was woken up, 0 if it was already running.
2135  *
2136  * It may be assumed that this function implies a write memory barrier before
2137  * changing the task state if and only if any tasks are woken up.
2138  */
2139 int wake_up_process(struct task_struct *p)
2140 {
2141         return try_to_wake_up(p, TASK_NORMAL, 0, 1);
2142 }
2143 EXPORT_SYMBOL(wake_up_process);
2144
/*
 * Wake @p if its state matches the @state mask. Same as wake_up_process()
 * except that the caller chooses the mask. Returns try_to_wake_up()'s
 * result.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0, 1);

	return woken;
}
2149
2150 /*
2151  * This function clears the sched_dl_entity static params.
2152  */
2153 void __dl_clear_params(struct task_struct *p)
2154 {
2155         struct sched_dl_entity *dl_se = &p->dl;
2156
2157         dl_se->dl_runtime = 0;
2158         dl_se->dl_deadline = 0;
2159         dl_se->dl_period = 0;
2160         dl_se->flags = 0;
2161         dl_se->dl_bw = 0;
2162
2163         dl_se->dl_throttled = 0;
2164         dl_se->dl_new = 1;
2165         dl_se->dl_yielded = 0;
2166 }
2167
2168 /*
2169  * Perform scheduler related setup for a newly forked process p.
2170  * p is forked by current.
2171  *
2172  * __sched_fork() is basic setup used by init_idle() too:
2173  */
2174 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2175 {
2176         p->on_rq                        = 0;
2177
2178         p->se.on_rq                     = 0;
2179         p->se.exec_start                = 0;
2180         p->se.sum_exec_runtime          = 0;
2181         p->se.prev_sum_exec_runtime     = 0;
2182         p->se.nr_migrations             = 0;
2183         p->se.vruntime                  = 0;
2184 #ifdef CONFIG_SCHED_WALT
2185         p->last_sleep_ts                = 0;
2186 #endif
2187
2188         INIT_LIST_HEAD(&p->se.group_node);
2189         walt_init_new_task_load(p);
2190
2191 #ifdef CONFIG_FAIR_GROUP_SCHED
2192         p->se.cfs_rq                    = NULL;
2193 #endif
2194
2195 #ifdef CONFIG_SCHEDSTATS
2196         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2197 #endif
2198
2199         RB_CLEAR_NODE(&p->dl.rb_node);
2200         init_dl_task_timer(&p->dl);
2201         __dl_clear_params(p);
2202
2203         init_rt_schedtune_timer(&p->rt);
2204         INIT_LIST_HEAD(&p->rt.run_list);
2205
2206 #ifdef CONFIG_PREEMPT_NOTIFIERS
2207         INIT_HLIST_HEAD(&p->preempt_notifiers);
2208 #endif
2209
2210 #ifdef CONFIG_NUMA_BALANCING
2211         if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
2212                 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2213                 p->mm->numa_scan_seq = 0;
2214         }
2215
2216         if (clone_flags & CLONE_VM)
2217                 p->numa_preferred_nid = current->numa_preferred_nid;
2218         else
2219                 p->numa_preferred_nid = -1;
2220
2221         p->node_stamp = 0ULL;
2222         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
2223         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2224         p->numa_work.next = &p->numa_work;
2225         p->numa_faults = NULL;
2226         p->last_task_numa_placement = 0;
2227         p->last_sum_exec_runtime = 0;
2228
2229         p->numa_group = NULL;
2230 #endif /* CONFIG_NUMA_BALANCING */
2231 }
2232
/*
 * Static key, initially false. set_numabalancing_state() enables or
 * disables it; sysctl_numa_balancing() reports its current value.
 */
2233 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2234
2235 #ifdef CONFIG_NUMA_BALANCING
2236
2237 void set_numabalancing_state(bool enabled)
2238 {
2239         if (enabled)
2240                 static_branch_enable(&sched_numa_balancing);
2241         else
2242                 static_branch_disable(&sched_numa_balancing);
2243 }
2244
2245 #ifdef CONFIG_PROC_SYSCTL
2246 int sysctl_numa_balancing(struct ctl_table *table, int write,
2247                          void __user *buffer, size_t *lenp, loff_t *ppos)
2248 {
2249         struct ctl_table t;
2250         int err;
2251         int state = static_branch_likely(&sched_numa_balancing);
2252
2253         if (write && !capable(CAP_SYS_ADMIN))
2254                 return -EPERM;
2255
2256         t = *table;
2257         t.data = &state;
2258         err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2259         if (err < 0)
2260                 return err;
2261         if (write)
2262                 set_numabalancing_state(state);
2263         return err;
2264 }
2265 #endif
2266 #endif
2267
2268 /*
2269  * fork()/clone()-time setup:
2270  */
2271 int sched_fork(unsigned long clone_flags, struct task_struct *p)
2272 {
2273         unsigned long flags;
2274         int cpu = get_cpu();
2275
2276         __sched_fork(clone_flags, p);
2277         /*
2278          * We mark the process as NEW here. This guarantees that
2279          * nobody will actually run it, and a signal or other external
2280          * event cannot wake it up and insert it on the runqueue either.
2281          */
2282         p->state = TASK_NEW;
2283
2284         /*
2285          * Make sure we do not leak PI boosting priority to the child.
2286          */
2287         p->prio = current->normal_prio;
2288
2289         /*
2290          * Revert to default priority/policy on fork if requested.
2291          */
2292         if (unlikely(p->sched_reset_on_fork)) {
2293                 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2294                         p->policy = SCHED_NORMAL;
2295                         p->static_prio = NICE_TO_PRIO(0);
2296                         p->rt_priority = 0;
2297                 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2298                         p->static_prio = NICE_TO_PRIO(0);
2299
2300                 p->prio = p->normal_prio = __normal_prio(p);
2301                 set_load_weight(p);
2302
2303                 /*
2304                  * We don't need the reset flag anymore after the fork. It has
2305                  * fulfilled its duty:
2306                  */
2307                 p->sched_reset_on_fork = 0;
2308         }
2309
2310         if (dl_prio(p->prio)) {
2311                 put_cpu();
2312                 return -EAGAIN;
2313         } else if (rt_prio(p->prio)) {
2314                 p->sched_class = &rt_sched_class;
2315         } else {
2316                 p->sched_class = &fair_sched_class;
2317         }
2318
2319         init_entity_runnable_average(&p->se);
2320
2321         /*
2322          * The child is not yet in the pid-hash so no cgroup attach races,
2323          * and the cgroup is pinned to this child due to cgroup_fork()
2324          * is ran before sched_fork().
2325          *
2326          * Silence PROVE_RCU.
2327          */
2328         raw_spin_lock_irqsave(&p->pi_lock, flags);
2329         /*
2330          * We're setting the cpu for the first time, we don't migrate,
2331          * so use __set_task_cpu().
2332          */
2333         __set_task_cpu(p, cpu);
2334         if (p->sched_class->task_fork)
2335                 p->sched_class->task_fork(p);
2336         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2337
2338 #ifdef CONFIG_SCHED_INFO
2339         if (likely(sched_info_on()))
2340                 memset(&p->sched_info, 0, sizeof(p->sched_info));
2341 #endif
2342 #if defined(CONFIG_SMP)
2343         p->on_cpu = 0;
2344 #endif
2345         init_task_preempt_count(p);
2346 #ifdef CONFIG_SMP
2347         plist_node_init(&p->pushable_tasks, MAX_PRIO);
2348         RB_CLEAR_NODE(&p->pushable_dl_tasks);
2349 #endif
2350
2351         put_cpu();
2352         return 0;
2353 }
2354
2355 unsigned long to_ratio(u64 period, u64 runtime)
2356 {
2357         if (runtime == RUNTIME_INF)
2358                 return 1ULL << 20;
2359
2360         /*
2361          * Doing this here saves a lot of checks in all
2362          * the calling paths, and returning zero seems
2363          * safe for them anyway.
2364          */
2365         if (period == 0)
2366                 return 0;
2367
2368         return div64_u64(runtime << 20, period);
2369 }
2370
2371 #ifdef CONFIG_SMP
2372 inline struct dl_bw *dl_bw_of(int i)
2373 {
2374         RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2375                          "sched RCU must be held");
2376         return &cpu_rq(i)->rd->dl_bw;
2377 }
2378
2379 static inline int dl_bw_cpus(int i)
2380 {
2381         struct root_domain *rd = cpu_rq(i)->rd;
2382         int cpus = 0;
2383
2384         RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2385                          "sched RCU must be held");
2386         for_each_cpu_and(i, rd->span, cpu_active_mask)
2387                 cpus++;
2388
2389         return cpus;
2390 }
2391 #else
2392 inline struct dl_bw *dl_bw_of(int i)
2393 {
2394         return &cpu_rq(i)->dl.dl_bw;
2395 }
2396
/* !CONFIG_SMP: the answer is always one CPU, whatever @i is. */
static inline int dl_bw_cpus(int i)
{
	const int nr_cpus = 1;

	return nr_cpus;
}
2401 #endif
2402
2403 /*
2404  * We must be sure that accepting a new task (or allowing changing the
2405  * parameters of an existing one) is consistent with the bandwidth
2406  * constraints. If yes, this function also accordingly updates the currently
2407  * allocated bandwidth to reflect the new situation.
2408  *
2409  * This function is called while holding p's rq->lock.
2410  *
2411  * XXX we should delay bw change until the task's 0-lag point, see
2412  * __setparam_dl().
2413  */
2414 static int dl_overflow(struct task_struct *p, int policy,
2415                        const struct sched_attr *attr)
2416 {
2417
2418         struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2419         u64 period = attr->sched_period ?: attr->sched_deadline;
2420         u64 runtime = attr->sched_runtime;
2421         u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2422         int cpus, err = -1;
2423
2424         if (new_bw == p->dl.dl_bw)
2425                 return 0;
2426
2427         /*
2428          * Either if a task, enters, leave, or stays -deadline but changes
2429          * its parameters, we may need to update accordingly the total
2430          * allocated bandwidth of the container.
2431          */
2432         raw_spin_lock(&dl_b->lock);
2433         cpus = dl_bw_cpus(task_cpu(p));
2434         if (dl_policy(policy) && !task_has_dl_policy(p) &&
2435             !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2436                 __dl_add(dl_b, new_bw);
2437                 err = 0;
2438         } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2439                    !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2440                 __dl_clear(dl_b, p->dl.dl_bw);
2441                 __dl_add(dl_b, new_bw);
2442                 err = 0;
2443         } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2444                 __dl_clear(dl_b, p->dl.dl_bw);
2445                 err = 0;
2446         }
2447         raw_spin_unlock(&dl_b->lock);
2448
2449         return err;
2450 }
2451
2452 extern void init_dl_bw(struct dl_bw *dl_b);
2453
2454 /*
2455  * wake_up_new_task - wake up a newly created task for the first time.
2456  *
2457  * This function will do some initial scheduler statistics housekeeping
2458  * that must be done for every newly created context, then puts the task
2459  * on the runqueue and wakes it.
2460  */
2461 void wake_up_new_task(struct task_struct *p)
2462 {
2463         unsigned long flags;
2464         struct rq *rq;
2465
2466         raw_spin_lock_irqsave(&p->pi_lock, flags);
2467         p->state = TASK_RUNNING;
2468
             /*
              * NOTE(review): walt_init_new_task_load() and
              * init_entity_runnable_average() are also called on the fork path
              * (__sched_fork()/sched_fork()); presumably the repeat here is
              * deliberate -- confirm.
              */
2469         walt_init_new_task_load(p);
2470
2471         /* Initialize new task's runnable average */
2472         init_entity_runnable_average(&p->se);
2473 #ifdef CONFIG_SMP
2474         /*
2475          * Fork balancing, do it here and not earlier because:
2476          *  - cpus_allowed can change in the fork path
2477          *  - any previously selected cpu might disappear through hotplug
2478          *
2479          * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
2480          * as we're not fully set-up yet.
2481          */
2482         __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
2483 #endif
             /* p->pi_lock is already held, so only the rq lock is taken here. */
2484         rq = __task_rq_lock(p);
2485         update_rq_clock(rq);
2486         post_init_entity_util_avg(&p->se);
2487
2488         walt_mark_task_starting(p);
2489         activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
2490         p->on_rq = TASK_ON_RQ_QUEUED;
2491         trace_sched_wakeup_new(p);
2492         check_preempt_curr(rq, p, WF_FORK);
2493 #ifdef CONFIG_SMP
2494         if (p->sched_class->task_woken) {
2495                 /*
2496                  * Nothing relies on rq->lock after this, so it's fine to
2497                  * drop it.
2498                  */
2499                 lockdep_unpin_lock(&rq->lock);
2500                 p->sched_class->task_woken(rq, p);
2501                 lockdep_pin_lock(&rq->lock);
2502         }
2503 #endif
             /* Hand @flags back here: releases rq->lock and p->pi_lock together. */
2504         task_rq_unlock(rq, p, &flags);
2505 }
2506
2507 #ifdef CONFIG_PREEMPT_NOTIFIERS
2508
/*
 * Static key, initially false. preempt_notifier_inc() and
 * preempt_notifier_dec() adjust it, and the fire_sched_in/out wrappers
 * test it before walking a task's notifier list.
 */
2509 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2510
2511 void preempt_notifier_inc(void)
2512 {
2513         static_key_slow_inc(&preempt_notifier_key);
2514 }
2515 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2516
2517 void preempt_notifier_dec(void)
2518 {
2519         static_key_slow_dec(&preempt_notifier_key);
2520 }
2521 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2522
2523 /**
2524  * preempt_notifier_register - tell me when current is being preempted & rescheduled
2525  * @notifier: notifier struct to register
2526  */
2527 void preempt_notifier_register(struct preempt_notifier *notifier)
2528 {
2529         if (!static_key_false(&preempt_notifier_key))
2530                 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2531
2532         hlist_add_head(&notifier->link, &current->preempt_notifiers);
2533 }
2534 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2535
2536 /**
2537  * preempt_notifier_unregister - no longer interested in preemption notifications
2538  * @notifier: notifier struct to unregister
2539  *
2540  * This is *not* safe to call from within a preemption notifier.
2541  */
2542 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2543 {
2544         hlist_del(&notifier->link);
2545 }
2546 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2547
2548 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2549 {
2550         struct preempt_notifier *notifier;
2551
2552         hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2553                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2554 }
2555
2556 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2557 {
2558         if (static_key_false(&preempt_notifier_key))
2559                 __fire_sched_in_preempt_notifiers(curr);
2560 }
2561
2562 static void
2563 __fire_sched_out_preempt_notifiers(struct task_struct *curr,
2564                                    struct task_struct *next)
2565 {
2566         struct preempt_notifier *notifier;
2567
2568         hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2569                 notifier->ops->sched_out(notifier, next);
2570 }
2571
2572 static __always_inline void
2573 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2574                                  struct task_struct *next)
2575 {
2576         if (static_key_false(&preempt_notifier_key))
2577                 __fire_sched_out_preempt_notifiers(curr, next);
2578 }
2579
2580 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2581
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers, so nothing to fire. */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	/* Intentionally empty. */
}
2585
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers, so nothing to fire. */
static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	/* Intentionally empty. */
}
2591
2592 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2593
2594 /**
2595  * prepare_task_switch - prepare to switch tasks
2596  * @rq: the runqueue preparing to switch
2597  * @prev: the current task that is being switched out
2598  * @next: the task we are going to switch to.
2599  *
2600  * This is called with the rq lock held and interrupts off. It must
2601  * be paired with a subsequent finish_task_switch after the context
2602  * switch.
2603  *
2604  * prepare_task_switch sets up locking and calls architecture specific
2605  * hooks.
      *
      * In order, it runs: sched_info_switch(), perf_event_task_sched_out(),
      * the sched-out preempt notifiers of @prev, prepare_lock_switch() and
      * prepare_arch_switch().
2606  */
2607 static inline void
2608 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2609                     struct task_struct *next)
2610 {
2611         sched_info_switch(rq, prev, next);
2612         perf_event_task_sched_out(prev, next);
2613         fire_sched_out_preempt_notifiers(prev, next);
             /* Locking set-up first, then the architecture hook. */
2614         prepare_lock_switch(rq, next);
2615         prepare_arch_switch(next);
2616 }
2617
2618 /**
2619  * finish_task_switch - clean up after a task-switch
2620  * @prev: the thread we just switched away from.
2621  *
2622  * finish_task_switch must be called after the context switch, paired
2623  * with a prepare_task_switch call before the context switch.
2624  * finish_task_switch will reconcile locking set up by prepare_task_switch,
2625  * and do any other architecture-specific cleanup actions.
2626  *
2627  * Note that we may have delayed dropping an mm in context_switch(). If
2628  * so, we finish that here outside of the runqueue lock. (Doing it
2629  * with the lock held can cause deadlocks; see schedule() for
2630  * details.)
2631  *
2632  * The context switch has flipped the stack from under us and restored the
2633  * local variables which were saved when this task called schedule() in the
2634  * past. prev == current is still correct but we need to recalculate this_rq
2635  * because prev may have moved to another CPU.
      *
      * Return: the runqueue obtained from this_rq() on entry.
2636  */
2637 static struct rq *finish_task_switch(struct task_struct *prev)
2638         __releases(rq->lock)
2639 {
2640         struct rq *rq = this_rq();
2641         struct mm_struct *mm = rq->prev_mm;
2642         long prev_state;
2643
2644         /*
2645          * The previous task will have left us with a preempt_count of 2
2646          * because it left us after:
2647          *
2648          *      schedule()
2649          *        preempt_disable();                    // 1
2650          *        __schedule()
2651          *          raw_spin_lock_irq(&rq->lock)        // 2
2652          *
2653          * Also, see FORK_PREEMPT_COUNT.
2654          */
2655         if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2656                       "corrupted preempt_count: %s/%d/0x%x\n",
2657                       current->comm, current->pid, preempt_count()))
2658                 preempt_count_set(FORK_PREEMPT_COUNT);
2659
             /* The mm pointer was captured in @mm above; it is dropped further down. */
2660         rq->prev_mm = NULL;
2661
2662         /*
2663          * A task struct has one reference for the use as "current".
2664          * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2665          * schedule one last time. The schedule call will never return, and
2666          * the scheduled task must drop that reference.
2667          *
2668          * We must observe prev->state before clearing prev->on_cpu (in
2669          * finish_lock_switch), otherwise a concurrent wakeup can get prev
2670          * running on another CPU and we could race with its RUNNING -> DEAD
2671          * transition, resulting in a double drop.
2672          */
2673         prev_state = prev->state;
2674         vtime_task_switch(prev);
2675         perf_event_task_sched_in(prev, current);
2676         finish_lock_switch(rq, prev);
2677         finish_arch_post_lock_switch();
2678
2679         fire_sched_in_preempt_notifiers(current);
2680         if (mm)
2681                 mmdrop(mm);
             /* Uses the state sampled before finish_lock_switch(), see above. */
2682         if (unlikely(prev_state == TASK_DEAD)) {
2683                 if (prev->sched_class->task_dead)
2684                         prev->sched_class->task_dead(prev);
2685
2686                 /*
2687                  * Remove function-return probe instances associated with this
2688                  * task and put them back on the free list.
2689                  */
2690                 kprobe_flush_task(prev);
2691                 put_task_struct(prev);
2692         }
2693
2694         tick_nohz_task_switch();
2695         return rq;
2696 }
2697
2698 #ifdef CONFIG_SMP
2699
2700 /* rq->lock is NOT held, but preemption is disabled */
2701 static void __balance_callback(struct rq *rq)
2702 {
2703         struct callback_head *head, *next;
2704         void (*func)(struct rq *rq);
2705         unsigned long flags;
2706
2707         raw_spin_lock_irqsave(&rq->lock, flags);
2708         head = rq->balance_callback;
2709         rq->balance_callback = NULL;
2710         while (head) {
2711                 func = (void (*)(struct rq *))head->func;
2712                 next = head->next;
2713                 head->next = NULL;
2714                 head = next;
2715
2716                 func(rq);
2717         }
2718         raw_spin_unlock_irqrestore(&rq->lock, flags);
2719 }
2720
2721 static inline void balance_callback(struct rq *rq)
2722 {
2723         if (unlikely(rq->balance_callback))
2724                 __balance_callback(rq);
2725 }
2726
2727 #else
2728
/* !CONFIG_SMP: balance callbacks are not run. */
static inline void balance_callback(struct rq *rq)
{
	/* Intentionally empty. */
}
2732
2733 #endif
2734
2735 /**
2736  * schedule_tail - first thing a freshly forked thread must call.
2737  * @prev: the thread we just switched away from.
2738  */
2739 asmlinkage __visible void schedule_tail(struct task_struct *prev)
2740         __releases(rq->lock)
2741 {
2742         struct rq *rq;
2743
2744         /*
2745          * New tasks start with FORK_PREEMPT_COUNT, see there and
2746          * finish_task_switch() for details.
2747          *
2748          * finish_task_switch() will drop rq->lock() and lower preempt_count
2749          * and the preempt_enable() will end up enabling preemption (on
2750          * PREEMPT_COUNT kernels).
2751          */
2752
2753         rq = finish_task_switch(prev);
2754         balance_callback(rq);
2755         preempt_enable();
2756
2757         if (current->set_child_tid)
2758                 put_user(task_pid_vnr(current), current->set_child_tid);
2759 }
2760
2761 /*
2762  * context_switch - switch to the new MM and the new thread's register state.
2763  */
2764 static inline struct rq *
2765 context_switch(struct rq *rq, struct task_struct *prev,
2766                struct task_struct *next)
2767 {
2768         struct mm_struct *mm, *oldmm;
2769
2770         prepare_task_switch(rq, prev, next);
2771
2772         mm = next->mm;
2773         oldmm = prev->active_mm;
2774         /*
2775          * For paravirt, this is coupled with an exit in switch_to to
2776          * combine the page table reload and the switch backend into
2777          * one hypercall.
2778          */
2779         arch_start_context_switch(prev);
2780
2781         if (!mm) {
2782                 next->active_mm = oldmm;
2783                 atomic_inc(&oldmm->mm_count);
2784                 enter_lazy_tlb(oldmm, next);
2785         } else
2786                 switch_mm(oldmm, mm, next);
2787
2788         if (!prev->mm) {
2789                 prev->active_mm = NULL;
2790                 rq->prev_mm = oldmm;
2791         }
2792         /*
2793          * Since the runqueue lock will be released by the next
2794          * task (which is an invalid locking op but in the case
2795          * of the scheduler it's an obvious special-case), so we
2796          * do an early lockdep release here:
2797          */
2798         lockdep_unpin_lock(&rq->lock);
2799         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2800
2801         /* Here we just switch the register state and the stack. */
2802         switch_to(prev, next, prev);
2803         barrier();
2804
2805         return finish_task_switch(prev);
2806 }
2807
2808 /*
2809  * nr_running and nr_context_switches:
2810  *
2811  * externally visible scheduler statistics: current number of runnable
2812  * threads, total number of context switches performed since bootup.
2813  */
2814 unsigned long nr_running(void)
2815 {
2816         unsigned long i, sum = 0;
2817
2818         for_each_online_cpu(i)
2819                 sum += cpu_rq(i)->nr_running;
2820
2821         return sum;
2822 }
2823
2824 /*
2825  * Check if only the current task is running on the cpu.
2826  *
2827  * Caution: this function does not check that the caller has disabled
2828  * preemption, thus the result might have a time-of-check-to-time-of-use
2829  * race.  The caller is responsible to use it correctly, for example:
2830  *
2831  * - from a non-preemptable section (of course)
2832  *
2833  * - from a thread that is bound to a single CPU
2834  *
2835  * - in a loop with very short iterations (e.g. a polling loop)
2836  */
2837 bool single_task_running(void)
2838 {
2839         return raw_rq()->nr_running == 1;
2840 }
2841 EXPORT_SYMBOL(single_task_running);
2842
2843 unsigned long long nr_context_switches(void)
2844 {
2845         int i;
2846         unsigned long long sum = 0;
2847
2848         for_each_possible_cpu(i)
2849                 sum += cpu_rq(i)->nr_switches;
2850
2851         return sum;
2852 }
2853
2854 unsigned long nr_iowait(void)
2855 {
2856         unsigned long i, sum = 0;
2857
2858         for_each_possible_cpu(i)
2859                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2860
2861         return sum;
2862 }
2863
2864 unsigned long nr_iowait_cpu(int cpu)
2865 {
2866         struct rq *this = cpu_rq(cpu);
2867         return atomic_read(&this->nr_iowait);
2868 }
2869
2870 #ifdef CONFIG_CPU_QUIET
2871 u64 nr_running_integral(unsigned int cpu)
2872 {
2873         unsigned int seqcnt;
2874         u64 integral;
2875         struct rq *q;
2876
2877         if (cpu >= nr_cpu_ids)
2878                 return 0;
2879
2880         q = cpu_rq(cpu);
2881
2882         /*
2883          * Update average to avoid reading stalled value if there were
2884          * no run-queue changes for a long time. On the other hand if
2885          * the changes are happening right now, just read current value
2886          * directly.
2887          */
2888
2889         seqcnt = read_seqcount_begin(&q->ave_seqcnt);
2890         integral = do_nr_running_integral(q);
2891         if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
2892                 read_seqcount_begin(&q->ave_seqcnt);
2893                 integral = q->nr_running_integral;
2894         }
2895
2896         return integral;
2897 }
2898 #endif
2899
2900 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2901 {
2902         struct rq *rq = this_rq();
2903         *nr_waiters = atomic_read(&rq->nr_iowait);
2904         *load = rq->load.weight;
2905 }
2906
2907 #ifdef CONFIG_SMP
2908
2909 /*
2910  * sched_exec - execve() is a valuable balancing opportunity, because at
2911  * this point the task has the smallest effective memory and cache footprint.
2912  */
2913 void sched_exec(void)
2914 {
2915         struct task_struct *p = current;
2916         unsigned long flags;
2917         int dest_cpu;
2918
2919         raw_spin_lock_irqsave(&p->pi_lock, flags);
2920         dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
2921         if (dest_cpu == smp_processor_id())
2922                 goto unlock;
2923
2924         if (likely(cpu_active(dest_cpu))) {
2925                 struct migration_arg arg = { p, dest_cpu };
2926
2927                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2928                 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2929                 return;
2930         }
2931 unlock:
2932         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2933 }
2934
2935 #endif
2936
2937 DEFINE_PER_CPU(struct kernel_stat, kstat);
2938 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2939
2940 EXPORT_PER_CPU_SYMBOL(kstat);
2941 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2942
2943 /*
2944  * Return accounted runtime for the task.
2945  * In case the task is currently running, return the runtime plus current's
2946  * pending runtime that have not been accounted yet.
2947  */
2948 unsigned long long task_sched_runtime(struct task_struct *p)
2949 {
2950         unsigned long flags;
2951         struct rq *rq;
2952         u64 ns;
2953
2954 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2955         /*
2956          * 64-bit doesn't need locks to atomically read a 64bit value.
2957          * So we have a optimization chance when the task's delta_exec is 0.
2958          * Reading ->on_cpu is racy, but this is ok.
2959          *
2960          * If we race with it leaving cpu, we'll take a lock. So we're correct.
2961          * If we race with it entering cpu, unaccounted time is 0. This is
2962          * indistinguishable from the read occurring a few cycles earlier.
2963          * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2964          * been accounted, so we're correct here as well.
2965          */
2966         if (!p->on_cpu || !task_on_rq_queued(p))
2967                 return p->se.sum_exec_runtime;
2968 #endif
2969
2970         rq = task_rq_lock(p, &flags);
2971         /*
2972          * Must be ->curr _and_ ->on_rq.  If dequeued, we would
2973          * project cycles that may never be accounted to this
2974          * thread, breaking clock_gettime().
2975          */
2976         if (task_current(rq, p) && task_on_rq_queued(p)) {
2977                 update_rq_clock(rq);
2978                 p->sched_class->update_curr(rq);
2979         }
2980         ns = p->se.sum_exec_runtime;
2981         task_rq_unlock(rq, p, &flags);
2982
2983         return ns;
2984 }
2985
2986 /*
2987  * This function gets called by the timer code, with HZ frequency.
2988  * We call it with interrupts disabled.
2989  */
2990 void scheduler_tick(void)
2991 {
2992         int cpu = smp_processor_id();
2993         struct rq *rq = cpu_rq(cpu);
2994         struct task_struct *curr = rq->curr;
2995
2996         sched_clock_tick();
2997
2998         raw_spin_lock(&rq->lock);
2999         walt_set_window_start(rq);
3000         walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
3001                         walt_ktime_clock(), 0);
3002         update_rq_clock(rq);
3003         curr->sched_class->task_tick(rq, curr, 0);
3004         update_cpu_load_active(rq);
3005         calc_global_load_tick(rq);
3006         raw_spin_unlock(&rq->lock);
3007
3008         perf_event_task_tick();
3009
3010 #ifdef CONFIG_SMP
3011         rq->idle_balance = idle_cpu(cpu);
3012         trigger_load_balance(rq);
3013 #endif
3014         rq_last_tick_reset(rq);
3015
3016         if (curr->sched_class == &fair_sched_class)
3017                 check_for_migration(rq, curr);
3018 }
3019
3020 #ifdef CONFIG_NO_HZ_FULL
3021 /**
3022  * scheduler_tick_max_deferment
3023  *
3024  * Keep at least one tick per second when a single
3025  * active task is running because the scheduler doesn't
3026  * yet completely support full dynticks environment.
3027  *
3028  * This makes sure that uptime, CFS vruntime, load
3029  * balancing, etc... continue to move forward, even
3030  * with a very low granularity.
3031  *
3032  * Return: Maximum deferment in nanoseconds.
3033  */
3034 u64 scheduler_tick_max_deferment(void)
3035 {
3036         struct rq *rq = this_rq();
3037         unsigned long next, now = READ_ONCE(jiffies);
3038
3039         next = rq->last_sched_tick + HZ;
3040
3041         if (time_before_eq(next, now))
3042                 return 0;
3043
3044         return jiffies_to_nsecs(next - now);
3045 }
3046 #endif
3047
3048 notrace unsigned long get_parent_ip(unsigned long addr)
3049 {
3050         if (in_lock_functions(addr)) {
3051                 addr = CALLER_ADDR2;
3052                 if (in_lock_functions(addr))
3053                         addr = CALLER_ADDR3;
3054         }
3055         return addr;
3056 }
3057
3058 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3059                                 defined(CONFIG_PREEMPT_TRACER))
3060
3061 void preempt_count_add(int val)
3062 {
3063 #ifdef CONFIG_DEBUG_PREEMPT
3064         /*
3065          * Underflow?
3066          */
3067         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3068                 return;
3069 #endif
3070         __preempt_count_add(val);
3071 #ifdef CONFIG_DEBUG_PREEMPT
3072         /*
3073          * Spinlock count overflowing soon?
3074          */
3075         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3076                                 PREEMPT_MASK - 10);
3077 #endif
3078         if (preempt_count() == val) {
3079                 unsigned long ip = get_parent_ip(CALLER_ADDR1);
3080 #ifdef CONFIG_DEBUG_PREEMPT
3081                 current->preempt_disable_ip = ip;
3082 #endif
3083                 trace_preempt_off(CALLER_ADDR0, ip);
3084         }
3085 }
3086 EXPORT_SYMBOL(preempt_count_add);
3087 NOKPROBE_SYMBOL(preempt_count_add);
3088
3089 void preempt_count_sub(int val)
3090 {
3091 #ifdef CONFIG_DEBUG_PREEMPT
3092         /*
3093          * Underflow?
3094          */
3095         if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3096                 return;
3097         /*
3098          * Is the spinlock portion underflowing?
3099          */
3100         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3101                         !(preempt_count() & PREEMPT_MASK)))
3102                 return;
3103 #endif
3104
3105         if (preempt_count() == val)
3106                 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3107         __preempt_count_sub(val);
3108 }
3109 EXPORT_SYMBOL(preempt_count_sub);
3110 NOKPROBE_SYMBOL(preempt_count_sub);
3111
3112 #endif
3113
3114 /*
3115  * Print scheduling while atomic bug:
3116  */
3117 static noinline void __schedule_bug(struct task_struct *prev)
3118 {
3119         if (oops_in_progress)
3120                 return;
3121
3122         printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3123                 prev->comm, prev->pid, preempt_count());
3124
3125         debug_show_held_locks(prev);
3126         print_modules();
3127         if (irqs_disabled())
3128                 print_irqtrace_events(prev);
3129 #ifdef CONFIG_DEBUG_PREEMPT
3130         if (in_atomic_preempt_off()) {
3131                 pr_err("Preemption disabled at:");
3132                 print_ip_sym(current->preempt_disable_ip);
3133                 pr_cont("\n");
3134         }
3135 #endif
3136         dump_stack();
3137         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3138 }
3139
3140 /*
3141  * Various schedule()-time debugging checks and statistics:
3142  */
3143 static inline void schedule_debug(struct task_struct *prev)
3144 {
3145 #ifdef CONFIG_SCHED_STACK_END_CHECK
3146         if (task_stack_end_corrupted(prev))
3147                 panic("corrupted stack end detected inside scheduler\n");
3148 #endif
3149
3150         if (unlikely(in_atomic_preempt_off())) {
3151                 __schedule_bug(prev);
3152                 preempt_count_set(PREEMPT_DISABLED);
3153         }
3154         rcu_sleep_check();
3155
3156         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3157
3158         schedstat_inc(this_rq(), sched_count);
3159 }
3160
3161 /*
3162  * Pick up the highest-prio task:
3163  */
3164 static inline struct task_struct *
3165 pick_next_task(struct rq *rq, struct task_struct *prev)
3166 {
3167         const struct sched_class *class = &fair_sched_class;
3168         struct task_struct *p;
3169
3170         /*
3171          * Optimization: we know that if all tasks are in
3172          * the fair class we can call that function directly:
3173          */
3174         if (likely(prev->sched_class == class &&
3175                    rq->nr_running == rq->cfs.h_nr_running)) {
3176                 p = fair_sched_class.pick_next_task(rq, prev);
3177                 if (unlikely(p == RETRY_TASK))
3178                         goto again;
3179
3180                 /* assumes fair_sched_class->next == idle_sched_class */
3181                 if (unlikely(!p))
3182                         p = idle_sched_class.pick_next_task(rq, prev);
3183
3184                 return p;
3185         }
3186
3187 again:
3188         for_each_class(class) {
3189                 p = class->pick_next_task(rq, prev);
3190                 if (p) {
3191                         if (unlikely(p == RETRY_TASK))
3192                                 goto again;
3193                         return p;
3194                 }
3195         }
3196
3197         BUG(); /* the idle class will always have a runnable task */
3198 }
3199
3200 /*
3201  * __schedule() is the main scheduler function.
3202  *
3203  * The main means of driving the scheduler and thus entering this function are:
3204  *
3205  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
3206  *
3207  *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
3208  *      paths. For example, see arch/x86/entry_64.S.
3209  *
3210  *      To drive preemption between tasks, the scheduler sets the flag in timer
3211  *      interrupt handler scheduler_tick().
3212  *
3213  *   3. Wakeups don't really cause entry into schedule(). They add a
3214  *      task to the run-queue and that's it.
3215  *
3216  *      Now, if the new task added to the run-queue preempts the current
3217  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
3218  *      called on the nearest possible occasion:
3219  *
3220  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
3221  *
3222  *         - in syscall or exception context, at the next outmost
3223  *           preempt_enable(). (this might be as soon as the wake_up()'s
3224  *           spin_unlock()!)
3225  *
3226  *         - in IRQ context, return from interrupt-handler to
3227  *           preemptible context
3228  *
3229  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
3230  *         then at the next:
3231  *
3232  *          - cond_resched() call
3233  *          - explicit schedule() call
3234  *          - return from syscall or exception to user-space
3235  *          - return from interrupt-handler to user-space
3236  *
3237  * WARNING: must be called with preemption disabled!
3238  */
3239 static void __sched notrace __schedule(bool preempt)
3240 {
3241         struct task_struct *prev, *next;
3242         unsigned long *switch_count;
3243         struct rq *rq;
3244         int cpu;
3245         u64 wallclock;
3246
3247         cpu = smp_processor_id();
3248         rq = cpu_rq(cpu);
3249         rcu_note_context_switch();
3250         prev = rq->curr;
3251
3252         /*
3253          * do_exit() calls schedule() with preemption disabled as an exception;
3254          * however we must fix that up, otherwise the next task will see an
3255          * inconsistent (higher) preempt count.
3256          *
3257          * It also avoids the below schedule_debug() test from complaining
3258          * about this.
3259          */
3260         if (unlikely(prev->state == TASK_DEAD))
3261                 preempt_enable_no_resched_notrace();
3262
3263         schedule_debug(prev);
3264
3265         if (sched_feat(HRTICK))
3266                 hrtick_clear(rq);
3267
3268         /*
3269          * Make sure that signal_pending_state()->signal_pending() below
3270          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3271          * done by the caller to avoid the race with signal_wake_up().
3272          */
3273         smp_mb__before_spinlock();
3274         raw_spin_lock_irq(&rq->lock);
3275         lockdep_pin_lock(&rq->lock);
3276
3277         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
3278
3279         switch_count = &prev->nivcsw;
3280         if (!preempt && prev->state) {
3281                 if (unlikely(signal_pending_state(prev->state, prev))) {
3282                         prev->state = TASK_RUNNING;
3283                 } else {
3284                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
3285                         prev->on_rq = 0;
3286
3287                         /*
3288                          * If a worker went to sleep, notify and ask workqueue
3289                          * whether it wants to wake up a task to maintain
3290                          * concurrency.
3291                          */
3292                         if (prev->flags & PF_WQ_WORKER) {
3293                                 struct task_struct *to_wakeup;
3294
3295                                 to_wakeup = wq_worker_sleeping(prev, cpu);
3296                                 if (to_wakeup)
3297                                         try_to_wake_up_local(to_wakeup);
3298                         }
3299                 }
3300                 switch_count = &prev->nvcsw;
3301         }
3302
3303         if (task_on_rq_queued(prev))
3304                 update_rq_clock(rq);
3305
3306         next = pick_next_task(rq, prev);
3307         wallclock = walt_ktime_clock();
3308         walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
3309         walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
3310         clear_tsk_need_resched(prev);
3311         clear_preempt_need_resched();
3312         rq->clock_skip_update = 0;
3313
3314         if (likely(prev != next)) {
3315 #ifdef CONFIG_SCHED_WALT
3316                 if (!prev->on_rq)
3317                         prev->last_sleep_ts = wallclock;
3318 #endif
3319                 rq->nr_switches++;
3320                 rq->curr = next;
3321                 ++*switch_count;
3322
3323                 trace_sched_switch(preempt, prev, next);
3324                 rq = context_switch(rq, prev, next); /* unlocks the rq */
3325                 cpu = cpu_of(rq);
3326         } else {
3327                 lockdep_unpin_lock(&rq->lock);
3328                 raw_spin_unlock_irq(&rq->lock);
3329         }
3330
3331         balance_callback(rq);
3332 }
3333
3334 static inline void sched_submit_work(struct task_struct *tsk)
3335 {
3336         if (!tsk->state || tsk_is_pi_blocked(tsk))
3337                 return;
3338         /*
3339          * If we are going to sleep and we have plugged IO queued,
3340          * make sure to submit it to avoid deadlocks.
3341          */
3342         if (blk_needs_flush_plug(tsk))
3343                 blk_schedule_flush_plug(tsk);
3344 }
3345
3346 asmlinkage __visible void __sched schedule(void)
3347 {
3348         struct task_struct *tsk = current;
3349
3350         sched_submit_work(tsk);
3351         do {
3352                 preempt_disable();
3353                 __schedule(false);
3354                 sched_preempt_enable_no_resched();
3355         } while (need_resched());
3356 }
3357 EXPORT_SYMBOL(schedule);
3358
3359 #ifdef CONFIG_CONTEXT_TRACKING
3360 asmlinkage __visible void __sched schedule_user(void)
3361 {
3362         /*
3363          * If we come here after a random call to set_need_resched(),
3364          * or we have been woken up remotely but the IPI has not yet arrived,
3365          * we haven't yet exited the RCU idle mode. Do it here manually until
3366          * we find a better solution.
3367          *
3368          * NB: There are buggy callers of this function.  Ideally we
3369          * should warn if prev_state != CONTEXT_USER, but that will trigger
3370          * too frequently to make sense yet.
3371          */
3372         enum ctx_state prev_state = exception_enter();
3373         schedule();
3374         exception_exit(prev_state);
3375 }
3376 #endif
3377
3378 /**
3379  * schedule_preempt_disabled - called with preemption disabled
3380  *
3381  * Returns with preemption disabled. Note: preempt_count must be 1
3382  */
3383 void __sched schedule_preempt_disabled(void)
3384 {
3385         sched_preempt_enable_no_resched();
3386         schedule();
3387         preempt_disable();
3388 }
3389
3390 static void __sched notrace preempt_schedule_common(void)
3391 {
3392         do {
3393                 preempt_disable_notrace();
3394                 __schedule(true);
3395                 preempt_enable_no_resched_notrace();
3396
3397                 /*
3398                  * Check again in case we missed a preemption opportunity
3399                  * between schedule and now.
3400                  */
3401         } while (need_resched());
3402 }
3403
3404 #ifdef CONFIG_PREEMPT
3405 /*
3406  * this is the entry point to schedule() from in-kernel preemption
3407  * off of preempt_enable. Kernel preemptions off return from interrupt
3408  * occur there and call schedule directly.
3409  */
3410 asmlinkage __visible void __sched notrace preempt_schedule(void)
3411 {
3412         /*
3413          * If there is a non-zero preempt_count or interrupts are disabled,
3414          * we do not want to preempt the current task. Just return..
3415          */
3416         if (likely(!preemptible()))
3417                 return;
3418
3419         preempt_schedule_common();
3420 }
3421 NOKPROBE_SYMBOL(preempt_schedule);
3422 EXPORT_SYMBOL(preempt_schedule);
3423
3424 /**
3425  * preempt_schedule_notrace - preempt_schedule called by tracing
3426  *
3427  * The tracing infrastructure uses preempt_enable_notrace to prevent
3428  * recursion and tracing preempt enabling caused by the tracing
3429  * infrastructure itself. But as tracing can happen in areas coming
3430  * from userspace or just about to enter userspace, a preempt enable
3431  * can occur before user_exit() is called. This will cause the scheduler
3432  * to be called when the system is still in usermode.
3433  *
3434  * To prevent this, the preempt_enable_notrace will use this function
3435  * instead of preempt_schedule() to exit user context if needed before
3436  * calling the scheduler.
3437  */
3438 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3439 {
3440         enum ctx_state prev_ctx;
3441
3442         if (likely(!preemptible()))
3443                 return;
3444
3445         do {
3446                 preempt_disable_notrace();
3447                 /*
3448                  * Needs preempt disabled in case user_exit() is traced
3449                  * and the tracer calls preempt_enable_notrace() causing
3450                  * an infinite recursion.
3451                  */
3452                 prev_ctx = exception_enter();
3453                 __schedule(true);
3454                 exception_exit(prev_ctx);
3455
3456                 preempt_enable_no_resched_notrace();
3457         } while (need_resched());
3458 }
3459 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3460
3461 #endif /* CONFIG_PREEMPT */
3462
3463 /*
3464  * this is the entry point to schedule() from kernel preemption
3465  * off of irq context.
3466  * Note, that this is called and return with irqs disabled. This will
3467  * protect us against recursive calling from irq.
3468  */
3469 asmlinkage __visible void __sched preempt_schedule_irq(void)
3470 {
3471         enum ctx_state prev_state;
3472
3473         /* Catch callers which need to be fixed */
3474         BUG_ON(preempt_count() || !irqs_disabled());
3475
3476         prev_state = exception_enter();
3477
3478         do {
3479                 preempt_disable();
3480                 local_irq_enable();
3481                 __schedule(true);
3482                 local_irq_disable();
3483                 sched_preempt_enable_no_resched();
3484         } while (need_resched());
3485
3486         exception_exit(prev_state);
3487 }
3488
3489 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3490                           void *key)
3491 {
3492         return try_to_wake_up(curr->private, mode, wake_flags, 1);
3493 }
3494 EXPORT_SYMBOL(default_wake_function);
3495
3496 #ifdef CONFIG_RT_MUTEXES
3497
3498 /*
3499  * rt_mutex_setprio - set the current priority of a task
3500  * @p: task
3501  * @prio: prio value (kernel-internal form)
3502  *
3503  * This function changes the 'effective' priority of a task. It does
3504  * not touch ->normal_prio like __setscheduler().
3505  *
3506  * Used by the rt_mutex code to implement priority inheritance
3507  * logic. Call site only calls if the priority of the task changed.
3508  */
3509 void rt_mutex_setprio(struct task_struct *p, int prio)
3510 {
3511         int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
3512         struct rq *rq;
3513         const struct sched_class *prev_class;
3514
3515         BUG_ON(prio > MAX_PRIO);
3516
3517         rq = __task_rq_lock(p);
3518         update_rq_clock(rq);
3519
3520         /*
3521          * Idle task boosting is a nono in general. There is one
3522          * exception, when PREEMPT_RT and NOHZ is active:
3523          *
3524          * The idle task calls get_next_timer_interrupt() and holds
3525          * the timer wheel base->lock on the CPU and another CPU wants
3526          * to access the timer (probably to cancel it). We can safely
3527          * ignore the boosting request, as the idle CPU runs this code
3528          * with interrupts disabled and will complete the lock
3529          * protected section without being interrupted. So there is no
3530          * real need to boost.
3531          */
3532         if (unlikely(p == rq->idle)) {
3533                 WARN_ON(p != rq->curr);
3534                 WARN_ON(p->pi_blocked_on);
3535                 goto out_unlock;
3536         }
3537
3538         trace_sched_pi_setprio(p, prio);
3539         oldprio = p->prio;
3540         prev_class = p->sched_class;
3541         queued = task_on_rq_queued(p);
3542         running = task_current(rq, p);
3543         if (queued)
3544                 dequeue_task(rq, p, DEQUEUE_SAVE);
3545         if (running)
3546                 put_prev_task(rq, p);
3547
3548         /*
3549          * Boosting condition are:
3550          * 1. -rt task is running and holds mutex A
3551          *      --> -dl task blocks on mutex A
3552          *
3553          * 2. -dl task is running and holds mutex A
3554          *      --> -dl task blocks on mutex A and could preempt the
3555          *          running task
3556          */
3557         if (dl_prio(prio)) {
3558                 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3559                 if (!dl_prio(p->normal_prio) ||
3560                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3561                         p->dl.dl_boosted = 1;
3562                         enqueue_flag |= ENQUEUE_REPLENISH;
3563                 } else
3564                         p->dl.dl_boosted = 0;
3565                 p->sched_class = &dl_sched_class;
3566         } else if (rt_prio(prio)) {
3567                 if (dl_prio(oldprio))
3568                         p->dl.dl_boosted = 0;
3569                 if (oldprio < prio)
3570                         enqueue_flag |= ENQUEUE_HEAD;
3571                 p->sched_class = &rt_sched_class;
3572         } else {
3573                 if (dl_prio(oldprio))
3574                         p->dl.dl_boosted = 0;
3575                 if (rt_prio(oldprio))
3576                         p->rt.timeout = 0;
3577                 p->sched_class = &fair_sched_class;
3578         }
3579
3580         p->prio = prio;
3581
3582         if (running)
3583                 p->sched_class->set_curr_task(rq);
3584         if (queued)
3585                 enqueue_task(rq, p, enqueue_flag);
3586
3587         check_class_changed(rq, p, prev_class, oldprio);
3588 out_unlock:
3589         preempt_disable(); /* avoid rq from going away on us */
3590         __task_rq_unlock(rq);
3591
3592         balance_callback(rq);
3593         preempt_enable();
3594 }
3595 #endif
3596
3597 void set_user_nice(struct task_struct *p, long nice)
3598 {
3599         int old_prio, delta, queued;
3600         unsigned long flags;
3601         struct rq *rq;
3602
3603         if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3604                 return;
3605         /*
3606          * We have to be careful, if called from sys_setpriority(),
3607          * the task might be in the middle of scheduling on another CPU.
3608          */
3609         rq = task_rq_lock(p, &flags);
3610         update_rq_clock(rq);
3611
3612         /*
3613          * The RT priorities are set via sched_setscheduler(), but we still
3614          * allow the 'normal' nice value to be set - but as expected
3615          * it wont have any effect on scheduling until the task is
3616          * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
3617          */
3618         if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3619                 p->static_prio = NICE_TO_PRIO(nice);
3620                 goto out_unlock;
3621         }
3622         queued = task_on_rq_queued(p);
3623         if (queued)
3624                 dequeue_task(rq, p, DEQUEUE_SAVE);
3625
3626         p->static_prio = NICE_TO_PRIO(nice);
3627         set_load_weight(p);
3628         old_prio = p->prio;
3629         p->prio = effective_prio(p);
3630         delta = p->prio - old_prio;
3631
3632         if (queued) {
3633                 enqueue_task(rq, p, ENQUEUE_RESTORE);
3634                 /*
3635                  * If the task increased its priority or is running and
3636                  * lowered its priority, then reschedule its CPU:
3637                  */
3638                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3639                         resched_curr(rq);
3640         }
3641 out_unlock:
3642         task_rq_unlock(rq, p, &flags);
3643 }
3644 EXPORT_SYMBOL(set_user_nice);
3645
3646 /*
3647  * can_nice - check if a task can reduce its nice value
3648  * @p: task
3649  * @nice: nice value
3650  */
3651 int can_nice(const struct task_struct *p, const int nice)
3652 {
3653         /* convert nice value [19,-20] to rlimit style value [1,40] */
3654         int nice_rlim = nice_to_rlimit(nice);
3655
3656         return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3657                 capable(CAP_SYS_NICE));
3658 }
3659
3660 #ifdef __ARCH_WANT_SYS_NICE
3661
3662 /*
3663  * sys_nice - change the priority of the current process.
3664  * @increment: priority increment
3665  *
3666  * sys_setpriority is a more generic, but much slower function that
3667  * does similar things.
3668  */
3669 SYSCALL_DEFINE1(nice, int, increment)
3670 {
3671         long nice, retval;
3672
3673         /*
3674          * Setpriority might change our priority at the same moment.
3675          * We don't have to worry. Conceptually one call occurs first
3676          * and we have a single winner.
3677          */
3678         increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3679         nice = task_nice(current) + increment;
3680
3681         nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3682         if (increment < 0 && !can_nice(current, nice))
3683                 return -EPERM;
3684
3685         retval = security_task_setnice(current, nice);
3686         if (retval)
3687                 return retval;
3688
3689         set_user_nice(current, nice);
3690         return 0;
3691 }
3692
3693 #endif
3694
3695 /**
3696  * task_prio - return the priority value of a given task.
3697  * @p: the task in question.
3698  *
3699  * Return: The priority value as seen by users in /proc.
3700  * RT tasks are offset by -200. Normal tasks are centered
3701  * around 0, value goes from -16 to +15.
3702  */
3703 int task_prio(const struct task_struct *p)
3704 {
3705         return p->prio - MAX_RT_PRIO;
3706 }
3707
3708 /**
3709  * idle_cpu - is a given cpu idle currently?
3710  * @cpu: the processor in question.
3711  *
3712  * Return: 1 if the CPU is currently idle. 0 otherwise.
3713  */
3714 int idle_cpu(int cpu)
3715 {
3716         struct rq *rq = cpu_rq(cpu);
3717
3718         if (rq->curr != rq->idle)
3719                 return 0;
3720
3721         if (rq->nr_running)
3722                 return 0;
3723
3724 #ifdef CONFIG_SMP
3725         if (!llist_empty(&rq->wake_list))
3726                 return 0;
3727 #endif
3728
3729         return 1;
3730 }
3731
3732 /**
3733  * idle_task - return the idle task for a given cpu.
3734  * @cpu: the processor in question.
3735  *
3736  * Return: The idle task for the cpu @cpu.
3737  */
3738 struct task_struct *idle_task(int cpu)
3739 {
3740         return cpu_rq(cpu)->idle;
3741 }
3742
3743 /**
3744  * find_process_by_pid - find a process with a matching PID value.
3745  * @pid: the pid in question.
3746  *
3747  * The task of @pid, if found. %NULL otherwise.
3748  */
3749 static struct task_struct *find_process_by_pid(pid_t pid)
3750 {
3751         return pid ? find_task_by_vpid(pid) : current;
3752 }
3753
3754 /*
3755  * This function initializes the sched_dl_entity of a newly becoming
3756  * SCHED_DEADLINE task.
3757  *
3758  * Only the static values are considered here, the actual runtime and the
3759  * absolute deadline will be properly calculated when the task is enqueued
3760  * for the first time with its new policy.
3761  */
3762 static void
3763 __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3764 {
3765         struct sched_dl_entity *dl_se = &p->dl;
3766
3767         dl_se->dl_runtime = attr->sched_runtime;
3768         dl_se->dl_deadline = attr->sched_deadline;
3769         dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3770         dl_se->flags = attr->sched_flags;
3771         dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3772
3773         /*
3774          * Changing the parameters of a task is 'tricky' and we're not doing
3775          * the correct thing -- also see task_dead_dl() and switched_from_dl().
3776          *
3777          * What we SHOULD do is delay the bandwidth release until the 0-lag
3778          * point. This would include retaining the task_struct until that time
3779          * and change dl_overflow() to not immediately decrement the current
3780          * amount.
3781          *
3782          * Instead we retain the current runtime/deadline and let the new
3783          * parameters take effect after the current reservation period lapses.
3784          * This is safe (albeit pessimistic) because the 0-lag point is always
3785          * before the current scheduling deadline.
3786          *
3787          * We can still have temporary overloads because we do not delay the
3788          * change in bandwidth until that time; so admission control is
3789          * not on the safe side. It does however guarantee tasks will never
3790          * consume more than promised.
3791          */
3792 }
3793
/*
 * Pseudo-policy used by sched_setparam(): passing -1 as the policy tells
 * the functions it calls to leave the task's current policy unchanged.
 */
#define SETPARAM_POLICY (-1)
3799
3800 static void __setscheduler_params(struct task_struct *p,
3801                 const struct sched_attr *attr)
3802 {
3803         int policy = attr->sched_policy;
3804
3805         if (policy == SETPARAM_POLICY)
3806                 policy = p->policy;
3807
3808         p->policy = policy;
3809
3810         if (dl_policy(policy))
3811                 __setparam_dl(p, attr);
3812         else if (fair_policy(policy))
3813                 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3814
3815         /*
3816          * __sched_setscheduler() ensures attr->sched_priority == 0 when
3817          * !rt_policy. Always setting this ensures that things like
3818          * getparam()/getattr() don't report silly values for !rt tasks.
3819          */
3820         p->rt_priority = attr->sched_priority;
3821         p->normal_prio = normal_prio(p);
3822         set_load_weight(p);
3823 }
3824
3825 /* Actually do priority change: must hold pi & rq lock. */
3826 static void __setscheduler(struct rq *rq, struct task_struct *p,
3827                            const struct sched_attr *attr, bool keep_boost)
3828 {
3829         __setscheduler_params(p, attr);
3830
3831         /*
3832          * Keep a potential priority boosting if called from
3833          * sched_setscheduler().
3834          */
3835         if (keep_boost)
3836                 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
3837         else
3838                 p->prio = normal_prio(p);
3839
3840         if (dl_prio(p->prio))
3841                 p->sched_class = &dl_sched_class;
3842         else if (rt_prio(p->prio))
3843                 p->sched_class = &rt_sched_class;
3844         else
3845                 p->sched_class = &fair_sched_class;
3846 }
3847
3848 static void
3849 __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3850 {
3851         struct sched_dl_entity *dl_se = &p->dl;
3852
3853         attr->sched_priority = p->rt_priority;
3854         attr->sched_runtime = dl_se->dl_runtime;
3855         attr->sched_deadline = dl_se->dl_deadline;
3856         attr->sched_period = dl_se->dl_period;
3857         attr->sched_flags = dl_se->flags;
3858 }
3859
3860 /*
3861  * This function validates the new parameters of a -deadline task.
3862  * We ask for the deadline not being zero, and greater or equal
3863  * than the runtime, as well as the period of being zero or
3864  * greater than deadline. Furthermore, we have to be sure that
3865  * user parameters are above the internal resolution of 1us (we
3866  * check sched_runtime only since it is always the smaller one) and
3867  * below 2^63 ns (we have to check both sched_deadline and
3868  * sched_period, as the latter can be zero).
3869  */
3870 static bool
3871 __checkparam_dl(const struct sched_attr *attr)
3872 {
3873         /* deadline != 0 */
3874         if (attr->sched_deadline == 0)
3875                 return false;
3876
3877         /*
3878          * Since we truncate DL_SCALE bits, make sure we're at least
3879          * that big.
3880          */
3881         if (attr->sched_runtime < (1ULL << DL_SCALE))
3882                 return false;
3883
3884         /*
3885          * Since we use the MSB for wrap-around and sign issues, make
3886          * sure it's not set (mind that period can be equal to zero).
3887          */
3888         if (attr->sched_deadline & (1ULL << 63) ||
3889             attr->sched_period & (1ULL << 63))
3890                 return false;
3891
3892         /* runtime <= deadline <= period (if period != 0) */
3893         if ((attr->sched_period != 0 &&
3894              attr->sched_period < attr->sched_deadline) ||
3895             attr->sched_deadline < attr->sched_runtime)
3896                 return false;
3897
3898         return true;
3899 }
3900
3901 /*
3902  * check the target process has a UID that matches the current process's
3903  */
3904 static bool check_same_owner(struct task_struct *p)
3905 {
3906         const struct cred *cred = current_cred(), *pcred;
3907         bool match;
3908
3909         rcu_read_lock();
3910         pcred = __task_cred(p);
3911         match = (uid_eq(cred->euid, pcred->euid) ||
3912                  uid_eq(cred->euid, pcred->uid));
3913         rcu_read_unlock();
3914         return match;
3915 }
3916
3917 static bool dl_param_changed(struct task_struct *p,
3918                 const struct sched_attr *attr)
3919 {
3920         struct sched_dl_entity *dl_se = &p->dl;
3921
3922         if (dl_se->dl_runtime != attr->sched_runtime ||
3923                 dl_se->dl_deadline != attr->sched_deadline ||
3924                 dl_se->dl_period != attr->sched_period ||
3925                 dl_se->flags != attr->sched_flags)
3926                 return true;
3927
3928         return false;
3929 }
3930
3931 static int __sched_setscheduler(struct task_struct *p,
3932                                 const struct sched_attr *attr,
3933                                 bool user, bool pi)
3934 {
3935         int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3936                       MAX_RT_PRIO - 1 - attr->sched_priority;
3937         int retval, oldprio, oldpolicy = -1, queued, running;
3938         int new_effective_prio, policy = attr->sched_policy;
3939         unsigned long flags;
3940         const struct sched_class *prev_class;
3941         struct rq *rq;
3942         int reset_on_fork;
3943
3944         /* may grab non-irq protected spin_locks */
3945         BUG_ON(in_interrupt());
3946 recheck:
3947         /* double check policy once rq lock held */
3948         if (policy < 0) {
3949                 reset_on_fork = p->sched_reset_on_fork;
3950                 policy = oldpolicy = p->policy;
3951         } else {
3952                 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3953
3954                 if (!valid_policy(policy))
3955                         return -EINVAL;
3956         }
3957
3958         if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3959                 return -EINVAL;
3960
3961         /*
3962          * Valid priorities for SCHED_FIFO and SCHED_RR are
3963          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3964          * SCHED_BATCH and SCHED_IDLE is 0.
3965          */
3966         if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3967             (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3968                 return -EINVAL;
3969         if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3970             (rt_policy(policy) != (attr->sched_priority != 0)))
3971                 return -EINVAL;
3972
3973         /*
3974          * Allow unprivileged RT tasks to decrease priority:
3975          */
3976         if (user && !capable(CAP_SYS_NICE)) {
3977                 if (fair_policy(policy)) {
3978                         if (attr->sched_nice < task_nice(p) &&
3979                             !can_nice(p, attr->sched_nice))
3980                                 return -EPERM;
3981                 }
3982
3983                 if (rt_policy(policy)) {
3984                         unsigned long rlim_rtprio =
3985                                         task_rlimit(p, RLIMIT_RTPRIO);
3986
3987                         /* can't set/change the rt policy */
3988                         if (policy != p->policy && !rlim_rtprio)
3989                                 return -EPERM;
3990
3991                         /* can't increase priority */
3992                         if (attr->sched_priority > p->rt_priority &&
3993                             attr->sched_priority > rlim_rtprio)
3994                                 return -EPERM;
3995                 }
3996
3997                  /*
3998                   * Can't set/change SCHED_DEADLINE policy at all for now
3999                   * (safest behavior); in the future we would like to allow
4000                   * unprivileged DL tasks to increase their relative deadline
4001                   * or reduce their runtime (both ways reducing utilization)
4002                   */
4003                 if (dl_policy(policy))
4004                         return -EPERM;
4005
4006                 /*
4007                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
4008                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4009                  */
4010                 if (idle_policy(p->policy) && !idle_policy(policy)) {
4011                         if (!can_nice(p, task_nice(p)))
4012                                 return -EPERM;
4013                 }
4014
4015                 /* can't change other user's priorities */
4016                 if (!check_same_owner(p))
4017                         return -EPERM;
4018
4019                 /* Normal users shall not reset the sched_reset_on_fork flag */
4020                 if (p->sched_reset_on_fork && !reset_on_fork)
4021                         return -EPERM;
4022         }
4023
4024         if (user) {
4025                 retval = security_task_setscheduler(p);
4026                 if (retval)
4027                         return retval;
4028         }
4029
4030         /*
4031          * make sure no PI-waiters arrive (or leave) while we are
4032          * changing the priority of the task:
4033          *
4034          * To be able to change p->policy safely, the appropriate
4035          * runqueue lock must be held.
4036          */
4037         rq = task_rq_lock(p, &flags);
4038         update_rq_clock(rq);
4039
4040         /*
4041          * Changing the policy of the stop threads its a very bad idea
4042          */
4043         if (p == rq->stop) {
4044                 task_rq_unlock(rq, p, &flags);
4045                 return -EINVAL;
4046         }
4047
4048         /*
4049          * If not changing anything there's no need to proceed further,
4050          * but store a possible modification of reset_on_fork.
4051          */
4052         if (unlikely(policy == p->policy)) {
4053                 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4054                         goto change;
4055                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4056                         goto change;
4057                 if (dl_policy(policy) && dl_param_changed(p, attr))
4058                         goto change;
4059
4060                 p->sched_reset_on_fork = reset_on_fork;
4061                 task_rq_unlock(rq, p, &flags);
4062                 return 0;
4063         }
4064 change:
4065
4066         if (user) {
4067 #ifdef CONFIG_RT_GROUP_SCHED
4068                 /*
4069                  * Do not allow realtime tasks into groups that have no runtime
4070                  * assigned.
4071                  */
4072                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4073                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4074                                 !task_group_is_autogroup(task_group(p))) {
4075                         task_rq_unlock(rq, p, &flags);
4076                         return -EPERM;
4077                 }
4078 #endif
4079 #ifdef CONFIG_SMP
4080                 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4081                         cpumask_t *span = rq->rd->span;
4082
4083                         /*
4084                          * Don't allow tasks with an affinity mask smaller than
4085                          * the entire root_domain to become SCHED_DEADLINE. We
4086                          * will also fail if there's no bandwidth available.
4087                          */
4088                         if (!cpumask_subset(span, &p->cpus_allowed) ||
4089                             rq->rd->dl_bw.bw == 0) {
4090                                 task_rq_unlock(rq, p, &flags);
4091                                 return -EPERM;
4092                         }
4093                 }
4094 #endif
4095         }
4096
4097         /* recheck policy now with rq lock held */
4098         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4099                 policy = oldpolicy = -1;
4100                 task_rq_unlock(rq, p, &flags);
4101                 goto recheck;
4102         }
4103
4104         /*
4105          * If setscheduling to SCHED_DEADLINE (or changing the parameters
4106          * of a SCHED_DEADLINE task) we need to check if enough bandwidth
4107          * is available.
4108          */
4109         if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
4110                 task_rq_unlock(rq, p, &flags);
4111                 return -EBUSY;
4112         }
4113
4114         p->sched_reset_on_fork = reset_on_fork;
4115         oldprio = p->prio;
4116
4117         if (pi) {
4118                 /*
4119                  * Take priority boosted tasks into account. If the new
4120                  * effective priority is unchanged, we just store the new
4121                  * normal parameters and do not touch the scheduler class and
4122                  * the runqueue. This will be done when the task deboost
4123                  * itself.
4124                  */
4125                 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
4126                 if (new_effective_prio == oldprio) {
4127                         __setscheduler_params(p, attr);
4128                         task_rq_unlock(rq, p, &flags);
4129                         return 0;
4130                 }
4131         }
4132
4133         queued = task_on_rq_queued(p);
4134         running = task_current(rq, p);
4135         if (queued)
4136                 dequeue_task(rq, p, DEQUEUE_SAVE);
4137         if (running)
4138                 put_prev_task(rq, p);
4139
4140         prev_class = p->sched_class;
4141         __setscheduler(rq, p, attr, pi);
4142
4143         if (running)
4144                 p->sched_class->set_curr_task(rq);
4145         if (queued) {
4146                 int enqueue_flags = ENQUEUE_RESTORE;
4147                 /*
4148                  * We enqueue to tail when the priority of a task is
4149                  * increased (user space view).
4150                  */
4151                 if (oldprio <= p->prio)
4152                         enqueue_flags |= ENQUEUE_HEAD;
4153
4154                 enqueue_task(rq, p, enqueue_flags);
4155         }
4156
4157         check_class_changed(rq, p, prev_class, oldprio);
4158         preempt_disable(); /* avoid rq from going away on us */
4159         task_rq_unlock(rq, p, &flags);
4160
4161         if (pi)
4162                 rt_mutex_adjust_pi(p);
4163
4164         /*
4165          * Run balance callbacks after we've adjusted the PI chain.
4166          */
4167         balance_callback(rq);
4168         preempt_enable();
4169
4170         return 0;
4171 }
4172
4173 static int _sched_setscheduler(struct task_struct *p, int policy,
4174                                const struct sched_param *param, bool check)
4175 {
4176         struct sched_attr attr = {
4177                 .sched_policy   = policy,
4178                 .sched_priority = param->sched_priority,
4179                 .sched_nice     = PRIO_TO_NICE(p->static_prio),
4180         };
4181
4182         /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
4183         if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4184                 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4185                 policy &= ~SCHED_RESET_ON_FORK;
4186                 attr.sched_policy = policy;
4187         }
4188
4189         return __sched_setscheduler(p, &attr, check, true);
4190 }
4191 /**
4192  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4193  * @p: the task in question.
4194  * @policy: new policy.
4195  * @param: structure containing the new RT priority.
4196  *
4197  * Return: 0 on success. An error code otherwise.
4198  *
4199  * NOTE that the task may be already dead.
4200  */
4201 int sched_setscheduler(struct task_struct *p, int policy,
4202                        const struct sched_param *param)
4203 {
4204         return _sched_setscheduler(p, policy, param, true);
4205 }
4206 EXPORT_SYMBOL_GPL(sched_setscheduler);
4207
4208 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4209 {
4210         return __sched_setscheduler(p, attr, true, true);
4211 }
4212 EXPORT_SYMBOL_GPL(sched_setattr);
4213
4214 /**
4215  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4216  * @p: the task in question.
4217  * @policy: new policy.
4218  * @param: structure containing the new RT priority.
4219  *
4220  * Just like sched_setscheduler, only don't bother checking if the
4221  * current context has permission.  For example, this is needed in
4222  * stop_machine(): we create temporary high priority worker threads,
4223  * but our caller might not have that capability.
4224  *
4225  * Return: 0 on success. An error code otherwise.
4226  */
4227 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4228                                const struct sched_param *param)
4229 {
4230         return _sched_setscheduler(p, policy, param, false);
4231 }
4232 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4233
4234 static int
4235 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4236 {
4237         struct sched_param lparam;
4238         struct task_struct *p;
4239         int retval;
4240
4241         if (!param || pid < 0)
4242                 return -EINVAL;
4243         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4244                 return -EFAULT;
4245
4246         rcu_read_lock();
4247         retval = -ESRCH;
4248         p = find_process_by_pid(pid);
4249         if (p != NULL)
4250                 retval = sched_setscheduler(p, policy, &lparam);
4251         rcu_read_unlock();
4252
4253         return retval;
4254 }
4255
4256 /*
4257  * Mimics kernel/events/core.c perf_copy_attr().
4258  */
4259 static int sched_copy_attr(struct sched_attr __user *uattr,
4260                            struct sched_attr *attr)
4261 {
4262         u32 size;
4263         int ret;
4264
4265         if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4266                 return -EFAULT;
4267
4268         /*
4269          * zero the full structure, so that a short copy will be nice.
4270          */
4271         memset(attr, 0, sizeof(*attr));
4272
4273         ret = get_user(size, &uattr->size);
4274         if (ret)
4275                 return ret;
4276
4277         if (size > PAGE_SIZE)   /* silly large */
4278                 goto err_size;
4279
4280         if (!size)              /* abi compat */
4281                 size = SCHED_ATTR_SIZE_VER0;
4282
4283         if (size < SCHED_ATTR_SIZE_VER0)
4284                 goto err_size;
4285
4286         /*
4287          * If we're handed a bigger struct than we know of,
4288          * ensure all the unknown bits are 0 - i.e. new
4289          * user-space does not rely on any kernel feature
4290          * extensions we dont know about yet.
4291          */
4292         if (size > sizeof(*attr)) {
4293                 unsigned char __user *addr;
4294                 unsigned char __user *end;
4295                 unsigned char val;
4296
4297                 addr = (void __user *)uattr + sizeof(*attr);
4298                 end  = (void __user *)uattr + size;
4299
4300                 for (; addr < end; addr++) {
4301                         ret = get_user(val, addr);
4302                         if (ret)
4303                                 return ret;
4304                         if (val)
4305                                 goto err_size;
4306                 }
4307                 size = sizeof(*attr);
4308         }
4309
4310         ret = copy_from_user(attr, uattr, size);
4311         if (ret)
4312                 return -EFAULT;
4313
4314         /*
4315          * XXX: do we want to be lenient like existing syscalls; or do we want
4316          * to be strict and return an error on out-of-bounds values?
4317          */
4318         attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4319
4320         return 0;
4321
4322 err_size:
4323         put_user(sizeof(*attr), &uattr->size);
4324         return -E2BIG;
4325 }
4326
4327 /**
4328  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4329  * @pid: the pid in question.
4330  * @policy: new policy.
4331  * @param: structure containing the new RT priority.
4332  *
4333  * Return: 0 on success. An error code otherwise.
4334  */
4335 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4336                 struct sched_param __user *, param)
4337 {
4338         /* negative values for policy are not valid */
4339         if (policy < 0)
4340                 return -EINVAL;
4341
4342         return do_sched_setscheduler(pid, policy, param);
4343 }
4344
4345 /**
4346  * sys_sched_setparam - set/change the RT priority of a thread
4347  * @pid: the pid in question.
4348  * @param: structure containing the new RT priority.
4349  *
4350  * Return: 0 on success. An error code otherwise.
4351  */
4352 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4353 {
4354         return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4355 }
4356
4357 /**
4358  * sys_sched_setattr - same as above, but with extended sched_attr
4359  * @pid: the pid in question.
4360  * @uattr: structure containing the extended parameters.
4361  * @flags: for future extension.
4362  */
4363 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4364                                unsigned int, flags)
4365 {
4366         struct sched_attr attr;
4367         struct task_struct *p;
4368         int retval;
4369
4370         if (!uattr || pid < 0 || flags)
4371                 return -EINVAL;
4372
4373         retval = sched_copy_attr(uattr, &attr);
4374         if (retval)
4375                 return retval;
4376
4377         if ((int)attr.sched_policy < 0)
4378                 return -EINVAL;
4379
4380         rcu_read_lock();
4381         retval = -ESRCH;
4382         p = find_process_by_pid(pid);
4383         if (p != NULL)
4384                 retval = sched_setattr(p, &attr);
4385         rcu_read_unlock();
4386
4387         return retval;
4388 }
4389
4390 /**
4391  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4392  * @pid: the pid in question.
4393  *
4394  * Return: On success, the policy of the thread. Otherwise, a negative error
4395  * code.
4396  */
4397 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4398 {
4399         struct task_struct *p;
4400         int retval;
4401
4402         if (pid < 0)
4403                 return -EINVAL;
4404
4405         retval = -ESRCH;
4406         rcu_read_lock();
4407         p = find_process_by_pid(pid);
4408         if (p) {
4409                 retval = security_task_getscheduler(p);
4410                 if (!retval)
4411                         retval = p->policy
4412                                 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4413         }
4414         rcu_read_unlock();
4415         return retval;
4416 }
4417
4418 /**
4419  * sys_sched_getparam - get the RT priority of a thread
4420  * @pid: the pid in question.
4421  * @param: structure containing the RT priority.
4422  *
4423  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
4424  * code.
4425  */
4426 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4427 {
4428         struct sched_param lp = { .sched_priority = 0 };
4429         struct task_struct *p;
4430         int retval;
4431
4432         if (!param || pid < 0)
4433                 return -EINVAL;
4434
4435         rcu_read_lock();
4436         p = find_process_by_pid(pid);
4437         retval = -ESRCH;
4438         if (!p)
4439                 goto out_unlock;
4440
4441         retval = security_task_getscheduler(p);
4442         if (retval)
4443                 goto out_unlock;
4444
4445         if (task_has_rt_policy(p))
4446                 lp.sched_priority = p->rt_priority;
4447         rcu_read_unlock();
4448
4449         /*
4450          * This one might sleep, we cannot do it with a spinlock held ...
4451          */
4452         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4453
4454         return retval;
4455
4456 out_unlock:
4457         rcu_read_unlock();
4458         return retval;
4459 }
4460
4461 static int sched_read_attr(struct sched_attr __user *uattr,
4462                            struct sched_attr *attr,
4463                            unsigned int usize)
4464 {
4465         int ret;
4466
4467         if (!access_ok(VERIFY_WRITE, uattr, usize))
4468                 return -EFAULT;
4469
4470         /*
4471          * If we're handed a smaller struct than we know of,
4472          * ensure all the unknown bits are 0 - i.e. old
4473          * user-space does not get uncomplete information.
4474          */
4475         if (usize < sizeof(*attr)) {
4476                 unsigned char *addr;
4477                 unsigned char *end;
4478
4479                 addr = (void *)attr + usize;
4480                 end  = (void *)attr + sizeof(*attr);
4481
4482                 for (; addr < end; addr++) {
4483                         if (*addr)
4484                                 return -EFBIG;
4485                 }
4486
4487                 attr->size = usize;
4488         }
4489
4490         ret = copy_to_user(uattr, attr, attr->size);
4491         if (ret)
4492                 return -EFAULT;
4493
4494         return 0;
4495 }
4496
4497 /**
4498  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
4499  * @pid: the pid in question.
4500  * @uattr: structure containing the extended parameters.
4501  * @size: sizeof(attr) for fwd/bwd comp.
4502  * @flags: for future extension.
4503  */
4504 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4505                 unsigned int, size, unsigned int, flags)
4506 {
4507         struct sched_attr attr = {
4508                 .size = sizeof(struct sched_attr),
4509         };
4510         struct task_struct *p;
4511         int retval;
4512
4513         if (!uattr || pid < 0 || size > PAGE_SIZE ||
4514             size < SCHED_ATTR_SIZE_VER0 || flags)
4515                 return -EINVAL;
4516
4517         rcu_read_lock();
4518         p = find_process_by_pid(pid);
4519         retval = -ESRCH;
4520         if (!p)
4521                 goto out_unlock;
4522
4523         retval = security_task_getscheduler(p);
4524         if (retval)
4525                 goto out_unlock;
4526
4527         attr.sched_policy = p->policy;
4528         if (p->sched_reset_on_fork)
4529                 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4530         if (task_has_dl_policy(p))
4531                 __getparam_dl(p, &attr);
4532         else if (task_has_rt_policy(p))
4533                 attr.sched_priority = p->rt_priority;
4534         else
4535                 attr.sched_nice = task_nice(p);
4536
4537         rcu_read_unlock();
4538
4539         retval = sched_read_attr(uattr, &attr, size);
4540         return retval;
4541
4542 out_unlock:
4543         rcu_read_unlock();
4544         return retval;
4545 }
4546
4547 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4548 {
4549         cpumask_var_t cpus_allowed, new_mask;
4550         struct task_struct *p;
4551         int retval;
4552
4553         rcu_read_lock();
4554
4555         p = find_process_by_pid(pid);
4556         if (!p) {
4557                 rcu_read_unlock();
4558                 return -ESRCH;
4559         }
4560
4561         /* Prevent p going away */
4562         get_task_struct(p);
4563         rcu_read_unlock();
4564
4565         if (p->flags & PF_NO_SETAFFINITY) {
4566                 retval = -EINVAL;
4567                 goto out_put_task;
4568         }
4569         if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4570                 retval = -ENOMEM;
4571                 goto out_put_task;
4572         }
4573         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4574                 retval = -ENOMEM;
4575                 goto out_free_cpus_allowed;
4576         }
4577         retval = -EPERM;
4578         if (!check_same_owner(p)) {
4579                 rcu_read_lock();
4580                 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4581                         rcu_read_unlock();
4582                         goto out_free_new_mask;
4583                 }
4584                 rcu_read_unlock();
4585         }
4586
4587         retval = security_task_setscheduler(p);
4588         if (retval)
4589                 goto out_free_new_mask;
4590
4591
4592         cpuset_cpus_allowed(p, cpus_allowed);
4593         cpumask_and(new_mask, in_mask, cpus_allowed);
4594
4595         /*
4596          * Since bandwidth control happens on root_domain basis,
4597          * if admission test is enabled, we only admit -deadline
4598          * tasks allowed to run on all the CPUs in the task's
4599          * root_domain.
4600          */
4601 #ifdef CONFIG_SMP
4602         if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4603                 rcu_read_lock();
4604                 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4605                         retval = -EBUSY;
4606                         rcu_read_unlock();
4607                         goto out_free_new_mask;
4608                 }
4609                 rcu_read_unlock();
4610         }
4611 #endif
4612 again:
4613         retval = __set_cpus_allowed_ptr(p, new_mask, true);
4614
4615         if (!retval) {
4616                 cpuset_cpus_allowed(p, cpus_allowed);
4617                 if (!cpumask_subset(new_mask, cpus_allowed)) {
4618                         /*
4619                          * We must have raced with a concurrent cpuset
4620                          * update. Just reset the cpus_allowed to the
4621                          * cpuset's cpus_allowed
4622                          */
4623                         cpumask_copy(new_mask, cpus_allowed);
4624                         goto again;
4625                 }
4626         }
4627 out_free_new_mask:
4628         free_cpumask_var(new_mask);
4629 out_free_cpus_allowed:
4630         free_cpumask_var(cpus_allowed);
4631 out_put_task:
4632         put_task_struct(p);
4633         return retval;
4634 }
4635
4636 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4637                              struct cpumask *new_mask)
4638 {
4639         if (len < cpumask_size())
4640                 cpumask_clear(new_mask);
4641         else if (len > cpumask_size())
4642                 len = cpumask_size();
4643
4644         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4645 }
4646
4647 /**
4648  * sys_sched_setaffinity - set the cpu affinity of a process
4649  * @pid: pid of the process
4650  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4651  * @user_mask_ptr: user-space pointer to the new cpu mask
4652  *
4653  * Return: 0 on success. An error code otherwise.
4654  */
4655 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4656                 unsigned long __user *, user_mask_ptr)
4657 {
4658         cpumask_var_t new_mask;
4659         int retval;
4660
4661         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4662                 return -ENOMEM;
4663
4664         retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4665         if (retval == 0)
4666                 retval = sched_setaffinity(pid, new_mask);
4667         free_cpumask_var(new_mask);
4668         return retval;
4669 }
4670
4671 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4672 {
4673         struct task_struct *p;
4674         unsigned long flags;
4675         int retval;
4676
4677         rcu_read_lock();
4678
4679         retval = -ESRCH;
4680         p = find_process_by_pid(pid);
4681         if (!p)
4682                 goto out_unlock;
4683
4684         retval = security_task_getscheduler(p);
4685         if (retval)
4686                 goto out_unlock;
4687
4688         raw_spin_lock_irqsave(&p->pi_lock, flags);
4689         cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4690         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4691
4692 out_unlock:
4693         rcu_read_unlock();
4694
4695         return retval;
4696 }
4697
4698 /**
4699  * sys_sched_getaffinity - get the cpu affinity of a process
4700  * @pid: pid of the process
4701  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4702  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4703  *
4704  * Return: 0 on success. An error code otherwise.
4705  */
4706 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4707                 unsigned long __user *, user_mask_ptr)
4708 {
4709         int ret;
4710         cpumask_var_t mask;
4711
4712         if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4713                 return -EINVAL;
4714         if (len & (sizeof(unsigned long)-1))
4715                 return -EINVAL;
4716
4717         if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4718                 return -ENOMEM;
4719
4720         ret = sched_getaffinity(pid, mask);
4721         if (ret == 0) {
4722                 size_t retlen = min_t(size_t, len, cpumask_size());
4723
4724                 if (copy_to_user(user_mask_ptr, mask, retlen))
4725                         ret = -EFAULT;
4726                 else
4727                         ret = retlen;
4728         }
4729         free_cpumask_var(mask);
4730
4731         return ret;
4732 }
4733
4734 /**
4735  * sys_sched_yield - yield the current processor to other threads.
4736  *
4737  * This function yields the current CPU to other tasks. If there are no
4738  * other threads running on this CPU then this function will return.
4739  *
4740  * Return: 0.
4741  */
4742 SYSCALL_DEFINE0(sched_yield)
4743 {
4744         struct rq *rq = this_rq_lock();
4745
4746         schedstat_inc(rq, yld_count);
4747         current->sched_class->yield_task(rq);
4748
4749         /*
4750          * Since we are going to call schedule() anyway, there's
4751          * no need to preempt or enable interrupts:
4752          */
4753         __release(rq->lock);
4754         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4755         do_raw_spin_unlock(&rq->lock);
4756         sched_preempt_enable_no_resched();
4757
4758         schedule();
4759
4760         return 0;
4761 }
4762
4763 int __sched _cond_resched(void)
4764 {
4765         if (should_resched(0)) {
4766                 preempt_schedule_common();
4767                 return 1;
4768         }
4769         return 0;
4770 }
4771 EXPORT_SYMBOL(_cond_resched);
4772
4773 /*
4774  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4775  * call schedule, and on return reacquire the lock.
4776  *
4777  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4778  * operations here to prevent schedule() from being called twice (once via
4779  * spin_unlock(), once by hand).
4780  */
4781 int __cond_resched_lock(spinlock_t *lock)
4782 {
4783         int resched = should_resched(PREEMPT_LOCK_OFFSET);
4784         int ret = 0;
4785
4786         lockdep_assert_held(lock);
4787
4788         if (spin_needbreak(lock) || resched) {
4789                 spin_unlock(lock);
4790                 if (resched)
4791                         preempt_schedule_common();
4792                 else
4793                         cpu_relax();
4794                 ret = 1;
4795                 spin_lock(lock);
4796         }
4797         return ret;
4798 }
4799 EXPORT_SYMBOL(__cond_resched_lock);
4800
4801 int __sched __cond_resched_softirq(void)
4802 {
4803         BUG_ON(!in_softirq());
4804
4805         if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
4806                 local_bh_enable();
4807                 preempt_schedule_common();
4808                 local_bh_disable();
4809                 return 1;
4810         }
4811         return 0;
4812 }
4813 EXPORT_SYMBOL(__cond_resched_softirq);
4814
4815 /**
4816  * yield - yield the current processor to other threads.
4817  *
4818  * Do not ever use this function, there's a 99% chance you're doing it wrong.
4819  *
4820  * The scheduler is at all times free to pick the calling task as the most
4821  * eligible task to run, if removing the yield() call from your code breaks
4822  * it, its already broken.
4823  *
4824  * Typical broken usage is:
4825  *
4826  * while (!event)
4827  *      yield();
4828  *
4829  * where one assumes that yield() will let 'the other' process run that will
4830  * make event true. If the current task is a SCHED_FIFO task that will never
4831  * happen. Never use yield() as a progress guarantee!!
4832  *
4833  * If you want to use yield() to wait for something, use wait_event().
4834  * If you want to use yield() to be 'nice' for others, use cond_resched().
4835  * If you still want to use yield(), do not!
4836  */
4837 void __sched yield(void)
4838 {
4839         set_current_state(TASK_RUNNING);
4840         sys_sched_yield();
4841 }
4842 EXPORT_SYMBOL(yield);
4843
4844 /**
4845  * yield_to - yield the current processor to another thread in
4846  * your thread group, or accelerate that thread toward the
4847  * processor it's on.
4848  * @p: target task
4849  * @preempt: whether task preemption is allowed or not
4850  *
4851  * It's the caller's job to ensure that the target task struct
4852  * can't go away on us before we can do any checks.
4853  *
4854  * Return:
4855  *      true (>0) if we indeed boosted the target task.
4856  *      false (0) if we failed to boost the target.
4857  *      -ESRCH if there's no task to yield to.
4858  */
4859 int __sched yield_to(struct task_struct *p, bool preempt)
4860 {
4861         struct task_struct *curr = current;
4862         struct rq *rq, *p_rq;
4863         unsigned long flags;
4864         int yielded = 0;
4865
4866         local_irq_save(flags);
4867         rq = this_rq();
4868
4869 again:
4870         p_rq = task_rq(p);
4871         /*
4872          * If we're the only runnable task on the rq and target rq also
4873          * has only one task, there's absolutely no point in yielding.
4874          */
4875         if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4876                 yielded = -ESRCH;
4877                 goto out_irq;
4878         }
4879
4880         double_rq_lock(rq, p_rq);
4881         if (task_rq(p) != p_rq) {
4882                 double_rq_unlock(rq, p_rq);
4883                 goto again;
4884         }
4885
4886         if (!curr->sched_class->yield_to_task)
4887                 goto out_unlock;
4888
4889         if (curr->sched_class != p->sched_class)
4890                 goto out_unlock;
4891
4892         if (task_running(p_rq, p) || p->state)
4893                 goto out_unlock;
4894
4895         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4896         if (yielded) {
4897                 schedstat_inc(rq, yld_count);
4898                 /*
4899                  * Make p's CPU reschedule; pick_next_entity takes care of
4900                  * fairness.
4901                  */
4902                 if (preempt && rq != p_rq)
4903                         resched_curr(p_rq);
4904         }
4905
4906 out_unlock:
4907         double_rq_unlock(rq, p_rq);
4908 out_irq:
4909         local_irq_restore(flags);
4910
4911         if (yielded > 0)
4912                 schedule();
4913
4914         return yielded;
4915 }
4916 EXPORT_SYMBOL_GPL(yield_to);
4917
4918 /*
4919  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4920  * that process accounting knows that this is a task in IO wait state.
4921  */
4922 long __sched io_schedule_timeout(long timeout)
4923 {
4924         int old_iowait = current->in_iowait;
4925         struct rq *rq;
4926         long ret;
4927
4928         current->in_iowait = 1;
4929         blk_schedule_flush_plug(current);
4930
4931         delayacct_blkio_start();
4932         rq = raw_rq();
4933         atomic_inc(&rq->nr_iowait);
4934         ret = schedule_timeout(timeout);
4935         current->in_iowait = old_iowait;
4936         atomic_dec(&rq->nr_iowait);
4937         delayacct_blkio_end();
4938
4939         return ret;
4940 }
4941 EXPORT_SYMBOL(io_schedule_timeout);
4942
4943 /**
4944  * sys_sched_get_priority_max - return maximum RT priority.
4945  * @policy: scheduling class.
4946  *
4947  * Return: On success, this syscall returns the maximum
4948  * rt_priority that can be used by a given scheduling class.
4949  * On failure, a negative error code is returned.
4950  */
4951 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4952 {
4953         int ret = -EINVAL;
4954
4955         switch (policy) {
4956         case SCHED_FIFO:
4957         case SCHED_RR:
4958                 ret = MAX_USER_RT_PRIO-1;
4959                 break;
4960         case SCHED_DEADLINE:
4961         case SCHED_NORMAL:
4962         case SCHED_BATCH:
4963         case SCHED_IDLE:
4964                 ret = 0;
4965                 break;
4966         }
4967         return ret;
4968 }
4969
4970 /**
4971  * sys_sched_get_priority_min - return minimum RT priority.
4972  * @policy: scheduling class.
4973  *
4974  * Return: On success, this syscall returns the minimum
4975  * rt_priority that can be used by a given scheduling class.
4976  * On failure, a negative error code is returned.
4977  */
4978 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4979 {
4980         int ret = -EINVAL;
4981
4982         switch (policy) {
4983         case SCHED_FIFO:
4984         case SCHED_RR:
4985                 ret = 1;
4986                 break;
4987         case SCHED_DEADLINE:
4988         case SCHED_NORMAL:
4989         case SCHED_BATCH:
4990         case SCHED_IDLE:
4991                 ret = 0;
4992         }
4993         return ret;
4994 }
4995
4996 /**
4997  * sys_sched_rr_get_interval - return the default timeslice of a process.
4998  * @pid: pid of the process.
4999  * @interval: userspace pointer to the timeslice value.
5000  *
5001  * this syscall writes the default timeslice value of a given process
5002  * into the user-space timespec buffer. A value of '0' means infinity.
5003  *
5004  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5005  * an error code.
5006  */
5007 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5008                 struct timespec __user *, interval)
5009 {
5010         struct task_struct *p;
5011         unsigned int time_slice;
5012         unsigned long flags;
5013         struct rq *rq;
5014         int retval;
5015         struct timespec t;
5016
5017         if (pid < 0)
5018                 return -EINVAL;
5019
5020         retval = -ESRCH;
5021         rcu_read_lock();
5022         p = find_process_by_pid(pid);
5023         if (!p)
5024                 goto out_unlock;
5025
5026         retval = security_task_getscheduler(p);
5027         if (retval)
5028                 goto out_unlock;
5029
5030         rq = task_rq_lock(p, &flags);
5031         time_slice = 0;
5032         if (p->sched_class->get_rr_interval)
5033                 time_slice = p->sched_class->get_rr_interval(rq, p);
5034         task_rq_unlock(rq, p, &flags);
5035
5036         rcu_read_unlock();
5037         jiffies_to_timespec(time_slice, &t);
5038         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5039         return retval;
5040
5041 out_unlock:
5042         rcu_read_unlock();
5043         return retval;
5044 }
5045
5046 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5047
5048 void sched_show_task(struct task_struct *p)
5049 {
5050         unsigned long free = 0;
5051         int ppid;
5052         unsigned long state = p->state;
5053
5054         if (state)
5055                 state = __ffs(state) + 1;
5056         printk(KERN_INFO "%-15.15s %c", p->comm,
5057                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5058 #if BITS_PER_LONG == 32
5059         if (state == TASK_RUNNING)
5060                 printk(KERN_CONT " running  ");
5061         else
5062                 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5063 #else
5064         if (state == TASK_RUNNING)
5065                 printk(KERN_CONT "  running task    ");
5066         else
5067                 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5068 #endif
5069 #ifdef CONFIG_DEBUG_STACK_USAGE
5070         free = stack_not_used(p);
5071 #endif
5072         ppid = 0;
5073         rcu_read_lock();
5074         if (pid_alive(p))
5075                 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5076         rcu_read_unlock();
5077         printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5078                 task_pid_nr(p), ppid,
5079                 (unsigned long)task_thread_info(p)->flags);
5080
5081         print_worker_info(KERN_INFO, p);
5082         show_stack(p, NULL);
5083 }
5084
5085 void show_state_filter(unsigned long state_filter)
5086 {
5087         struct task_struct *g, *p;
5088
5089 #if BITS_PER_LONG == 32
5090         printk(KERN_INFO
5091                 "  task                PC stack   pid father\n");
5092 #else
5093         printk(KERN_INFO
5094                 "  task                        PC stack   pid father\n");
5095 #endif
5096         rcu_read_lock();
5097         for_each_process_thread(g, p) {
5098                 /*
5099                  * reset the NMI-timeout, listing all files on a slow
5100                  * console might take a lot of time:
5101                  * Also, reset softlockup watchdogs on all CPUs, because
5102                  * another CPU might be blocked waiting for us to process
5103                  * an IPI.
5104                  */
5105                 touch_nmi_watchdog();
5106                 touch_all_softlockup_watchdogs();
5107                 if (!state_filter || (p->state & state_filter))
5108                         sched_show_task(p);
5109         }
5110
5111 #ifdef CONFIG_SCHED_DEBUG
5112         sysrq_sched_debug_show();
5113 #endif
5114         rcu_read_unlock();
5115         /*
5116          * Only show locks if all tasks are dumped:
5117          */
5118         if (!state_filter)
5119                 debug_show_all_locks();
5120 }
5121
5122 void init_idle_bootup_task(struct task_struct *idle)
5123 {
5124         idle->sched_class = &idle_sched_class;
5125 }
5126
5127 /**
5128  * init_idle - set up an idle thread for a given CPU
5129  * @idle: task in question
5130  * @cpu: cpu the idle task belongs to
5131  *
5132  * NOTE: this function does not set the idle thread's NEED_RESCHED
5133  * flag, to make booting more robust.
5134  */
5135 void init_idle(struct task_struct *idle, int cpu)
5136 {
5137         struct rq *rq = cpu_rq(cpu);
5138         unsigned long flags;
5139
5140         raw_spin_lock_irqsave(&idle->pi_lock, flags);
5141         raw_spin_lock(&rq->lock);
5142
5143         __sched_fork(0, idle);
5144
5145         idle->state = TASK_RUNNING;
5146         idle->se.exec_start = sched_clock();
5147
5148 #ifdef CONFIG_SMP
5149         /*
5150          * Its possible that init_idle() gets called multiple times on a task,
5151          * in that case do_set_cpus_allowed() will not do the right thing.
5152          *
5153          * And since this is boot we can forgo the serialization.
5154          */
5155         set_cpus_allowed_common(idle, cpumask_of(cpu));
5156 #endif
5157         /*
5158          * We're having a chicken and egg problem, even though we are
5159          * holding rq->lock, the cpu isn't yet set to this cpu so the
5160          * lockdep check in task_group() will fail.
5161          *
5162          * Similar case to sched_fork(). / Alternatively we could
5163          * use task_rq_lock() here and obtain the other rq->lock.
5164          *
5165          * Silence PROVE_RCU
5166          */
5167         rcu_read_lock();
5168         __set_task_cpu(idle, cpu);
5169         rcu_read_unlock();
5170
5171         rq->curr = rq->idle = idle;
5172         idle->on_rq = TASK_ON_RQ_QUEUED;
5173 #ifdef CONFIG_SMP
5174         idle->on_cpu = 1;
5175 #endif
5176         raw_spin_unlock(&rq->lock);
5177         raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5178
5179         /* Set the preempt count _outside_ the spinlocks! */
5180         init_idle_preempt_count(idle, cpu);
5181
5182         /*
5183          * The idle tasks have their own, simple scheduling class:
5184          */
5185         idle->sched_class = &idle_sched_class;
5186         ftrace_graph_init_idle_task(idle, cpu);
5187         vtime_init_idle(idle, cpu);
5188 #ifdef CONFIG_SMP
5189         sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5190 #endif
5191 }
5192
5193 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5194                               const struct cpumask *trial)
5195 {
5196         int ret = 1, trial_cpus;
5197         struct dl_bw *cur_dl_b;
5198         unsigned long flags;
5199
5200         if (!cpumask_weight(cur))
5201                 return ret;
5202
5203         rcu_read_lock_sched();
5204         cur_dl_b = dl_bw_of(cpumask_any(cur));
5205         trial_cpus = cpumask_weight(trial);
5206
5207         raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5208         if (cur_dl_b->bw != -1 &&
5209             cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5210                 ret = 0;
5211         raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
5212         rcu_read_unlock_sched();
5213
5214         return ret;
5215 }
5216
5217 int task_can_attach(struct task_struct *p,
5218                     const struct cpumask *cs_cpus_allowed)
5219 {
5220         int ret = 0;
5221
5222         /*
5223          * Kthreads which disallow setaffinity shouldn't be moved
5224          * to a new cpuset; we don't want to change their cpu
5225          * affinity and isolating such threads by their set of
5226          * allowed nodes is unnecessary.  Thus, cpusets are not
5227          * applicable for such threads.  This prevents checking for
5228          * success of set_cpus_allowed_ptr() on all attached tasks
5229          * before cpus_allowed may be changed.
5230          */
5231         if (p->flags & PF_NO_SETAFFINITY) {
5232                 ret = -EINVAL;
5233                 goto out;
5234         }
5235
5236 #ifdef CONFIG_SMP
5237         if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5238                                               cs_cpus_allowed)) {
5239                 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
5240                                                         cs_cpus_allowed);
5241                 struct dl_bw *dl_b;
5242                 bool overflow;
5243                 int cpus;
5244                 unsigned long flags;
5245
5246                 rcu_read_lock_sched();
5247                 dl_b = dl_bw_of(dest_cpu);
5248                 raw_spin_lock_irqsave(&dl_b->lock, flags);
5249                 cpus = dl_bw_cpus(dest_cpu);
5250                 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5251                 if (overflow)
5252                         ret = -EBUSY;
5253                 else {
5254                         /*
5255                          * We reserve space for this task in the destination
5256                          * root_domain, as we can't fail after this point.
5257                          * We will free resources in the source root_domain
5258                          * later on (see set_cpus_allowed_dl()).
5259                          */
5260                         __dl_add(dl_b, p->dl.dl_bw);
5261                 }
5262                 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5263                 rcu_read_unlock_sched();
5264
5265         }
5266 #endif
5267 out:
5268         return ret;
5269 }
5270
5271 #ifdef CONFIG_SMP
5272
5273 #ifdef CONFIG_NUMA_BALANCING
5274 /* Migrate current task p to target_cpu */
5275 int migrate_task_to(struct task_struct *p, int target_cpu)
5276 {
5277         struct migration_arg arg = { p, target_cpu };
5278         int curr_cpu = task_cpu(p);
5279
5280         if (curr_cpu == target_cpu)
5281                 return 0;
5282
5283         if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
5284                 return -EINVAL;
5285
5286         /* TODO: This is not properly updating schedstats */
5287
5288         trace_sched_move_numa(p, curr_cpu, target_cpu);
5289         return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5290 }
5291
5292 /*
5293  * Requeue a task on a given node and accurately track the number of NUMA
5294  * tasks on the runqueues
5295  */
5296 void sched_setnuma(struct task_struct *p, int nid)
5297 {
5298         struct rq *rq;
5299         unsigned long flags;
5300         bool queued, running;
5301
5302         rq = task_rq_lock(p, &flags);
5303         queued = task_on_rq_queued(p);
5304         running = task_current(rq, p);
5305
5306         if (queued)
5307                 dequeue_task(rq, p, DEQUEUE_SAVE);
5308         if (running)
5309                 put_prev_task(rq, p);
5310
5311         p->numa_preferred_nid = nid;
5312
5313         if (running)
5314                 p->sched_class->set_curr_task(rq);
5315         if (queued)
5316                 enqueue_task(rq, p, ENQUEUE_RESTORE);
5317         task_rq_unlock(rq, p, &flags);
5318 }
5319 #endif /* CONFIG_NUMA_BALANCING */
5320
5321 #ifdef CONFIG_HOTPLUG_CPU
5322 /*
5323  * Ensures that the idle task is using init_mm right before its cpu goes
5324  * offline.
5325  */
5326 void idle_task_exit(void)
5327 {
5328         struct mm_struct *mm = current->active_mm;
5329
5330         BUG_ON(cpu_online(smp_processor_id()));
5331
5332         if (mm != &init_mm) {
5333                 switch_mm(mm, &init_mm, current);
5334                 finish_arch_post_lock_switch();
5335         }
5336         mmdrop(mm);
5337 }
5338
5339 /*
5340  * Since this CPU is going 'away' for a while, fold any nr_active delta
5341  * we might have. Assumes we're called after migrate_tasks() so that the
5342  * nr_active count is stable.
5343  *
5344  * Also see the comment "Global load-average calculations".
5345  */
5346 static void calc_load_migrate(struct rq *rq)
5347 {
5348         long delta = calc_load_fold_active(rq);
5349         if (delta)
5350                 atomic_long_add(delta, &calc_load_tasks);
5351 }
5352
5353 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5354 {
5355 }
5356
5357 static const struct sched_class fake_sched_class = {
5358         .put_prev_task = put_prev_task_fake,
5359 };
5360
5361 static struct task_struct fake_task = {
5362         /*
5363          * Avoid pull_{rt,dl}_task()
5364          */
5365         .prio = MAX_PRIO + 1,
5366         .sched_class = &fake_sched_class,
5367 };
5368
5369 /*
5370  * Migrate all tasks from the rq, sleeping tasks will be migrated by
5371  * try_to_wake_up()->select_task_rq().
5372  *
5373  * Called with rq->lock held even though we'er in stop_machine() and
5374  * there's no concurrency possible, we hold the required locks anyway
5375  * because of lock validation efforts.
5376  */
5377 static void migrate_tasks(struct rq *dead_rq)
5378 {
5379         struct rq *rq = dead_rq;
5380         struct task_struct *next, *stop = rq->stop;
5381         int dest_cpu;
5382
5383         /*
5384          * Fudge the rq selection such that the below task selection loop
5385          * doesn't get stuck on the currently eligible stop task.
5386          *
5387          * We're currently inside stop_machine() and the rq is either stuck
5388          * in the stop_machine_cpu_stop() loop, or we're executing this code,
5389          * either way we should never end up calling schedule() until we're
5390          * done here.
5391          */
5392         rq->stop = NULL;
5393
5394         /*
5395          * put_prev_task() and pick_next_task() sched
5396          * class method both need to have an up-to-date
5397          * value of rq->clock[_task]
5398          */
5399         update_rq_clock(rq);
5400
5401         for (;;) {
5402                 /*
5403                  * There's this thread running, bail when that's the only
5404                  * remaining thread.
5405                  */
5406                 if (rq->nr_running == 1)
5407                         break;
5408
5409                 /*
5410                  * pick_next_task assumes pinned rq->lock.
5411                  */
5412                 lockdep_pin_lock(&rq->lock);
5413                 next = pick_next_task(rq, &fake_task);
5414                 BUG_ON(!next);
5415                 next->sched_class->put_prev_task(rq, next);
5416
5417                 /*
5418                  * Rules for changing task_struct::cpus_allowed are holding
5419                  * both pi_lock and rq->lock, such that holding either
5420                  * stabilizes the mask.
5421                  *
5422                  * Drop rq->lock is not quite as disastrous as it usually is
5423                  * because !cpu_active at this point, which means load-balance
5424                  * will not interfere. Also, stop-machine.
5425                  */
5426                 lockdep_unpin_lock(&rq->lock);
5427                 raw_spin_unlock(&rq->lock);
5428                 raw_spin_lock(&next->pi_lock);
5429                 raw_spin_lock(&rq->lock);
5430
5431                 /*
5432                  * Since we're inside stop-machine, _nothing_ should have
5433                  * changed the task, WARN if weird stuff happened, because in
5434                  * that case the above rq->lock drop is a fail too.
5435                  */
5436                 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5437                         raw_spin_unlock(&next->pi_lock);
5438                         continue;
5439                 }
5440
5441                 /* Find suitable destination for @next, with force if needed. */
5442                 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5443
5444                 rq = __migrate_task(rq, next, dest_cpu);
5445                 if (rq != dead_rq) {
5446                         raw_spin_unlock(&rq->lock);
5447                         rq = dead_rq;
5448                         raw_spin_lock(&rq->lock);
5449                 }
5450                 raw_spin_unlock(&next->pi_lock);
5451         }
5452
5453         rq->stop = stop;
5454 }
5455 #endif /* CONFIG_HOTPLUG_CPU */
5456
5457 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5458
5459 static struct ctl_table sd_ctl_dir[] = {
5460         {
5461                 .procname       = "sched_domain",
5462                 .mode           = 0555,
5463         },
5464         {}
5465 };
5466
5467 static struct ctl_table sd_ctl_root[] = {
5468         {
5469                 .procname       = "kernel",
5470                 .mode           = 0555,
5471                 .child          = sd_ctl_dir,
5472         },
5473         {}
5474 };
5475
5476 static struct ctl_table *sd_alloc_ctl_entry(int n)
5477 {
5478         struct ctl_table *entry =
5479                 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5480
5481         return entry;
5482 }
5483
5484 static void sd_free_ctl_entry(struct ctl_table **tablep)
5485 {
5486         struct ctl_table *entry;
5487
5488         /*
5489          * In the intermediate directories, both the child directory and
5490          * procname are dynamically allocated and could fail but the mode
5491          * will always be set. In the lowest directory the names are
5492          * static strings and all have proc handlers.
5493          */
5494         for (entry = *tablep; entry->mode; entry++) {
5495                 if (entry->child)
5496                         sd_free_ctl_entry(&entry->child);
5497                 if (entry->proc_handler == NULL)
5498                         kfree(entry->procname);
5499         }
5500
5501         kfree(*tablep);
5502         *tablep = NULL;
5503 }
5504
5505 static int min_load_idx = 0;
5506 static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5507
5508 static void
5509 set_table_entry(struct ctl_table *entry,
5510                 const char *procname, void *data, int maxlen,
5511                 umode_t mode, proc_handler *proc_handler,
5512                 bool load_idx)
5513 {
5514         entry->procname = procname;
5515         entry->data = data;
5516         entry->maxlen = maxlen;
5517         entry->mode = mode;
5518         entry->proc_handler = proc_handler;
5519
5520         if (load_idx) {
5521                 entry->extra1 = &min_load_idx;
5522                 entry->extra2 = &max_load_idx;
5523         }
5524 }
5525
5526 static struct ctl_table *
5527 sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
5528 {
5529         struct ctl_table *table = sd_alloc_ctl_entry(5);
5530
5531         if (table == NULL)
5532                 return NULL;
5533
5534         set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
5535                         sizeof(int), 0644, proc_dointvec_minmax, false);
5536         set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
5537                         sge->nr_idle_states*sizeof(struct idle_state), 0644,
5538                         proc_doulongvec_minmax, false);
5539         set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
5540                         sizeof(int), 0644, proc_dointvec_minmax, false);
5541         set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
5542                         sge->nr_cap_states*sizeof(struct capacity_state), 0644,
5543                         proc_doulongvec_minmax, false);
5544
5545         return table;
5546 }
5547
5548 static struct ctl_table *
5549 sd_alloc_ctl_group_table(struct sched_group *sg)
5550 {
5551         struct ctl_table *table = sd_alloc_ctl_entry(2);
5552
5553         if (table == NULL)
5554                 return NULL;
5555
5556         table->procname = kstrdup("energy", GFP_KERNEL);
5557         table->mode = 0555;
5558         table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
5559
5560         return table;
5561 }
5562
5563 static struct ctl_table *
5564 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5565 {
5566         struct ctl_table *table;
5567         unsigned int nr_entries = 14;
5568
5569         int i = 0;
5570         struct sched_group *sg = sd->groups;
5571
5572         if (sg->sge) {
5573                 int nr_sgs = 0;
5574
5575                 do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
5576
5577                 nr_entries += nr_sgs;
5578         }
5579
5580         table = sd_alloc_ctl_entry(nr_entries);
5581
5582         if (table == NULL)
5583                 return NULL;
5584
5585         set_table_entry(&table[0], "min_interval", &sd->min_interval,
5586                 sizeof(long), 0644, proc_doulongvec_minmax, false);
5587         set_table_entry(&table[1], "max_interval", &sd->max_interval,
5588                 sizeof(long), 0644, proc_doulongvec_minmax, false);
5589         set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5590                 sizeof(int), 0644, proc_dointvec_minmax, true);
5591         set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5592                 sizeof(int), 0644, proc_dointvec_minmax, true);
5593         set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5594                 sizeof(int), 0644, proc_dointvec_minmax, true);
5595         set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5596                 sizeof(int), 0644, proc_dointvec_minmax, true);
5597         set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5598                 sizeof(int), 0644, proc_dointvec_minmax, true);
5599         set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5600                 sizeof(int), 0644, proc_dointvec_minmax, false);
5601         set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5602                 sizeof(int), 0644, proc_dointvec_minmax, false);
5603         set_table_entry(&table[9], "cache_nice_tries",
5604                 &sd->cache_nice_tries,
5605                 sizeof(int), 0644, proc_dointvec_minmax, false);
5606         set_table_entry(&table[10], "flags", &sd->flags,
5607                 sizeof(int), 0644, proc_dointvec_minmax, false);
5608         set_table_entry(&table[11], "max_newidle_lb_cost",
5609                 &sd->max_newidle_lb_cost,
5610                 sizeof(long), 0644, proc_doulongvec_minmax, false);
5611         set_table_entry(&table[12], "name", sd->name,
5612                 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5613         sg = sd->groups;
5614         if (sg->sge) {
5615                 char buf[32];
5616                 struct ctl_table *entry = &table[13];
5617
5618                 do {
5619                         snprintf(buf, 32, "group%d", i);
5620                         entry->procname = kstrdup(buf, GFP_KERNEL);
5621                         entry->mode = 0555;
5622                         entry->child = sd_alloc_ctl_group_table(sg);
5623                 } while (entry++, i++, sg = sg->next, sg != sd->groups);
5624         }
5625         /* &table[nr_entries-1] is terminator */
5626
5627         return table;
5628 }
5629
5630 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5631 {
5632         struct ctl_table *entry, *table;
5633         struct sched_domain *sd;
5634         int domain_num = 0, i;
5635         char buf[32];
5636
5637         for_each_domain(cpu, sd)
5638                 domain_num++;
5639         entry = table = sd_alloc_ctl_entry(domain_num + 1);
5640         if (table == NULL)
5641                 return NULL;
5642
5643         i = 0;
5644         for_each_domain(cpu, sd) {
5645                 snprintf(buf, 32, "domain%d", i);
5646                 entry->procname = kstrdup(buf, GFP_KERNEL);
5647                 entry->mode = 0555;
5648                 entry->child = sd_alloc_ctl_domain_table(sd);
5649                 entry++;
5650                 i++;
5651         }
5652         return table;
5653 }
5654
5655 static struct ctl_table_header *sd_sysctl_header;
5656 static void register_sched_domain_sysctl(void)
5657 {
5658         int i, cpu_num = num_possible_cpus();
5659         struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5660         char buf[32];
5661
5662         WARN_ON(sd_ctl_dir[0].child);
5663         sd_ctl_dir[0].child = entry;
5664
5665         if (entry == NULL)
5666                 return;
5667
5668         for_each_possible_cpu(i) {
5669                 snprintf(buf, 32, "cpu%d", i);
5670                 entry->procname = kstrdup(buf, GFP_KERNEL);
5671                 entry->mode = 0555;
5672                 entry->child = sd_alloc_ctl_cpu_table(i);
5673                 entry++;
5674         }
5675
5676         WARN_ON(sd_sysctl_header);
5677         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5678 }
5679
5680 /* may be called multiple times per register */
5681 static void unregister_sched_domain_sysctl(void)
5682 {
5683         unregister_sysctl_table(sd_sysctl_header);
5684         sd_sysctl_header = NULL;
5685         if (sd_ctl_dir[0].child)
5686                 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5687 }
5688 #else
/* No-op variants for builds without CONFIG_SCHED_DEBUG && CONFIG_SYSCTL. */
static void register_sched_domain_sysctl(void) { }
static void unregister_sched_domain_sysctl(void) { }
5695 #endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
5696
5697 static void set_rq_online(struct rq *rq)
5698 {
5699         if (!rq->online) {
5700                 const struct sched_class *class;
5701
5702                 cpumask_set_cpu(rq->cpu, rq->rd->online);
5703                 rq->online = 1;
5704
5705                 for_each_class(class) {
5706                         if (class->rq_online)
5707                                 class->rq_online(rq);
5708                 }
5709         }
5710 }
5711
5712 static void set_rq_offline(struct rq *rq)
5713 {
5714         if (rq->online) {
5715                 const struct sched_class *class;
5716
5717                 for_each_class(class) {
5718                         if (class->rq_offline)
5719                                 class->rq_offline(rq);
5720                 }
5721
5722                 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5723                 rq->online = 0;
5724         }
5725 }
5726
5727 /*
5728  * migration_call - callback that gets triggered when a CPU is added.
5729  * Here we can start up the necessary migration thread for the new CPU.
5730  */
5731 static int
5732 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5733 {
5734         int cpu = (long)hcpu;
5735         unsigned long flags;
5736         struct rq *rq = cpu_rq(cpu);
5737
5738         switch (action & ~CPU_TASKS_FROZEN) {
5739
5740         case CPU_UP_PREPARE:
5741                 raw_spin_lock_irqsave(&rq->lock, flags);
5742                 walt_set_window_start(rq);
5743                 raw_spin_unlock_irqrestore(&rq->lock, flags);
5744                 rq->calc_load_update = calc_load_update;
5745                 break;
5746
5747         case CPU_ONLINE:
5748                 /* Update our root-domain */
5749                 raw_spin_lock_irqsave(&rq->lock, flags);
5750                 if (rq->rd) {
5751                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5752
5753                         set_rq_online(rq);
5754                 }
5755                 raw_spin_unlock_irqrestore(&rq->lock, flags);
5756                 break;
5757
5758 #ifdef CONFIG_HOTPLUG_CPU
5759         case CPU_DYING:
5760                 sched_ttwu_pending();
5761                 /* Update our root-domain */
5762                 raw_spin_lock_irqsave(&rq->lock, flags);
5763                 walt_migrate_sync_cpu(cpu);
5764                 if (rq->rd) {
5765                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5766                         set_rq_offline(rq);
5767                 }
5768                 migrate_tasks(rq);
5769                 BUG_ON(rq->nr_running != 1); /* the migration thread */
5770                 raw_spin_unlock_irqrestore(&rq->lock, flags);
5771                 break;
5772
5773         case CPU_DEAD:
5774                 calc_load_migrate(rq);
5775                 break;
5776 #endif
5777         }
5778
5779         update_max_interval();
5780
5781         return NOTIFY_OK;
5782 }
5783
5784 /*
5785  * Register at high priority so that task migration (migrate_all_tasks)
5786  * happens before everything else.  This has to be lower priority than
5787  * the notifier in the perf_event subsystem, though.
5788  */
5789 static struct notifier_block migration_notifier = {
5790         .notifier_call = migration_call,
5791         .priority = CPU_PRI_MIGRATION,
5792 };
5793
5794 static void set_cpu_rq_start_time(void)
5795 {
5796         int cpu = smp_processor_id();
5797         struct rq *rq = cpu_rq(cpu);
5798         rq->age_stamp = sched_clock_cpu(cpu);
5799 }
5800
5801 static int sched_cpu_active(struct notifier_block *nfb,
5802                                       unsigned long action, void *hcpu)
5803 {
5804         int cpu = (long)hcpu;
5805
5806         switch (action & ~CPU_TASKS_FROZEN) {
5807         case CPU_STARTING:
5808                 set_cpu_rq_start_time();
5809                 return NOTIFY_OK;
5810
5811         case CPU_ONLINE:
5812                 /*
5813                  * At this point a starting CPU has marked itself as online via
5814                  * set_cpu_online(). But it might not yet have marked itself
5815                  * as active, which is essential from here on.
5816                  */
5817                 set_cpu_active(cpu, true);
5818                 stop_machine_unpark(cpu);
5819                 return NOTIFY_OK;
5820
5821         case CPU_DOWN_FAILED:
5822                 set_cpu_active(cpu, true);
5823                 return NOTIFY_OK;
5824
5825         default:
5826                 return NOTIFY_DONE;
5827         }
5828 }
5829
5830 static int sched_cpu_inactive(struct notifier_block *nfb,
5831                                         unsigned long action, void *hcpu)
5832 {
5833         switch (action & ~CPU_TASKS_FROZEN) {
5834         case CPU_DOWN_PREPARE:
5835                 set_cpu_active((long)hcpu, false);
5836                 return NOTIFY_OK;
5837         default:
5838                 return NOTIFY_DONE;
5839         }
5840 }
5841
5842 static int __init migration_init(void)
5843 {
5844         void *cpu = (void *)(long)smp_processor_id();
5845         int err;
5846
5847         /* Initialize migration for the boot CPU */
5848         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5849         BUG_ON(err == NOTIFY_BAD);
5850         migration_call(&migration_notifier, CPU_ONLINE, cpu);
5851         register_cpu_notifier(&migration_notifier);
5852
5853         /* Register cpu active notifiers */
5854         cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5855         cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5856
5857         return 0;
5858 }
5859 early_initcall(migration_init);
5860
5861 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5862
5863 #ifdef CONFIG_SCHED_DEBUG
5864
5865 static __read_mostly int sched_debug_enabled;
5866
5867 static int __init sched_debug_setup(char *str)
5868 {
5869         sched_debug_enabled = 1;
5870
5871         return 0;
5872 }
5873 early_param("sched_debug", sched_debug_setup);
5874
5875 static inline bool sched_debug(void)
5876 {
5877         return sched_debug_enabled;
5878 }
5879
5880 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5881                                   struct cpumask *groupmask)
5882 {
5883         struct sched_group *group = sd->groups;
5884
5885         cpumask_clear(groupmask);
5886
5887         printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5888
5889         if (!(sd->flags & SD_LOAD_BALANCE)) {
5890                 printk("does not load-balance\n");
5891                 return -1;
5892         }
5893
5894         printk(KERN_CONT "span %*pbl level %s\n",
5895                cpumask_pr_args(sched_domain_span(sd)), sd->name);
5896
5897         if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5898                 printk(KERN_ERR "ERROR: domain->span does not contain "
5899                                 "CPU%d\n", cpu);
5900         }
5901         if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5902                 printk(KERN_ERR "ERROR: domain->groups does not contain"
5903                                 " CPU%d\n", cpu);
5904         }
5905
5906         printk(KERN_DEBUG "%*s groups:", level + 1, "");
5907         do {
5908                 if (!group) {
5909                         printk("\n");
5910                         printk(KERN_ERR "ERROR: group is NULL\n");
5911                         break;
5912                 }
5913
5914                 if (!cpumask_weight(sched_group_cpus(group))) {
5915                         printk(KERN_CONT "\n");
5916                         printk(KERN_ERR "ERROR: empty group\n");
5917                         break;
5918                 }
5919
5920                 if (!(sd->flags & SD_OVERLAP) &&
5921                     cpumask_intersects(groupmask, sched_group_cpus(group))) {
5922                         printk(KERN_CONT "\n");
5923                         printk(KERN_ERR "ERROR: repeated CPUs\n");
5924                         break;
5925                 }
5926
5927                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5928
5929                 printk(KERN_CONT " %*pbl",
5930                        cpumask_pr_args(sched_group_cpus(group)));
5931                 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5932                         printk(KERN_CONT " (cpu_capacity = %lu)",
5933                                 group->sgc->capacity);
5934                 }
5935
5936                 group = group->next;
5937         } while (group != sd->groups);
5938         printk(KERN_CONT "\n");
5939
5940         if (!cpumask_equal(sched_domain_span(sd), groupmask))
5941                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5942
5943         if (sd->parent &&
5944             !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5945                 printk(KERN_ERR "ERROR: parent span is not a superset "
5946                         "of domain->span\n");
5947         return 0;
5948 }
5949
5950 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5951 {
5952         int level = 0;
5953
5954         if (!sched_debug_enabled)
5955                 return;
5956
5957         if (!sd) {
5958                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5959                 return;
5960         }
5961
5962         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5963
5964         for (;;) {
5965                 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5966                         break;
5967                 level++;
5968                 sd = sd->parent;
5969                 if (!sd)
5970                         break;
5971         }
5972 }
5973 #else /* !CONFIG_SCHED_DEBUG */
5974 # define sched_domain_debug(sd, cpu) do { } while (0)
5975 static inline bool sched_debug(void)
5976 {
5977         return false;
5978 }
5979 #endif /* CONFIG_SCHED_DEBUG */
5980
5981 static int sd_degenerate(struct sched_domain *sd)
5982 {
5983         if (cpumask_weight(sched_domain_span(sd)) == 1) {
5984                 if (sd->groups->sge)
5985                         sd->flags &= ~SD_LOAD_BALANCE;
5986                 else
5987                         return 1;
5988         }
5989
5990         /* Following flags need at least 2 groups */
5991         if (sd->flags & (SD_LOAD_BALANCE |
5992                          SD_BALANCE_NEWIDLE |
5993                          SD_BALANCE_FORK |
5994                          SD_BALANCE_EXEC |
5995                          SD_SHARE_CPUCAPACITY |
5996                          SD_ASYM_CPUCAPACITY |
5997                          SD_SHARE_PKG_RESOURCES |
5998                          SD_SHARE_POWERDOMAIN |
5999                          SD_SHARE_CAP_STATES)) {
6000                 if (sd->groups != sd->groups->next)
6001                         return 0;
6002         }
6003
6004         /* Following flags don't use groups */
6005         if (sd->flags & (SD_WAKE_AFFINE))
6006                 return 0;
6007
6008         return 1;
6009 }
6010
6011 static int
6012 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6013 {
6014         unsigned long cflags = sd->flags, pflags = parent->flags;
6015
6016         if (sd_degenerate(parent))
6017                 return 1;
6018
6019         if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6020                 return 0;
6021
6022         /* Flags needing groups don't count if only 1 group in parent */
6023         if (parent->groups == parent->groups->next) {
6024                 pflags &= ~(SD_LOAD_BALANCE |
6025                                 SD_BALANCE_NEWIDLE |
6026                                 SD_BALANCE_FORK |
6027                                 SD_BALANCE_EXEC |
6028                                 SD_ASYM_CPUCAPACITY |
6029                                 SD_SHARE_CPUCAPACITY |
6030                                 SD_SHARE_PKG_RESOURCES |
6031                                 SD_PREFER_SIBLING |
6032                                 SD_SHARE_POWERDOMAIN |
6033                                 SD_SHARE_CAP_STATES);
6034                 if (parent->groups->sge) {
6035                         parent->flags &= ~SD_LOAD_BALANCE;
6036                         return 0;
6037                 }
6038                 if (nr_node_ids == 1)
6039                         pflags &= ~SD_SERIALIZE;
6040         }
6041         if (~cflags & pflags)
6042                 return 0;
6043
6044         return 1;
6045 }
6046
6047 static void free_rootdomain(struct rcu_head *rcu)
6048 {
6049         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6050
6051         cpupri_cleanup(&rd->cpupri);
6052         cpudl_cleanup(&rd->cpudl);
6053         free_cpumask_var(rd->dlo_mask);
6054         free_cpumask_var(rd->rto_mask);
6055         free_cpumask_var(rd->online);
6056         free_cpumask_var(rd->span);
6057         kfree(rd);
6058 }
6059
6060 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6061 {
6062         struct root_domain *old_rd = NULL;
6063         unsigned long flags;
6064
6065         raw_spin_lock_irqsave(&rq->lock, flags);
6066
6067         if (rq->rd) {
6068                 old_rd = rq->rd;
6069
6070                 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6071                         set_rq_offline(rq);
6072
6073                 cpumask_clear_cpu(rq->cpu, old_rd->span);
6074
6075                 /*
6076                  * If we dont want to free the old_rd yet then
6077                  * set old_rd to NULL to skip the freeing later
6078                  * in this function:
6079                  */
6080                 if (!atomic_dec_and_test(&old_rd->refcount))
6081                         old_rd = NULL;
6082         }
6083
6084         atomic_inc(&rd->refcount);
6085         rq->rd = rd;
6086
6087         cpumask_set_cpu(rq->cpu, rd->span);
6088         if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6089                 set_rq_online(rq);
6090
6091         raw_spin_unlock_irqrestore(&rq->lock, flags);
6092
6093         if (old_rd)
6094                 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6095 }
6096
6097 static int init_rootdomain(struct root_domain *rd)
6098 {
6099         memset(rd, 0, sizeof(*rd));
6100
6101         if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
6102                 goto out;
6103         if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
6104                 goto free_span;
6105         if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
6106                 goto free_online;
6107         if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6108                 goto free_dlo_mask;
6109
6110         init_dl_bw(&rd->dl_bw);
6111         if (cpudl_init(&rd->cpudl) != 0)
6112                 goto free_dlo_mask;
6113
6114         if (cpupri_init(&rd->cpupri) != 0)
6115                 goto free_rto_mask;
6116
6117         init_max_cpu_capacity(&rd->max_cpu_capacity);
6118
6119         rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
6120
6121         return 0;
6122
6123 free_rto_mask:
6124         free_cpumask_var(rd->rto_mask);
6125 free_dlo_mask:
6126         free_cpumask_var(rd->dlo_mask);
6127 free_online:
6128         free_cpumask_var(rd->online);
6129 free_span:
6130         free_cpumask_var(rd->span);
6131 out:
6132         return -ENOMEM;
6133 }
6134
6135 /*
6136  * By default the system creates a single root-domain with all cpus as
6137  * members (mimicking the global state we have today).
6138  */
6139 struct root_domain def_root_domain;
6140
6141 static void init_defrootdomain(void)
6142 {
6143         init_rootdomain(&def_root_domain);
6144
6145         atomic_set(&def_root_domain.refcount, 1);
6146 }
6147
6148 static struct root_domain *alloc_rootdomain(void)
6149 {
6150         struct root_domain *rd;
6151
6152         rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6153         if (!rd)
6154                 return NULL;
6155
6156         if (init_rootdomain(rd) != 0) {
6157                 kfree(rd);
6158                 return NULL;
6159         }
6160
6161         return rd;
6162 }
6163
6164 static void free_sched_groups(struct sched_group *sg, int free_sgc)
6165 {
6166         struct sched_group *tmp, *first;
6167
6168         if (!sg)
6169                 return;
6170
6171         first = sg;
6172         do {
6173                 tmp = sg->next;
6174
6175                 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
6176                         kfree(sg->sgc);
6177
6178                 kfree(sg);
6179                 sg = tmp;
6180         } while (sg != first);
6181 }
6182
6183 static void free_sched_domain(struct rcu_head *rcu)
6184 {
6185         struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6186
6187         /*
6188          * If its an overlapping domain it has private groups, iterate and
6189          * nuke them all.
6190          */
6191         if (sd->flags & SD_OVERLAP) {
6192                 free_sched_groups(sd->groups, 1);
6193         } else if (atomic_dec_and_test(&sd->groups->ref)) {
6194                 kfree(sd->groups->sgc);
6195                 kfree(sd->groups);
6196         }
6197         kfree(sd);
6198 }
6199
6200 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6201 {
6202         call_rcu(&sd->rcu, free_sched_domain);
6203 }
6204
6205 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6206 {
6207         for (; sd; sd = sd->parent)
6208                 destroy_sched_domain(sd, cpu);
6209 }
6210
6211 /*
6212  * Keep a special pointer to the highest sched_domain that has
6213  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6214  * allows us to avoid some pointer chasing select_idle_sibling().
6215  *
6216  * Also keep a unique ID per domain (we use the first cpu number in
6217  * the cpumask of the domain), this allows us to quickly tell if
6218  * two cpus are in the same cache domain, see cpus_share_cache().
6219  */
6220 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
6221 DEFINE_PER_CPU(int, sd_llc_size);
6222 DEFINE_PER_CPU(int, sd_llc_id);
6223 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
6224 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
6225 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
6226 DEFINE_PER_CPU(struct sched_domain *, sd_ea);
6227 DEFINE_PER_CPU(struct sched_domain *, sd_scs);
6228
6229 static void update_top_cache_domain(int cpu)
6230 {
6231         struct sched_domain *sd;
6232         struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
6233         int id = cpu;
6234         int size = 1;
6235
6236         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6237         if (sd) {
6238                 id = cpumask_first(sched_domain_span(sd));
6239                 size = cpumask_weight(sched_domain_span(sd));
6240                 busy_sd = sd->parent; /* sd_busy */
6241         }
6242         rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
6243
6244         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6245         per_cpu(sd_llc_size, cpu) = size;
6246         per_cpu(sd_llc_id, cpu) = id;
6247
6248         sd = lowest_flag_domain(cpu, SD_NUMA);
6249         rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
6250
6251         sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
6252         rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
6253
6254         for_each_domain(cpu, sd) {
6255                 if (sd->groups->sge)
6256                         ea_sd = sd;
6257                 else
6258                         break;
6259         }
6260         rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
6261
6262         sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
6263         rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
6264 }
6265
6266 /*
6267  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6268  * hold the hotplug lock.
6269  */
6270 static void
6271 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6272 {
6273         struct rq *rq = cpu_rq(cpu);
6274         struct sched_domain *tmp;
6275
6276         /* Remove the sched domains which do not contribute to scheduling. */
6277         for (tmp = sd; tmp; ) {
6278                 struct sched_domain *parent = tmp->parent;
6279                 if (!parent)
6280                         break;
6281
6282                 if (sd_parent_degenerate(tmp, parent)) {
6283                         tmp->parent = parent->parent;
6284                         if (parent->parent)
6285                                 parent->parent->child = tmp;
6286                         /*
6287                          * Transfer SD_PREFER_SIBLING down in case of a
6288                          * degenerate parent; the spans match for this
6289                          * so the property transfers.
6290                          */
6291                         if (parent->flags & SD_PREFER_SIBLING)
6292                                 tmp->flags |= SD_PREFER_SIBLING;
6293                         destroy_sched_domain(parent, cpu);
6294                 } else
6295                         tmp = tmp->parent;
6296         }
6297
6298         if (sd && sd_degenerate(sd)) {
6299                 tmp = sd;
6300                 sd = sd->parent;
6301                 destroy_sched_domain(tmp, cpu);
6302                 if (sd)
6303                         sd->child = NULL;
6304         }
6305
6306         sched_domain_debug(sd, cpu);
6307
6308         rq_attach_root(rq, rd);
6309         tmp = rq->sd;
6310         rcu_assign_pointer(rq->sd, sd);
6311         destroy_sched_domains(tmp, cpu);
6312
6313         update_top_cache_domain(cpu);
6314 }
6315
6316 /* Setup the mask of cpus configured for isolated domains */
6317 static int __init isolated_cpu_setup(char *str)
6318 {
6319         alloc_bootmem_cpumask_var(&cpu_isolated_map);
6320         cpulist_parse(str, cpu_isolated_map);
6321         return 1;
6322 }
6323
6324 __setup("isolcpus=", isolated_cpu_setup);
6325
6326 struct s_data {
6327         struct sched_domain ** __percpu sd;
6328         struct root_domain      *rd;
6329 };
6330
6331 enum s_alloc {
6332         sa_rootdomain,
6333         sa_sd,
6334         sa_sd_storage,
6335         sa_none,
6336 };
6337
6338 /*
6339  * Build an iteration mask that can exclude certain CPUs from the upwards
6340  * domain traversal.
6341  *
6342  * Only CPUs that can arrive at this group should be considered to continue
6343  * balancing.
6344  *
6345  * Asymmetric node setups can result in situations where the domain tree is of
6346  * unequal depth, make sure to skip domains that already cover the entire
6347  * range.
6348  *
6349  * In that case build_sched_domains() will have terminated the iteration early
6350  * and our sibling sd spans will be empty. Domains should always include the
6351  * cpu they're built on, so check that.
6352  *
6353  */
6354 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6355 {
6356         const struct cpumask *sg_span = sched_group_cpus(sg);
6357         struct sd_data *sdd = sd->private;
6358         struct sched_domain *sibling;
6359         int i;
6360
6361         for_each_cpu(i, sg_span) {
6362                 sibling = *per_cpu_ptr(sdd->sd, i);
6363
6364                 /*
6365                  * Can happen in the asymmetric case, where these siblings are
6366                  * unused. The mask will not be empty because those CPUs that
6367                  * do have the top domain _should_ span the domain.
6368                  */
6369                 if (!sibling->child)
6370                         continue;
6371
6372                 /* If we would not end up here, we can't continue from here */
6373                 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
6374                         continue;
6375
6376                 cpumask_set_cpu(i, sched_group_mask(sg));
6377         }
6378
6379         /* We must not have empty masks here */
6380         WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
6381 }
6382
/*
 * Return the canonical balance cpu of @sg: the first cpu that is both a
 * member of the group and set in the group's iteration mask.
 */
int group_balance_cpu(struct sched_group *sg)
{
	const struct cpumask *members = sched_group_cpus(sg);
	const struct cpumask *eligible = sched_group_mask(sg);

	return cpumask_first_and(members, eligible);
}
6391
6392 static int
6393 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6394 {
6395         struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6396         const struct cpumask *span = sched_domain_span(sd);
6397         struct cpumask *covered = sched_domains_tmpmask;
6398         struct sd_data *sdd = sd->private;
6399         struct sched_domain *sibling;
6400         int i;
6401
6402         cpumask_clear(covered);
6403
6404         for_each_cpu(i, span) {
6405                 struct cpumask *sg_span;
6406
6407                 if (cpumask_test_cpu(i, covered))
6408                         continue;
6409
6410                 sibling = *per_cpu_ptr(sdd->sd, i);
6411
6412                 /* See the comment near build_group_mask(). */
6413                 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6414                         continue;
6415
6416                 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6417                                 GFP_KERNEL, cpu_to_node(cpu));
6418
6419                 if (!sg)
6420                         goto fail;
6421
6422                 sg_span = sched_group_cpus(sg);
6423                 if (sibling->child)
6424                         cpumask_copy(sg_span, sched_domain_span(sibling->child));
6425                 else
6426                         cpumask_set_cpu(i, sg_span);
6427
6428                 cpumask_or(covered, covered, sg_span);
6429
6430                 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
6431                 if (atomic_inc_return(&sg->sgc->ref) == 1)
6432                         build_group_mask(sd, sg);
6433
6434                 /*
6435                  * Initialize sgc->capacity such that even if we mess up the
6436                  * domains and no possible iteration will get us here, we won't
6437                  * die on a /0 trap.
6438                  */
6439                 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
6440                 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
6441                 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
6442
6443                 /*
6444                  * Make sure the first group of this domain contains the
6445                  * canonical balance cpu. Otherwise the sched_domain iteration
6446                  * breaks. See update_sg_lb_stats().
6447                  */
6448                 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6449                     group_balance_cpu(sg) == cpu)
6450                         groups = sg;
6451
6452                 if (!first)
6453                         first = sg;
6454                 if (last)
6455                         last->next = sg;
6456                 last = sg;
6457                 last->next = first;
6458         }
6459         sd->groups = groups;
6460
6461         return 0;
6462
6463 fail:
6464         free_sched_groups(first, 0);
6465
6466         return -ENOMEM;
6467 }
6468
6469 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6470 {
6471         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6472         struct sched_domain *child = sd->child;
6473
6474         if (child)
6475                 cpu = cpumask_first(sched_domain_span(child));
6476
6477         if (sg) {
6478                 *sg = *per_cpu_ptr(sdd->sg, cpu);
6479                 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
6480                 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
6481         }
6482
6483         return cpu;
6484 }
6485
6486 /*
6487  * build_sched_groups will build a circular linked list of the groups
6488  * covered by the given span, and will set each group's ->cpumask correctly,
6489  * and ->cpu_capacity to 0.
6490  *
6491  * Assumes the sched_domain tree is fully constructed
6492  */
6493 static int
6494 build_sched_groups(struct sched_domain *sd, int cpu)
6495 {
6496         struct sched_group *first = NULL, *last = NULL;
6497         struct sd_data *sdd = sd->private;
6498         const struct cpumask *span = sched_domain_span(sd);
6499         struct cpumask *covered;
6500         int i;
6501
6502         get_group(cpu, sdd, &sd->groups);
6503         atomic_inc(&sd->groups->ref);
6504
6505         if (cpu != cpumask_first(span))
6506                 return 0;
6507
6508         lockdep_assert_held(&sched_domains_mutex);
6509         covered = sched_domains_tmpmask;
6510
6511         cpumask_clear(covered);
6512
6513         for_each_cpu(i, span) {
6514                 struct sched_group *sg;
6515                 int group, j;
6516
6517                 if (cpumask_test_cpu(i, covered))
6518                         continue;
6519
6520                 group = get_group(i, sdd, &sg);
6521                 cpumask_setall(sched_group_mask(sg));
6522
6523                 for_each_cpu(j, span) {
6524                         if (get_group(j, sdd, NULL) != group)
6525                                 continue;
6526
6527                         cpumask_set_cpu(j, covered);
6528                         cpumask_set_cpu(j, sched_group_cpus(sg));
6529                 }
6530
6531                 if (!first)
6532                         first = sg;
6533                 if (last)
6534                         last->next = sg;
6535                 last = sg;
6536         }
6537         last->next = first;
6538
6539         return 0;
6540 }
6541
6542 /*
6543  * Initialize sched groups cpu_capacity.
6544  *
6545  * cpu_capacity indicates the capacity of sched group, which is used while
6546  * distributing the load between different sched groups in a sched domain.
6547  * Typically cpu_capacity for all the groups in a sched domain will be same
6548  * unless there are asymmetries in the topology. If there are asymmetries,
6549  * group having more cpu_capacity will pickup more load compared to the
6550  * group having less cpu_capacity.
6551  */
6552 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6553 {
6554         struct sched_group *sg = sd->groups;
6555
6556         WARN_ON(!sg);
6557
6558         do {
6559                 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6560                 sg = sg->next;
6561         } while (sg != sd->groups);
6562
6563         if (cpu != group_balance_cpu(sg))
6564                 return;
6565
6566         update_group_capacity(sd, cpu);
6567         atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
6568 }
6569
6570 /*
6571  * Check that the per-cpu provided sd energy data is consistent for all cpus
6572  * within the mask.
6573  */
6574 static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
6575                                            const struct cpumask *cpumask)
6576 {
6577         const struct sched_group_energy * const sge = fn(cpu);
6578         struct cpumask mask;
6579         int i;
6580
6581         if (cpumask_weight(cpumask) <= 1)
6582                 return;
6583
6584         cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
6585
6586         for_each_cpu(i, &mask) {
6587                 const struct sched_group_energy * const e = fn(i);
6588                 int y;
6589
6590                 BUG_ON(e->nr_idle_states != sge->nr_idle_states);
6591
6592                 for (y = 0; y < (e->nr_idle_states); y++) {
6593                         BUG_ON(e->idle_states[y].power !=
6594                                         sge->idle_states[y].power);
6595                 }
6596
6597                 BUG_ON(e->nr_cap_states != sge->nr_cap_states);
6598
6599                 for (y = 0; y < (e->nr_cap_states); y++) {
6600                         BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
6601                         BUG_ON(e->cap_states[y].power !=
6602                                         sge->cap_states[y].power);
6603                 }
6604         }
6605 }
6606
6607 static void init_sched_energy(int cpu, struct sched_domain *sd,
6608                               sched_domain_energy_f fn)
6609 {
6610         if (!(fn && fn(cpu)))
6611                 return;
6612
6613         if (cpu != group_balance_cpu(sd->groups))
6614                 return;
6615
6616         if (sd->child && !sd->child->groups->sge) {
6617                 pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
6618 #ifdef CONFIG_SCHED_DEBUG
6619                 pr_err("     energy data on %s but not on %s domain\n",
6620                         sd->name, sd->child->name);
6621 #endif
6622                 return;
6623         }
6624
6625         check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
6626
6627         sd->groups->sge = fn(cpu);
6628 }
6629
6630 /*
6631  * Initializers for schedule domains
6632  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6633  */
6634
6635 static int default_relax_domain_level = -1;
6636 int sched_domain_level_max;
6637
6638 static int __init setup_relax_domain_level(char *str)
6639 {
6640         if (kstrtoint(str, 0, &default_relax_domain_level))
6641                 pr_warn("Unable to set relax_domain_level\n");
6642
6643         return 1;
6644 }
6645 __setup("relax_domain_level=", setup_relax_domain_level);
6646
6647 static void set_domain_attribute(struct sched_domain *sd,
6648                                  struct sched_domain_attr *attr)
6649 {
6650         int request;
6651
6652         if (!attr || attr->relax_domain_level < 0) {
6653                 if (default_relax_domain_level < 0)
6654                         return;
6655                 else
6656                         request = default_relax_domain_level;
6657         } else
6658                 request = attr->relax_domain_level;
6659         if (request < sd->level) {
6660                 /* turn off idle balance on this domain */
6661                 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6662         } else {
6663                 /* turn on idle balance on this domain */
6664                 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6665         }
6666 }
6667
6668 static void __sdt_free(const struct cpumask *cpu_map);
6669 static int __sdt_alloc(const struct cpumask *cpu_map);
6670
6671 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6672                                  const struct cpumask *cpu_map)
6673 {
6674         switch (what) {
6675         case sa_rootdomain:
6676                 if (!atomic_read(&d->rd->refcount))
6677                         free_rootdomain(&d->rd->rcu); /* fall through */
6678         case sa_sd:
6679                 free_percpu(d->sd); /* fall through */
6680         case sa_sd_storage:
6681                 __sdt_free(cpu_map); /* fall through */
6682         case sa_none:
6683                 break;
6684         }
6685 }
6686
6687 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6688                                                    const struct cpumask *cpu_map)
6689 {
6690         memset(d, 0, sizeof(*d));
6691
6692         if (__sdt_alloc(cpu_map))
6693                 return sa_sd_storage;
6694         d->sd = alloc_percpu(struct sched_domain *);
6695         if (!d->sd)
6696                 return sa_sd_storage;
6697         d->rd = alloc_rootdomain();
6698         if (!d->rd)
6699                 return sa_sd;
6700         return sa_rootdomain;
6701 }
6702
6703 /*
6704  * NULL the sd_data elements we've used to build the sched_domain and
6705  * sched_group structure so that the subsequent __free_domain_allocs()
6706  * will not free the data we're using.
6707  */
6708 static void claim_allocations(int cpu, struct sched_domain *sd)
6709 {
6710         struct sd_data *sdd = sd->private;
6711
6712         WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6713         *per_cpu_ptr(sdd->sd, cpu) = NULL;
6714
6715         if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6716                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6717
6718         if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
6719                 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
6720 }
6721
#ifdef CONFIG_NUMA
/* Number of usable entries in the two NUMA arrays below. */
static int sched_domains_numa_levels;
enum numa_topology_type sched_numa_topology_type;
/* Unique node distances, one per NUMA level; filled by sched_init_numa(). */
static int *sched_domains_numa_distance;
/* Distance of the last (highest) NUMA level. */
int sched_max_numa_distance;
/* Indexed [level][node]: cpus of all nodes within that level's distance. */
static struct cpumask ***sched_domains_numa_masks;
/* Level currently being initialised; read by sd_numa_mask(). */
static int sched_domains_curr_level;
#endif
6730
/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *   SD_SHARE_POWERDOMAIN   - describes shared power domain
 *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
 *   SD_SHARE_CAP_STATES    - describes shared capacity states
 *
 * Odd one out, which besides describing the topology also prescribes the
 * desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY |		\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA |			\
	 SD_SHARE_POWERDOMAIN |		\
	 SD_ASYM_CPUCAPACITY |		\
	 SD_SHARE_CAP_STATES |		\
	 SD_ASYM_PACKING)
6759
6760 static struct sched_domain *
6761 sd_init(struct sched_domain_topology_level *tl,
6762         struct sched_domain *child, int cpu)
6763 {
6764         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6765         int sd_weight, sd_flags = 0;
6766
6767 #ifdef CONFIG_NUMA
6768         /*
6769          * Ugly hack to pass state to sd_numa_mask()...
6770          */
6771         sched_domains_curr_level = tl->numa_level;
6772 #endif
6773
6774         sd_weight = cpumask_weight(tl->mask(cpu));
6775
6776         if (tl->sd_flags)
6777                 sd_flags = (*tl->sd_flags)();
6778         if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6779                         "wrong sd_flags in topology description\n"))
6780                 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6781
6782         *sd = (struct sched_domain){
6783                 .min_interval           = sd_weight,
6784                 .max_interval           = 2*sd_weight,
6785                 .busy_factor            = 32,
6786                 .imbalance_pct          = 125,
6787
6788                 .cache_nice_tries       = 0,
6789                 .busy_idx               = 0,
6790                 .idle_idx               = 0,
6791                 .newidle_idx            = 0,
6792                 .wake_idx               = 0,
6793                 .forkexec_idx           = 0,
6794
6795                 .flags                  = 1*SD_LOAD_BALANCE
6796                                         | 1*SD_BALANCE_NEWIDLE
6797                                         | 1*SD_BALANCE_EXEC
6798                                         | 1*SD_BALANCE_FORK
6799                                         | 0*SD_BALANCE_WAKE
6800                                         | 1*SD_WAKE_AFFINE
6801                                         | 0*SD_SHARE_CPUCAPACITY
6802                                         | 0*SD_SHARE_PKG_RESOURCES
6803                                         | 0*SD_SERIALIZE
6804                                         | 0*SD_PREFER_SIBLING
6805                                         | 0*SD_NUMA
6806                                         | sd_flags
6807                                         ,
6808
6809                 .last_balance           = jiffies,
6810                 .balance_interval       = sd_weight,
6811                 .smt_gain               = 0,
6812                 .max_newidle_lb_cost    = 0,
6813                 .next_decay_max_lb_cost = jiffies,
6814                 .child                  = child,
6815 #ifdef CONFIG_SCHED_DEBUG
6816                 .name                   = tl->name,
6817 #endif
6818         };
6819
6820         /*
6821          * Convert topological properties into behaviour.
6822          */
6823
6824         if (sd->flags & SD_ASYM_CPUCAPACITY) {
6825                 struct sched_domain *t = sd;
6826
6827                 for_each_lower_domain(t)
6828                         t->flags |= SD_BALANCE_WAKE;
6829         }
6830
6831         if (sd->flags & SD_SHARE_CPUCAPACITY) {
6832                 sd->flags |= SD_PREFER_SIBLING;
6833                 sd->imbalance_pct = 110;
6834                 sd->smt_gain = 1178; /* ~15% */
6835
6836         } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6837                 sd->imbalance_pct = 117;
6838                 sd->cache_nice_tries = 1;
6839                 sd->busy_idx = 2;
6840
6841 #ifdef CONFIG_NUMA
6842         } else if (sd->flags & SD_NUMA) {
6843                 sd->cache_nice_tries = 2;
6844                 sd->busy_idx = 3;
6845                 sd->idle_idx = 2;
6846
6847                 sd->flags |= SD_SERIALIZE;
6848                 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6849                         sd->flags &= ~(SD_BALANCE_EXEC |
6850                                        SD_BALANCE_FORK |
6851                                        SD_WAKE_AFFINE);
6852                 }
6853
6854 #endif
6855         } else {
6856                 sd->flags |= SD_PREFER_SIBLING;
6857                 sd->cache_nice_tries = 1;
6858                 sd->busy_idx = 2;
6859                 sd->idle_idx = 1;
6860         }
6861
6862         sd->private = &tl->data;
6863
6864         return sd;
6865 }
6866
6867 /*
6868  * Topology list, bottom-up.
6869  */
6870 static struct sched_domain_topology_level default_topology[] = {
6871 #ifdef CONFIG_SCHED_SMT
6872         { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6873 #endif
6874 #ifdef CONFIG_SCHED_MC
6875         { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6876 #endif
6877         { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6878         { NULL, },
6879 };
6880
6881 static struct sched_domain_topology_level *sched_domain_topology =
6882         default_topology;
6883
/*
 * Iterate @tl over the active topology levels, bottom-up, stopping at the
 * terminating entry (the one without a mask). @tl is parenthesized so that
 * any lvalue expression can be used as the cursor.
 */
#define for_each_sd_topology(tl)			\
	for ((tl) = sched_domain_topology; (tl)->mask; (tl)++)
6886
6887 void set_sched_topology(struct sched_domain_topology_level *tl)
6888 {
6889         sched_domain_topology = tl;
6890 }
6891
6892 #ifdef CONFIG_NUMA
6893
6894 static const struct cpumask *sd_numa_mask(int cpu)
6895 {
6896         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6897 }
6898
6899 static void sched_numa_warn(const char *str)
6900 {
6901         static int done = false;
6902         int i,j;
6903
6904         if (done)
6905                 return;
6906
6907         done = true;
6908
6909         printk(KERN_WARNING "ERROR: %s\n\n", str);
6910
6911         for (i = 0; i < nr_node_ids; i++) {
6912                 printk(KERN_WARNING "  ");
6913                 for (j = 0; j < nr_node_ids; j++)
6914                         printk(KERN_CONT "%02d ", node_distance(i,j));
6915                 printk(KERN_CONT "\n");
6916         }
6917         printk(KERN_WARNING "\n");
6918 }
6919
6920 bool find_numa_distance(int distance)
6921 {
6922         int i;
6923
6924         if (distance == node_distance(0, 0))
6925                 return true;
6926
6927         for (i = 0; i < sched_domains_numa_levels; i++) {
6928                 if (sched_domains_numa_distance[i] == distance)
6929                         return true;
6930         }
6931
6932         return false;
6933 }
6934
6935 /*
6936  * A system can have three types of NUMA topology:
6937  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6938  * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6939  * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6940  *
6941  * The difference between a glueless mesh topology and a backplane
6942  * topology lies in whether communication between not directly
6943  * connected nodes goes through intermediary nodes (where programs
6944  * could run), or through backplane controllers. This affects
6945  * placement of programs.
6946  *
6947  * The type of topology can be discerned with the following tests:
6948  * - If the maximum distance between any nodes is 1 hop, the system
6949  *   is directly connected.
6950  * - If for two nodes A and B, located N > 1 hops away from each other,
6951  *   there is an intermediary node C, which is < N hops away from both
6952  *   nodes A and B, the system is a glueless mesh.
6953  */
6954 static void init_numa_topology_type(void)
6955 {
6956         int a, b, c, n;
6957
6958         n = sched_max_numa_distance;
6959
6960         if (sched_domains_numa_levels <= 1) {
6961                 sched_numa_topology_type = NUMA_DIRECT;
6962                 return;
6963         }
6964
6965         for_each_online_node(a) {
6966                 for_each_online_node(b) {
6967                         /* Find two nodes furthest removed from each other. */
6968                         if (node_distance(a, b) < n)
6969                                 continue;
6970
6971                         /* Is there an intermediary node between a and b? */
6972                         for_each_online_node(c) {
6973                                 if (node_distance(a, c) < n &&
6974                                     node_distance(b, c) < n) {
6975                                         sched_numa_topology_type =
6976                                                         NUMA_GLUELESS_MESH;
6977                                         return;
6978                                 }
6979                         }
6980
6981                         sched_numa_topology_type = NUMA_BACKPLANE;
6982                         return;
6983                 }
6984         }
6985 }
6986
6987 static void sched_init_numa(void)
6988 {
6989         int next_distance, curr_distance = node_distance(0, 0);
6990         struct sched_domain_topology_level *tl;
6991         int level = 0;
6992         int i, j, k;
6993
6994         sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6995         if (!sched_domains_numa_distance)
6996                 return;
6997
6998         /*
6999          * O(nr_nodes^2) deduplicating selection sort -- in order to find the
7000          * unique distances in the node_distance() table.
7001          *
7002          * Assumes node_distance(0,j) includes all distances in
7003          * node_distance(i,j) in order to avoid cubic time.
7004          */
7005         next_distance = curr_distance;
7006         for (i = 0; i < nr_node_ids; i++) {
7007                 for (j = 0; j < nr_node_ids; j++) {
7008                         for (k = 0; k < nr_node_ids; k++) {
7009                                 int distance = node_distance(i, k);
7010
7011                                 if (distance > curr_distance &&
7012                                     (distance < next_distance ||
7013                                      next_distance == curr_distance))
7014                                         next_distance = distance;
7015
7016                                 /*
7017                                  * While not a strong assumption it would be nice to know
7018                                  * about cases where if node A is connected to B, B is not
7019                                  * equally connected to A.
7020                                  */
7021                                 if (sched_debug() && node_distance(k, i) != distance)
7022                                         sched_numa_warn("Node-distance not symmetric");
7023
7024                                 if (sched_debug() && i && !find_numa_distance(distance))
7025                                         sched_numa_warn("Node-0 not representative");
7026                         }
7027                         if (next_distance != curr_distance) {
7028                                 sched_domains_numa_distance[level++] = next_distance;
7029                                 sched_domains_numa_levels = level;
7030                                 curr_distance = next_distance;
7031                         } else break;
7032                 }
7033
7034                 /*
7035                  * In case of sched_debug() we verify the above assumption.
7036                  */
7037                 if (!sched_debug())
7038                         break;
7039         }
7040
7041         if (!level)
7042                 return;
7043
7044         /*
7045          * 'level' contains the number of unique distances, excluding the
7046          * identity distance node_distance(i,i).
7047          *
7048          * The sched_domains_numa_distance[] array includes the actual distance
7049          * numbers.
7050          */
7051
7052         /*
7053          * Here, we should temporarily reset sched_domains_numa_levels to 0.
7054          * If it fails to allocate memory for array sched_domains_numa_masks[][],
7055          * the array will contain less then 'level' members. This could be
7056          * dangerous when we use it to iterate array sched_domains_numa_masks[][]
7057          * in other functions.
7058          *
7059          * We reset it to 'level' at the end of this function.
7060          */
7061         sched_domains_numa_levels = 0;
7062
7063         sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
7064         if (!sched_domains_numa_masks)
7065                 return;
7066
7067         /*
7068          * Now for each level, construct a mask per node which contains all
7069          * cpus of nodes that are that many hops away from us.
7070          */
7071         for (i = 0; i < level; i++) {
7072                 sched_domains_numa_masks[i] =
7073                         kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
7074                 if (!sched_domains_numa_masks[i])
7075                         return;
7076
7077                 for (j = 0; j < nr_node_ids; j++) {
7078                         struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
7079                         if (!mask)
7080                                 return;
7081
7082                         sched_domains_numa_masks[i][j] = mask;
7083
7084                         for_each_node(k) {
7085                                 if (node_distance(j, k) > sched_domains_numa_distance[i])
7086                                         continue;
7087
7088                                 cpumask_or(mask, mask, cpumask_of_node(k));
7089                         }
7090                 }
7091         }
7092
7093         /* Compute default topology size */
7094         for (i = 0; sched_domain_topology[i].mask; i++);
7095
7096         tl = kzalloc((i + level + 1) *
7097                         sizeof(struct sched_domain_topology_level), GFP_KERNEL);
7098         if (!tl)
7099                 return;
7100
7101         /*
7102          * Copy the default topology bits..
7103          */
7104         for (i = 0; sched_domain_topology[i].mask; i++)
7105                 tl[i] = sched_domain_topology[i];
7106
7107         /*
7108          * .. and append 'j' levels of NUMA goodness.
7109          */
7110         for (j = 0; j < level; i++, j++) {
7111                 tl[i] = (struct sched_domain_topology_level){
7112                         .mask = sd_numa_mask,
7113                         .sd_flags = cpu_numa_flags,
7114                         .flags = SDTL_OVERLAP,
7115                         .numa_level = j,
7116                         SD_INIT_NAME(NUMA)
7117                 };
7118         }
7119
7120         sched_domain_topology = tl;
7121
7122         sched_domains_numa_levels = level;
7123         sched_max_numa_distance = sched_domains_numa_distance[level - 1];
7124
7125         init_numa_topology_type();
7126 }
7127
7128 static void sched_domains_numa_masks_set(int cpu)
7129 {
7130         int i, j;
7131         int node = cpu_to_node(cpu);
7132
7133         for (i = 0; i < sched_domains_numa_levels; i++) {
7134                 for (j = 0; j < nr_node_ids; j++) {
7135                         if (node_distance(j, node) <= sched_domains_numa_distance[i])
7136                                 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
7137                 }
7138         }
7139 }
7140
7141 static void sched_domains_numa_masks_clear(int cpu)
7142 {
7143         int i, j;
7144         for (i = 0; i < sched_domains_numa_levels; i++) {
7145                 for (j = 0; j < nr_node_ids; j++)
7146                         cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
7147         }
7148 }
7149
7150 /*
7151  * Update sched_domains_numa_masks[level][node] array when new cpus
7152  * are onlined.
7153  */
7154 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
7155                                            unsigned long action,
7156                                            void *hcpu)
7157 {
7158         int cpu = (long)hcpu;
7159
7160         switch (action & ~CPU_TASKS_FROZEN) {
7161         case CPU_ONLINE:
7162                 sched_domains_numa_masks_set(cpu);
7163                 break;
7164
7165         case CPU_DEAD:
7166                 sched_domains_numa_masks_clear(cpu);
7167                 break;
7168
7169         default:
7170                 return NOTIFY_DONE;
7171         }
7172
7173         return NOTIFY_OK;
7174 }
7175 #else
/* Without CONFIG_NUMA there are no NUMA levels to set up. */
static inline void sched_init_numa(void) { }
7179
/* Without CONFIG_NUMA there are no NUMA masks to maintain. */
static int sched_domains_numa_masks_update(struct notifier_block *nfb,
					   unsigned long action, void *hcpu)
{
	return 0;
}
7186 #endif /* CONFIG_NUMA */
7187
7188 static int __sdt_alloc(const struct cpumask *cpu_map)
7189 {
7190         struct sched_domain_topology_level *tl;
7191         int j;
7192
7193         for_each_sd_topology(tl) {
7194                 struct sd_data *sdd = &tl->data;
7195
7196                 sdd->sd = alloc_percpu(struct sched_domain *);
7197                 if (!sdd->sd)
7198                         return -ENOMEM;
7199
7200                 sdd->sg = alloc_percpu(struct sched_group *);
7201                 if (!sdd->sg)
7202                         return -ENOMEM;
7203
7204                 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
7205                 if (!sdd->sgc)
7206                         return -ENOMEM;
7207
7208                 for_each_cpu(j, cpu_map) {
7209                         struct sched_domain *sd;
7210                         struct sched_group *sg;
7211                         struct sched_group_capacity *sgc;
7212
7213                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7214                                         GFP_KERNEL, cpu_to_node(j));
7215                         if (!sd)
7216                                 return -ENOMEM;
7217
7218                         *per_cpu_ptr(sdd->sd, j) = sd;
7219
7220                         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7221                                         GFP_KERNEL, cpu_to_node(j));
7222                         if (!sg)
7223                                 return -ENOMEM;
7224
7225                         sg->next = sg;
7226
7227                         *per_cpu_ptr(sdd->sg, j) = sg;
7228
7229                         sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
7230                                         GFP_KERNEL, cpu_to_node(j));
7231                         if (!sgc)
7232                                 return -ENOMEM;
7233
7234                         *per_cpu_ptr(sdd->sgc, j) = sgc;
7235                 }
7236         }
7237
7238         return 0;
7239 }
7240
7241 static void __sdt_free(const struct cpumask *cpu_map)
7242 {
7243         struct sched_domain_topology_level *tl;
7244         int j;
7245
7246         for_each_sd_topology(tl) {
7247                 struct sd_data *sdd = &tl->data;
7248
7249                 for_each_cpu(j, cpu_map) {
7250                         struct sched_domain *sd;
7251
7252                         if (sdd->sd) {
7253                                 sd = *per_cpu_ptr(sdd->sd, j);
7254                                 if (sd && (sd->flags & SD_OVERLAP))
7255                                         free_sched_groups(sd->groups, 0);
7256                                 kfree(*per_cpu_ptr(sdd->sd, j));
7257                         }
7258
7259                         if (sdd->sg)
7260                                 kfree(*per_cpu_ptr(sdd->sg, j));
7261                         if (sdd->sgc)
7262                                 kfree(*per_cpu_ptr(sdd->sgc, j));
7263                 }
7264                 free_percpu(sdd->sd);
7265                 sdd->sd = NULL;
7266                 free_percpu(sdd->sg);
7267                 sdd->sg = NULL;
7268                 free_percpu(sdd->sgc);
7269                 sdd->sgc = NULL;
7270         }
7271 }
7272
7273 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7274                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7275                 struct sched_domain *child, int cpu)
7276 {
7277         struct sched_domain *sd = sd_init(tl, child, cpu);
7278
7279         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7280         if (child) {
7281                 sd->level = child->level + 1;
7282                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7283                 child->parent = sd;
7284
7285                 if (!cpumask_subset(sched_domain_span(child),
7286                                     sched_domain_span(sd))) {
7287                         pr_err("BUG: arch topology borken\n");
7288 #ifdef CONFIG_SCHED_DEBUG
7289                         pr_err("     the %s domain not a subset of the %s domain\n",
7290                                         child->name, sd->name);
7291 #endif
7292                         /* Fixup, ensure @sd has at least @child cpus. */
7293                         cpumask_or(sched_domain_span(sd),
7294                                    sched_domain_span(sd),
7295                                    sched_domain_span(child));
7296                 }
7297
7298         }
7299         set_domain_attribute(sd, attr);
7300
7301         return sd;
7302 }
7303
/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 *
 * Returns 0 on success.  Returns -ENOMEM when the initial allocation step
 * does not reach the sa_rootdomain state or when building the groups of any
 * domain fails.  __free_domain_allocs() is called on every exit path.
 */
static int build_sched_domains(const struct cpumask *cpu_map,
                               struct sched_domain_attr *attr)
{
        enum s_alloc alloc_state;
        struct sched_domain *sd;
        struct s_data d;
        int i, ret = -ENOMEM;

        alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
        if (alloc_state != sa_rootdomain)
                goto error;

        /* Set up domains for cpus specified by the cpu_map. */
        for_each_cpu(i, cpu_map) {
                struct sched_domain_topology_level *tl;

                /*
                 * Walk the topology levels in order; each new domain is
                 * built with the previous level's domain as its child.
                 */
                sd = NULL;
                for_each_sd_topology(tl) {
                        sd = build_sched_domain(tl, cpu_map, attr, sd, i);
                        /* The first level's domain is this cpu's base domain. */
                        if (tl == sched_domain_topology)
                                *per_cpu_ptr(d.sd, i) = sd;
                        if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
                                sd->flags |= SD_OVERLAP;
                }
        }

        /* Build the groups for the domains */
        for_each_cpu(i, cpu_map) {
                for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
                        sd->span_weight = cpumask_weight(sched_domain_span(sd));
                        if (sd->flags & SD_OVERLAP) {
                                if (build_overlap_sched_groups(sd, i))
                                        goto error;
                        } else {
                                if (build_sched_groups(sd, i))
                                        goto error;
                        }
                }
        }

        /* Calculate CPU capacity for physical packages and nodes */
        /* Note: cpus are visited from the highest index downwards here. */
        for (i = nr_cpumask_bits-1; i >= 0; i--) {
                struct sched_domain_topology_level *tl = sched_domain_topology;

                if (!cpumask_test_cpu(i, cpu_map))
                        continue;

                /* tl advances in step with the walk up the parent chain. */
                for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
                        init_sched_energy(i, sd, tl->energy);
                        claim_allocations(i, sd);
                        init_sched_groups_capacity(i, sd);
                }
        }

        /* Attach the domains */
        rcu_read_lock();
        for_each_cpu(i, cpu_map) {
                int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
                int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);

                /*
                 * Track in the root domain which cpu has the largest and
                 * which the smallest cpu_capacity_orig; a negative value
                 * means no cpu has been recorded yet.
                 */
                if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
                    cpu_rq(max_cpu)->cpu_capacity_orig))
                        WRITE_ONCE(d.rd->max_cap_orig_cpu, i);

                if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
                    cpu_rq(min_cpu)->cpu_capacity_orig))
                        WRITE_ONCE(d.rd->min_cap_orig_cpu, i);

                sd = *per_cpu_ptr(d.sd, i);

                cpu_attach_domain(sd, d.rd, i);
        }
        rcu_read_unlock();

        ret = 0;
error:
        /* Reached on success as well; alloc_state tells it what to release. */
        __free_domain_allocs(&d, alloc_state, cpu_map);
        return ret;
}
7387
static cpumask_var_t *doms_cur; /* current sched domains */
static int ndoms_cur;           /* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
                                /* attributes of custom domains in 'doms_cur' */

/*
 * Special case: If a kmalloc of a doms_cur partition (array of
 * cpumask) fails, then fall back to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;
7399
/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 *
 * This is the __weak default, which reports "unchanged"; an architecture
 * can provide its own definition to override it.
 */
int __weak arch_update_cpu_topology(void)
{
        return 0;
}
7409
7410 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7411 {
7412         int i;
7413         cpumask_var_t *doms;
7414
7415         doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7416         if (!doms)
7417                 return NULL;
7418         for (i = 0; i < ndoms; i++) {
7419                 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7420                         free_sched_domains(doms, i);
7421                         return NULL;
7422                 }
7423         }
7424         return doms;
7425 }
7426
7427 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7428 {
7429         unsigned int i;
7430         for (i = 0; i < ndoms; i++)
7431                 free_cpumask_var(doms[i]);
7432         kfree(doms);
7433 }
7434
7435 /*
7436  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7437  * For now this just excludes isolated cpus, but could be used to
7438  * exclude other special cases in the future.
7439  */
7440 static int init_sched_domains(const struct cpumask *cpu_map)
7441 {
7442         int err;
7443
7444         arch_update_cpu_topology();
7445         ndoms_cur = 1;
7446         doms_cur = alloc_sched_domains(ndoms_cur);
7447         if (!doms_cur)
7448                 doms_cur = &fallback_doms;
7449         cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7450         err = build_sched_domains(doms_cur[0], NULL);
7451         register_sched_domain_sysctl();
7452
7453         return err;
7454 }
7455
7456 /*
7457  * Detach sched domains from a group of cpus specified in cpu_map
7458  * These cpus will now be attached to the NULL domain
7459  */
7460 static void detach_destroy_domains(const struct cpumask *cpu_map)
7461 {
7462         int i;
7463
7464         rcu_read_lock();
7465         for_each_cpu(i, cpu_map)
7466                 cpu_attach_domain(NULL, &def_root_domain, i);
7467         rcu_read_unlock();
7468 }
7469
7470 /* handle null as "default" */
7471 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7472                         struct sched_domain_attr *new, int idx_new)
7473 {
7474         struct sched_domain_attr tmp;
7475
7476         /* fast path */
7477         if (!new && !cur)
7478                 return 1;
7479
7480         tmp = SD_ATTR_INIT;
7481         return !memcmp(cur ? (cur + idx_cur) : &tmp,
7482                         new ? (new + idx_new) : &tmp,
7483                         sizeof(struct sched_domain_attr));
7484 }
7485
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should setup one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains.  This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fallback to the single partition
 * 'fallback_doms', it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with 'fallback_doms', set to
 * cpu_active_mask with the cpus in cpu_isolated_map removed.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock held
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                             struct sched_domain_attr *dattr_new)
{
        int i, j, n;
        int new_topology;

        mutex_lock(&sched_domains_mutex);

        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();

        /* Let architecture update cpu core mappings. */
        new_topology = arch_update_cpu_topology();

        /* With doms_new == NULL nothing can match, so all are destroyed. */
        n = doms_new ? ndoms_new : 0;

        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
                /* A topology change disables matching: nothing is kept. */
                for (j = 0; j < n && !new_topology; j++) {
                        if (cpumask_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
                /* no match - a current sched domain not in new doms_new[] */
                detach_destroy_domains(doms_cur[i]);
match1:
                ;
        }

        n = ndoms_cur;
        if (doms_new == NULL) {
                /* n = 0 disables matching below, forcing a rebuild. */
                n = 0;
                doms_new = &fallback_doms;
                cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }

        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < n && !new_topology; j++) {
                        if (cpumask_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
                /* no match - add a new doms_new */
                build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
                ;
        }

        /* Remember the new sched domains */
        /* fallback_doms is not from alloc_sched_domains(); never free it. */
        if (doms_cur != &fallback_doms)
                free_sched_domains(doms_cur, ndoms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
        dattr_cur = dattr_new;
        ndoms_cur = ndoms_new;

        register_sched_domain_sysctl();

        mutex_unlock(&sched_domains_mutex);
}
7574
static int num_cpus_frozen;     /* used to mark begin/end of suspend/resume */

/*
 * Update cpusets according to cpu_active mask.  If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
 *
 * If we come here as part of a suspend/resume, don't touch cpusets because we
 * want to restore it back to its original state upon resume anyway.
 *
 * Returns NOTIFY_OK for the actions handled below and NOTIFY_DONE for
 * everything else.
 */
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                             void *hcpu)
{
        switch (action) {
        case CPU_ONLINE_FROZEN:
        case CPU_DOWN_FAILED_FROZEN:

                /*
                 * num_cpus_frozen tracks how many CPUs are involved in suspend
                 * resume sequence. As long as this is not the last online
                 * operation in the resume sequence, just build a single sched
                 * domain, ignoring cpusets.
                 */
                partition_sched_domains(1, NULL, NULL);
                if (--num_cpus_frozen)
                        break;

                /*
                 * This is the last CPU online operation. So fall through and
                 * restore the original sched domains by considering the
                 * cpuset configurations.
                 */
                cpuset_force_rebuild();

                /* fall through */
        case CPU_ONLINE:
                cpuset_update_active_cpus(true);
                break;
        default:
                return NOTIFY_DONE;
        }
        return NOTIFY_OK;
}
7617
7618 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7619                                void *hcpu)
7620 {
7621         unsigned long flags;
7622         long cpu = (long)hcpu;
7623         struct dl_bw *dl_b;
7624         bool overflow;
7625         int cpus;
7626
7627         switch (action) {
7628         case CPU_DOWN_PREPARE:
7629                 rcu_read_lock_sched();
7630                 dl_b = dl_bw_of(cpu);
7631
7632                 raw_spin_lock_irqsave(&dl_b->lock, flags);
7633                 cpus = dl_bw_cpus(cpu);
7634                 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7635                 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7636
7637                 rcu_read_unlock_sched();
7638
7639                 if (overflow)
7640                         return notifier_from_errno(-EBUSY);
7641                 cpuset_update_active_cpus(false);
7642                 break;
7643         case CPU_DOWN_PREPARE_FROZEN:
7644                 num_cpus_frozen++;
7645                 partition_sched_domains(1, NULL, NULL);
7646                 break;
7647         default:
7648                 return NOTIFY_DONE;
7649         }
7650         return NOTIFY_OK;
7651 }
7652
/*
 * SMP part of scheduler initialisation: sets up NUMA data, builds the
 * initial sched domains from cpu_active_mask, registers the hotplug
 * notifiers and restricts the calling task to the non-isolated cpus.
 */
void __init sched_init_smp(void)
{
        cpumask_var_t non_isolated_cpus;

        /* NOTE(review): the results of these two allocations are not checked. */
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

        sched_init_numa();

        /*
         * There's no userspace yet to cause hotplug operations; hence all the
         * cpu masks are stable and all blatant races in the below code cannot
         * happen.
         */
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        /* If every possible cpu is isolated, fall back to the current one. */
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);

        hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
        hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
        hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);

        init_hrtick();

        /* Move init over to a non-isolated CPU */
        if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
        free_cpumask_var(non_isolated_cpus);

        init_sched_rt_class();
        init_sched_dl_class();
}
7689 #else
/* !CONFIG_SMP variant: only the scheduler granularity needs setting up. */
void __init sched_init_smp(void)
{
        sched_init_granularity();
}
7694 #endif /* CONFIG_SMP */
7695
7696 int in_sched_functions(unsigned long addr)
7697 {
7698         return in_lock_functions(addr) ||
7699                 (addr >= (unsigned long)__sched_text_start
7700                 && addr < (unsigned long)__sched_text_end);
7701 }
7702
#ifdef CONFIG_CGROUP_SCHED
/*
 * Default task group.
 * Every task in system belongs to this group at bootup.
 */
struct task_group root_task_group;
/* List of task groups; sched_init() adds root_task_group to it. */
LIST_HEAD(task_groups);
#endif
7711
/*
 * Per-cpu cpumask; sched_init() allocates its storage for every possible
 * cpu when CONFIG_CPUMASK_OFFSTACK is set.
 */
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7713
/*
 * Boot-time scheduler initialisation.
 *
 * Allocates the root task group's per-cpu pointer arrays, initialises the
 * default bandwidth settings and every possible cpu's runqueue, turns the
 * calling task into the idle thread of the current cpu and finally sets
 * scheduler_running.
 */
void __init sched_init(void)
{
        int i, j;
        unsigned long alloc_size = 0, ptr;

        /*
         * One allocation holds all per-cpu pointer arrays of the root task
         * group: two arrays of nr_cpu_ids pointers per enabled group
         * scheduling class.
         */
#ifdef CONFIG_FAIR_GROUP_SCHED
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
        if (alloc_size) {
                /* NOTE(review): the allocation result is not checked. */
                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

                /* Carve the buffer up; ptr advances past each array. */
#ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

                root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

#endif /* CONFIG_RT_GROUP_SCHED */
        }
#ifdef CONFIG_CPUMASK_OFFSTACK
        for_each_possible_cpu(i) {
                per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
        }
#endif /* CONFIG_CPUMASK_OFFSTACK */

        /* Both defaults are seeded from the global RT period and runtime. */
        init_rt_bandwidth(&def_rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
        init_dl_bandwidth(&def_dl_bandwidth,
                        global_rt_period(), global_rt_runtime());

#ifdef CONFIG_SMP
        init_defrootdomain();
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
        list_add(&root_task_group.list, &task_groups);
        INIT_LIST_HEAD(&root_task_group.children);
        INIT_LIST_HEAD(&root_task_group.siblings);
        autogroup_init(&init_task);

#endif /* CONFIG_CGROUP_SCHED */

        /* Initialise the runqueue of every possible cpu. */
        for_each_possible_cpu(i) {
                struct rq *rq;

                rq = cpu_rq(i);
                raw_spin_lock_init(&rq->lock);
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs);
                init_rt_rq(&rq->rt);
                init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                /*
                 * How much cpu bandwidth does root_task_group get?
                 *
                 * In case of task-groups formed thr' the cgroup filesystem, it
                 * gets 100% of the cpu resources in the system. This overall
                 * system cpu resource is divided among the tasks of
                 * root_task_group and its child task-groups in a fair manner,
                 * based on each entity's (task or task-group's) weight
                 * (se->load.weight).
                 *
                 * In other words, if root_task_group has 10 tasks (of weight
                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
                 * then A0's share of the cpu resource is:
                 *
                 *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
                 * We achieve this by letting root_task_group's tasks sit
                 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
                init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif

                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                        rq->cpu_load[j] = 0;

                rq->last_load_update_tick = jiffies;

#ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
                rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
                rq->balance_callback = NULL;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
                rq->push_task = NULL;
                rq->cpu = i;
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
#ifdef CONFIG_SCHED_WALT
                rq->cur_irqload = 0;
                rq->avg_irqload = 0;
                rq->irqload_ts = 0;
#endif

                INIT_LIST_HEAD(&rq->cfs_tasks);

                /* Every runqueue starts out in the default root domain. */
                rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
                rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
                rq->last_sched_tick = 0;
#endif
#endif
                init_rq_hrtick(rq);
                atomic_set(&rq->nr_iowait, 0);
        }

        set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

        /*
         * The boot idle thread does lazy MMU switching as well:
         */
        atomic_inc(&init_mm.mm_count);
        enter_lazy_tlb(&init_mm, current);

        /*
         * During early bootup we pretend to be a normal task:
         */
        current->sched_class = &fair_sched_class;

        /*
         * Make us the idle thread. Technically, schedule() should not be
         * called from this thread, however somewhere below it might be,
         * but because we are the idle thread, we just pick up running again
         * when this runqueue becomes "idle".
         */
        init_idle(current, smp_processor_id());

        calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
        zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
        /* May be allocated at isolcpus cmdline parse time */
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
        idle_thread_set_boot_cpu();
        set_cpu_rq_start_time();
#endif
        init_sched_fair_class();

        scheduler_running = 1;
}
7895
7896 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
 * Return non-zero if the current preempt count plus the RCU preempt depth
 * equals @preempt_offset.
 */
static inline int preempt_count_equals(int preempt_offset)
{
        return (preempt_count() + rcu_preempt_depth()) == preempt_offset;
}
7903
/*
 * Set once __might_sleep_init() has run.  ___might_sleep() reads it to
 * decide whether to report while system_state is still SYSTEM_BOOTING.
 */
static int __might_sleep_init_called;

/* Early initcall that records that it has run; always returns 0. */
int __init __might_sleep_init(void)
{
        __might_sleep_init_called = 1;
        return 0;
}
early_initcall(__might_sleep_init);
7911
/*
 * Warn once if the caller is not in TASK_RUNNING state (and a state change
 * location was recorded), then run the ___might_sleep() checks.
 *
 * @file and @line identify the call site reported in diagnostics;
 * @preempt_offset is passed on unchanged to ___might_sleep().
 */
void __might_sleep(const char *file, int line, int preempt_offset)
{
        /*
         * Blocking primitives will set (and therefore destroy) current->state,
         * since we will exit with TASK_RUNNING make sure we enter with it,
         * otherwise we will destroy state.
         */
        WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
                        "do not call blocking ops when !TASK_RUNNING; "
                        "state=%lx set at [<%p>] %pS\n",
                        current->state,
                        (void *)current->task_state_change,
                        (void *)current->task_state_change);

        ___might_sleep(file, line, preempt_offset);
}
EXPORT_SYMBOL(__might_sleep);
7929
/*
 * Report a "sleeping function called from invalid context" bug for the call
 * site @file:@line, unless one of the early-return conditions below holds.
 *
 * @preempt_offset is the value that preempt count plus RCU preempt depth is
 * expected to have at the call site.
 */
void ___might_sleep(const char *file, int line, int preempt_offset)
{
        static unsigned long prev_jiffy;        /* ratelimiting */

        rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
        /*
         * Nothing to report if the context is fine (expected preempt count,
         * irqs enabled, not the idle task) or an oops is in progress.
         */
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
             !is_idle_task(current)) || oops_in_progress)
                return;
        /*
         * Outside SYSTEM_RUNNING, only report during SYSTEM_BOOTING and only
         * once __might_sleep_init() has run.
         */
        if (system_state != SYSTEM_RUNNING &&
            (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
                return;
        /* Skip if the previous report was less than HZ jiffies ago. */
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
        prev_jiffy = jiffies;

        printk(KERN_ERR
                "BUG: sleeping function called from invalid context at %s:%d\n",
                        file, line);
        printk(KERN_ERR
                "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
                        in_atomic(), irqs_disabled(),
                        current->pid, current->comm);

        if (task_stack_end_corrupted(current))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
#ifdef CONFIG_DEBUG_PREEMPT
        if (!preempt_count_equals(preempt_offset)) {
                pr_err("Preemption disabled at:");
                print_ip_sym(current->preempt_disable_ip);
                pr_cont("\n");
        }
#endif
        dump_stack();
}
EXPORT_SYMBOL(___might_sleep);
7969 #endif
7970
7971 #ifdef CONFIG_MAGIC_SYSRQ
7972 void normalize_rt_tasks(void)
7973 {
7974         struct task_struct *g, *p;
7975         struct sched_attr attr = {
7976                 .sched_policy = SCHED_NORMAL,
7977         };
7978
7979         read_lock(&tasklist_lock);
7980         for_each_process_thread(g, p) {
7981                 /*
7982                  * Only normalize user tasks:
7983                  */
7984                 if (p->flags & PF_KTHREAD)
7985                         continue;
7986
7987                 p->se.exec_start                = 0;
7988 #ifdef CONFIG_SCHEDSTATS
7989                 p->se.statistics.wait_start     = 0;
7990                 p->se.statistics.sleep_start    = 0;
7991                 p->se.statistics.block_start    = 0;
7992 #endif
7993
7994                 if (!dl_task(p) && !rt_task(p)) {
7995                         /*
7996                          * Renice negative nice level userspace
7997                          * tasks back to 0:
7998                          */
7999                         if (task_nice(p) < 0)
8000                                 set_user_nice(p, 0);
8001                         continue;
8002                 }
8003
8004                 __sched_setscheduler(p, &attr, false, false);
8005         }
8006         read_unlock(&tasklist_lock);
8007 }
8008
8009 #endif /* CONFIG_MAGIC_SYSRQ */
8010
8011 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8012 /*
8013  * These functions are only useful for the IA64 MCA handling, or kdb.
8014  *
8015  * They can only be called when the whole system has been
8016  * stopped - every CPU needs to be quiescent, and no scheduling
8017  * activity can take place. Using them for anything else would
8018  * be a serious bug, and as a result, they aren't even visible
8019  * under any other configuration.
8020  */
8021
/**
 * curr_task - look up the task currently running on a cpu.
 * @cpu: the processor to query.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The task that cpu_curr() reports for @cpu.
 */
struct task_struct *curr_task(int cpu)
{
        struct task_struct *running = cpu_curr(cpu);

        return running;
}
8034
8035 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8036
8037 #ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. This function
 * must be called with all CPUs synchronized and interrupts disabled, and the
 * caller must save the original value of the current task (see curr_task()
 * above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
void set_curr_task(int cpu, struct task_struct *p)
{
        /* Plain store; no locking is done here (see the requirements above). */
        cpu_curr(cpu) = p;
}
8057
8058 #endif
8059
8060 #ifdef CONFIG_CGROUP_SCHED
/*
 * task_group_lock serializes the addition/removal of task groups: it is
 * held (IRQ-safe) around the task_groups list and sibling-list updates in
 * sched_online_group() and sched_offline_group().
 */
static DEFINE_SPINLOCK(task_group_lock);
8063
/*
 * Tear down a task group: free its fair and RT per-group state, call
 * autogroup_free(), then free the task_group structure itself.
 */
static void sched_free_group(struct task_group *tg)
{
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
        autogroup_free(tg);
        /* tg must not be dereferenced after this point. */
        kfree(tg);
}
8071
8072 /* allocate runqueue etc for a new task group */
8073 struct task_group *sched_create_group(struct task_group *parent)
8074 {
8075         struct task_group *tg;
8076
8077         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8078         if (!tg)
8079                 return ERR_PTR(-ENOMEM);
8080
8081         if (!alloc_fair_sched_group(tg, parent))
8082                 goto err;
8083
8084         if (!alloc_rt_sched_group(tg, parent))
8085                 goto err;
8086
8087         return tg;
8088
8089 err:
8090         sched_free_group(tg);
8091         return ERR_PTR(-ENOMEM);
8092 }
8093
8094 void sched_online_group(struct task_group *tg, struct task_group *parent)
8095 {
8096         unsigned long flags;
8097
8098         spin_lock_irqsave(&task_group_lock, flags);
8099         list_add_rcu(&tg->list, &task_groups);
8100
8101         WARN_ON(!parent); /* root should already exist */
8102
8103         tg->parent = parent;
8104         INIT_LIST_HEAD(&tg->children);
8105         list_add_rcu(&tg->siblings, &parent->children);
8106         spin_unlock_irqrestore(&task_group_lock, flags);
8107 }
8108
8109 /* rcu callback to free various structures associated with a task group */
8110 static void sched_free_group_rcu(struct rcu_head *rhp)
8111 {
8112         /* now it should be safe to free those cfs_rqs */
8113         sched_free_group(container_of(rhp, struct task_group, rcu));
8114 }
8115
/*
 * Destroy a task group. The group is not freed immediately; freeing is
 * deferred to sched_free_group_rcu() via call_rcu().
 */
void sched_destroy_group(struct task_group *tg)
{
        /* wait for possible concurrent references to cfs_rqs to complete */
        call_rcu(&tg->rcu, sched_free_group_rcu);
}
8121
8122 void sched_offline_group(struct task_group *tg)
8123 {
8124         unsigned long flags;
8125         int i;
8126
8127         /* end participation in shares distribution */
8128         for_each_possible_cpu(i)
8129                 unregister_fair_sched_group(tg, i);
8130
8131         spin_lock_irqsave(&task_group_lock, flags);
8132         list_del_rcu(&tg->list);
8133         list_del_rcu(&tg->siblings);
8134         spin_unlock_irqrestore(&task_group_lock, flags);
8135 }
8136
8137 static void sched_change_group(struct task_struct *tsk, int type)
8138 {
8139         struct task_group *tg;
8140
8141         /*
8142          * All callers are synchronized by task_rq_lock(); we do not use RCU
8143          * which is pointless here. Thus, we pass "true" to task_css_check()
8144          * to prevent lockdep warnings.
8145          */
8146         tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
8147                           struct task_group, css);
8148         tg = autogroup_task_group(tsk, tg);
8149         tsk->sched_task_group = tg;
8150
8151 #ifdef CONFIG_FAIR_GROUP_SCHED
8152         if (tsk->sched_class->task_change_group)
8153                 tsk->sched_class->task_change_group(tsk, type);
8154         else
8155 #endif
8156                 set_task_rq(tsk, task_cpu(tsk));
8157 }
8158
8159 /*
8160  * Change task's runqueue when it moves between groups.
8161  *
8162  * The caller of this function should have put the task in its new group by
8163  * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
8164  * its new group.
8165  */
8166 void sched_move_task(struct task_struct *tsk)
8167 {
8168         int queued, running;
8169         unsigned long flags;
8170         struct rq *rq;
8171
8172         rq = task_rq_lock(tsk, &flags);
8173
8174         running = task_current(rq, tsk);
8175         queued = task_on_rq_queued(tsk);
8176
8177         if (queued)
8178                 dequeue_task(rq, tsk, DEQUEUE_SAVE);
8179         if (unlikely(running))
8180                 put_prev_task(rq, tsk);
8181
8182         sched_change_group(tsk, TASK_MOVE_GROUP);
8183
8184         if (unlikely(running))
8185                 tsk->sched_class->set_curr_task(rq);
8186         if (queued)
8187                 enqueue_task(rq, tsk, ENQUEUE_RESTORE);
8188
8189         task_rq_unlock(rq, tsk, &flags);
8190 }
8191 #endif /* CONFIG_CGROUP_SCHED */
8192
8193 #ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 *
 * rt_constraints_mutex is held around the __rt_schedulable() checks in
 * tg_set_rt_bandwidth() and sched_rt_global_constraints().
 */
static DEFINE_MUTEX(rt_constraints_mutex);
8198
8199 /* Must be called with tasklist_lock held */
8200 static inline int tg_has_rt_tasks(struct task_group *tg)
8201 {
8202         struct task_struct *g, *p;
8203
8204         /*
8205          * Autogroups do not have RT tasks; see autogroup_create().
8206          */
8207         if (task_group_is_autogroup(tg))
8208                 return 0;
8209
8210         for_each_process_thread(g, p) {
8211                 if (rt_task(p) && task_group(p) == tg)
8212                         return 1;
8213         }
8214
8215         return 0;
8216 }
8217
/*
 * Argument block for the tg_rt_schedulable() tree walk: describes a
 * proposed (period, runtime) pair for one group.
 */
struct rt_schedulable_data {
        struct task_group *tg;  /* group the proposed values apply to (may be NULL) */
        u64 rt_period;          /* proposed period, substituted for @tg's stored value */
        u64 rt_runtime;         /* proposed runtime, substituted for @tg's stored value */
};
8223
8224 static int tg_rt_schedulable(struct task_group *tg, void *data)
8225 {
8226         struct rt_schedulable_data *d = data;
8227         struct task_group *child;
8228         unsigned long total, sum = 0;
8229         u64 period, runtime;
8230
8231         period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8232         runtime = tg->rt_bandwidth.rt_runtime;
8233
8234         if (tg == d->tg) {
8235                 period = d->rt_period;
8236                 runtime = d->rt_runtime;
8237         }
8238
8239         /*
8240          * Cannot have more runtime than the period.
8241          */
8242         if (runtime > period && runtime != RUNTIME_INF)
8243                 return -EINVAL;
8244
8245         /*
8246          * Ensure we don't starve existing RT tasks.
8247          */
8248         if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8249                 return -EBUSY;
8250
8251         total = to_ratio(period, runtime);
8252
8253         /*
8254          * Nobody can have more than the global setting allows.
8255          */
8256         if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8257                 return -EINVAL;
8258
8259         /*
8260          * The sum of our children's runtime should not exceed our own.
8261          */
8262         list_for_each_entry_rcu(child, &tg->children, siblings) {
8263                 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8264                 runtime = child->rt_bandwidth.rt_runtime;
8265
8266                 if (child == d->tg) {
8267                         period = d->rt_period;
8268                         runtime = d->rt_runtime;
8269                 }
8270
8271                 sum += to_ratio(period, runtime);
8272         }
8273
8274         if (sum > total)
8275                 return -EINVAL;
8276
8277         return 0;
8278 }
8279
8280 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8281 {
8282         int ret;
8283
8284         struct rt_schedulable_data data = {
8285                 .tg = tg,
8286                 .rt_period = period,
8287                 .rt_runtime = runtime,
8288         };
8289
8290         rcu_read_lock();
8291         ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8292         rcu_read_unlock();
8293
8294         return ret;
8295 }
8296
8297 static int tg_set_rt_bandwidth(struct task_group *tg,
8298                 u64 rt_period, u64 rt_runtime)
8299 {
8300         int i, err = 0;
8301
8302         /*
8303          * Disallowing the root group RT runtime is BAD, it would disallow the
8304          * kernel creating (and or operating) RT threads.
8305          */
8306         if (tg == &root_task_group && rt_runtime == 0)
8307                 return -EINVAL;
8308
8309         /* No period doesn't make any sense. */
8310         if (rt_period == 0)
8311                 return -EINVAL;
8312
8313         mutex_lock(&rt_constraints_mutex);
8314         read_lock(&tasklist_lock);
8315         err = __rt_schedulable(tg, rt_period, rt_runtime);
8316         if (err)
8317                 goto unlock;
8318
8319         raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8320         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8321         tg->rt_bandwidth.rt_runtime = rt_runtime;
8322
8323         for_each_possible_cpu(i) {
8324                 struct rt_rq *rt_rq = tg->rt_rq[i];
8325
8326                 raw_spin_lock(&rt_rq->rt_runtime_lock);
8327                 rt_rq->rt_runtime = rt_runtime;
8328                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8329         }
8330         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8331 unlock:
8332         read_unlock(&tasklist_lock);
8333         mutex_unlock(&rt_constraints_mutex);
8334
8335         return err;
8336 }
8337
8338 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8339 {
8340         u64 rt_runtime, rt_period;
8341
8342         rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8343         rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8344         if (rt_runtime_us < 0)
8345                 rt_runtime = RUNTIME_INF;
8346
8347         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8348 }
8349
8350 static long sched_group_rt_runtime(struct task_group *tg)
8351 {
8352         u64 rt_runtime_us;
8353
8354         if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8355                 return -1;
8356
8357         rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8358         do_div(rt_runtime_us, NSEC_PER_USEC);
8359         return rt_runtime_us;
8360 }
8361
8362 static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
8363 {
8364         u64 rt_runtime, rt_period;
8365
8366         rt_period = rt_period_us * NSEC_PER_USEC;
8367         rt_runtime = tg->rt_bandwidth.rt_runtime;
8368
8369         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8370 }
8371
8372 static long sched_group_rt_period(struct task_group *tg)
8373 {
8374         u64 rt_period_us;
8375
8376         rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8377         do_div(rt_period_us, NSEC_PER_USEC);
8378         return rt_period_us;
8379 }
8380 #endif /* CONFIG_RT_GROUP_SCHED */
8381
8382 #ifdef CONFIG_RT_GROUP_SCHED
8383 static int sched_rt_global_constraints(void)
8384 {
8385         int ret = 0;
8386
8387         mutex_lock(&rt_constraints_mutex);
8388         read_lock(&tasklist_lock);
8389         ret = __rt_schedulable(NULL, 0, 0);
8390         read_unlock(&tasklist_lock);
8391         mutex_unlock(&rt_constraints_mutex);
8392
8393         return ret;
8394 }
8395
8396 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8397 {
8398         /* Don't accept realtime tasks when there is no way for them to run */
8399         if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8400                 return 0;
8401
8402         return 1;
8403 }
8404
8405 #else /* !CONFIG_RT_GROUP_SCHED */
8406 static int sched_rt_global_constraints(void)
8407 {
8408         unsigned long flags;
8409         int i, ret = 0;
8410
8411         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8412         for_each_possible_cpu(i) {
8413                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8414
8415                 raw_spin_lock(&rt_rq->rt_runtime_lock);
8416                 rt_rq->rt_runtime = global_rt_runtime();
8417                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8418         }
8419         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8420
8421         return ret;
8422 }
8423 #endif /* CONFIG_RT_GROUP_SCHED */
8424
8425 static int sched_dl_global_validate(void)
8426 {
8427         u64 runtime = global_rt_runtime();
8428         u64 period = global_rt_period();
8429         u64 new_bw = to_ratio(period, runtime);
8430         struct dl_bw *dl_b;
8431         int cpu, ret = 0;
8432         unsigned long flags;
8433
8434         /*
8435          * Here we want to check the bandwidth not being set to some
8436          * value smaller than the currently allocated bandwidth in
8437          * any of the root_domains.
8438          *
8439          * FIXME: Cycling on all the CPUs is overdoing, but simpler than
8440          * cycling on root_domains... Discussion on different/better
8441          * solutions is welcome!
8442          */
8443         for_each_possible_cpu(cpu) {
8444                 rcu_read_lock_sched();
8445                 dl_b = dl_bw_of(cpu);
8446
8447                 raw_spin_lock_irqsave(&dl_b->lock, flags);
8448                 if (new_bw < dl_b->total_bw)
8449                         ret = -EBUSY;
8450                 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8451
8452                 rcu_read_unlock_sched();
8453
8454                 if (ret)
8455                         break;
8456         }
8457
8458         return ret;
8459 }
8460
8461 static void sched_dl_do_global(void)
8462 {
8463         u64 new_bw = -1;
8464         struct dl_bw *dl_b;
8465         int cpu;
8466         unsigned long flags;
8467
8468         def_dl_bandwidth.dl_period = global_rt_period();
8469         def_dl_bandwidth.dl_runtime = global_rt_runtime();
8470
8471         if (global_rt_runtime() != RUNTIME_INF)
8472                 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
8473
8474         /*
8475          * FIXME: As above...
8476          */
8477         for_each_possible_cpu(cpu) {
8478                 rcu_read_lock_sched();
8479                 dl_b = dl_bw_of(cpu);
8480
8481                 raw_spin_lock_irqsave(&dl_b->lock, flags);
8482                 dl_b->bw = new_bw;
8483                 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8484
8485                 rcu_read_unlock_sched();
8486         }
8487 }
8488
8489 static int sched_rt_global_validate(void)
8490 {
8491         if (sysctl_sched_rt_period <= 0)
8492                 return -EINVAL;
8493
8494         if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
8495                 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
8496                 return -EINVAL;
8497
8498         return 0;
8499 }
8500
/*
 * Copy the global RT runtime and period into def_rt_bandwidth. Called from
 * sched_rt_handler() once the new sysctl values have passed validation.
 */
static void sched_rt_do_global(void)
{
        def_rt_bandwidth.rt_runtime = global_rt_runtime();
        def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
8506
8507 int sched_rt_handler(struct ctl_table *table, int write,
8508                 void __user *buffer, size_t *lenp,
8509                 loff_t *ppos)
8510 {
8511         int old_period, old_runtime;
8512         static DEFINE_MUTEX(mutex);
8513         int ret;
8514
8515         mutex_lock(&mutex);
8516         old_period = sysctl_sched_rt_period;
8517         old_runtime = sysctl_sched_rt_runtime;
8518
8519         ret = proc_dointvec(table, write, buffer, lenp, ppos);
8520
8521         if (!ret && write) {
8522                 ret = sched_rt_global_validate();
8523                 if (ret)
8524                         goto undo;
8525
8526                 ret = sched_dl_global_validate();
8527                 if (ret)
8528                         goto undo;
8529
8530                 ret = sched_rt_global_constraints();
8531                 if (ret)
8532                         goto undo;
8533
8534                 sched_rt_do_global();
8535                 sched_dl_do_global();
8536         }
8537         if (0) {
8538 undo:
8539                 sysctl_sched_rt_period = old_period;
8540                 sysctl_sched_rt_runtime = old_runtime;
8541         }
8542         mutex_unlock(&mutex);
8543
8544         return ret;
8545 }
8546
8547 int sched_rr_handler(struct ctl_table *table, int write,
8548                 void __user *buffer, size_t *lenp,
8549                 loff_t *ppos)
8550 {
8551         int ret;
8552         static DEFINE_MUTEX(mutex);
8553
8554         mutex_lock(&mutex);
8555         ret = proc_dointvec(table, write, buffer, lenp, ppos);
8556         /* make sure that internally we keep jiffies */
8557         /* also, writing zero resets timeslice to default */
8558         if (!ret && write) {
8559                 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
8560                         RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
8561         }
8562         mutex_unlock(&mutex);
8563         return ret;
8564 }
8565
8566 #ifdef CONFIG_CGROUP_SCHED
8567
8568 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8569 {
8570         return css ? container_of(css, struct task_group, css) : NULL;
8571 }
8572
8573 static struct cgroup_subsys_state *
8574 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8575 {
8576         struct task_group *parent = css_tg(parent_css);
8577         struct task_group *tg;
8578
8579         if (!parent) {
8580                 /* This is early initialization for the top cgroup */
8581                 return &root_task_group.css;
8582         }
8583
8584         tg = sched_create_group(parent);
8585         if (IS_ERR(tg))
8586                 return ERR_PTR(-ENOMEM);
8587
8588         return &tg->css;
8589 }
8590
8591 /* Expose task group only after completing cgroup initialization */
8592 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
8593 {
8594         struct task_group *tg = css_tg(css);
8595         struct task_group *parent = css_tg(css->parent);
8596
8597         if (parent)
8598                 sched_online_group(tg, parent);
8599         return 0;
8600 }
8601
/* css_released callback: take the css's task group offline. */
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
        sched_offline_group(css_tg(css));
}
8608
/*
 * css_free callback: free the css's task group directly. This relies on
 * the RCU grace period between css_released() and this call.
 */
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
        sched_free_group(css_tg(css));
}
8618
8619 /*
8620  * This is called before wake_up_new_task(), therefore we really only
8621  * have to set its group bits, all the other stuff does not apply.
8622  */
8623 static void cpu_cgroup_fork(struct task_struct *task, void *private)
8624 {
8625         unsigned long flags;
8626         struct rq *rq;
8627
8628         rq = task_rq_lock(task, &flags);
8629
8630         update_rq_clock(rq);
8631         sched_change_group(task, TASK_SET_GROUP);
8632
8633         task_rq_unlock(rq, task, &flags);
8634 }
8635
8636 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8637 {
8638         struct task_struct *task;
8639         struct cgroup_subsys_state *css;
8640         int ret = 0;
8641
8642         cgroup_taskset_for_each(task, css, tset) {
8643 #ifdef CONFIG_RT_GROUP_SCHED
8644                 if (!sched_rt_can_attach(css_tg(css), task))
8645                         return -EINVAL;
8646 #else
8647                 /* We don't support RT-tasks being in separate groups */
8648                 if (task->sched_class != &fair_sched_class)
8649                         return -EINVAL;
8650 #endif
8651                 /*
8652                  * Serialize against wake_up_new_task() such that if its
8653                  * running, we're sure to observe its full state.
8654                  */
8655                 raw_spin_lock_irq(&task->pi_lock);
8656                 /*
8657                  * Avoid calling sched_move_task() before wake_up_new_task()
8658                  * has happened. This would lead to problems with PELT, due to
8659                  * move wanting to detach+attach while we're not attached yet.
8660                  */
8661                 if (task->state == TASK_NEW)
8662                         ret = -EINVAL;
8663                 raw_spin_unlock_irq(&task->pi_lock);
8664
8665                 if (ret)
8666                         break;
8667         }
8668         return ret;
8669 }
8670
8671 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8672 {
8673         struct task_struct *task;
8674         struct cgroup_subsys_state *css;
8675
8676         cgroup_taskset_for_each(task, css, tset)
8677                 sched_move_task(task);
8678 }
8679
8680 #ifdef CONFIG_FAIR_GROUP_SCHED
8681 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8682                                 struct cftype *cftype, u64 shareval)
8683 {
8684         return sched_group_set_shares(css_tg(css), scale_load(shareval));
8685 }
8686
8687 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8688                                struct cftype *cft)
8689 {
8690         struct task_group *tg = css_tg(css);
8691
8692         return (u64) scale_load_down(tg->shares);
8693 }
8694
8695 #ifdef CONFIG_CFS_BANDWIDTH
/* Held by tg_set_cfs_bandwidth() while it validates and applies new values. */
static DEFINE_MUTEX(cfs_constraints_mutex);

/* Bounds, in nanoseconds, that tg_set_cfs_bandwidth() enforces. */
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */

/* Defined further down; needed by tg_set_cfs_bandwidth(). */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8702
/*
 * Apply a new CFS bandwidth setting (@period and @quota, both in
 * nanoseconds) to @tg.
 *
 * The root group, values below min_cfs_quota_period and periods above
 * max_cfs_quota_period are rejected, as is anything __cfs_schedulable()
 * refuses. On success the group's cfs_bandwidth is updated under
 * cfs_b->lock, and the group's cfs_rq on every online CPU is updated (and
 * unthrottled if it was throttled) under that CPU's rq->lock.
 *
 * Return: 0 on success, -EINVAL for the range checks above, or the error
 * returned by __cfs_schedulable().
 */
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
        int i, ret = 0, runtime_enabled, runtime_was_enabled;
        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

        if (tg == &root_task_group)
                return -EINVAL;

        /*
         * Ensure we have at least some amount of bandwidth every period.
         * This is to prevent reaching a state of large arrears when
         * throttled via entity_tick() resulting in prolonged exit
         * starvation.
         *
         * NOTE(review): RUNTIME_INF (no limit) presumably passes this check
         * as a very large value - confirm against its definition.
         */
        if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
                return -EINVAL;

        /*
         * Likewise, bound things on the other side by preventing insane
         * quota periods.  This also allows us to normalize in computing
         * quota feasibility.
         */
        if (period > max_cfs_quota_period)
                return -EINVAL;

        /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
        get_online_cpus();
        mutex_lock(&cfs_constraints_mutex);
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
                goto out_unlock;

        runtime_enabled = quota != RUNTIME_INF;
        runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
        /*
         * If we need to toggle cfs_bandwidth_used, off->on must occur
         * before making related changes, and on->off must occur afterwards
         */
        if (runtime_enabled && !runtime_was_enabled)
                cfs_bandwidth_usage_inc();
        raw_spin_lock_irq(&cfs_b->lock);
        cfs_b->period = ns_to_ktime(period);
        cfs_b->quota = quota;

        __refill_cfs_bandwidth_runtime(cfs_b);
        /* restart the period timer (if active) to handle new period expiry */
        if (runtime_enabled)
                start_cfs_bandwidth(cfs_b);
        raw_spin_unlock_irq(&cfs_b->lock);

        /* Propagate the new state to the group's cfs_rq on each online CPU. */
        for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;

                raw_spin_lock_irq(&rq->lock);
                cfs_rq->runtime_enabled = runtime_enabled;
                cfs_rq->runtime_remaining = 0;

                if (cfs_rq->throttled)
                        unthrottle_cfs_rq(cfs_rq);
                raw_spin_unlock_irq(&rq->lock);
        }
        /* on->off: drop the usage count only after all changes are made. */
        if (runtime_was_enabled && !runtime_enabled)
                cfs_bandwidth_usage_dec();
out_unlock:
        mutex_unlock(&cfs_constraints_mutex);
        put_online_cpus();

        return ret;
}
8775
8776 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8777 {
8778         u64 quota, period;
8779
8780         period = ktime_to_ns(tg->cfs_bandwidth.period);
8781         if (cfs_quota_us < 0)
8782                 quota = RUNTIME_INF;
8783         else
8784                 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
8785
8786         return tg_set_cfs_bandwidth(tg, period, quota);
8787 }
8788
8789 long tg_get_cfs_quota(struct task_group *tg)
8790 {
8791         u64 quota_us;
8792
8793         if (tg->cfs_bandwidth.quota == RUNTIME_INF)
8794                 return -1;
8795
8796         quota_us = tg->cfs_bandwidth.quota;
8797         do_div(quota_us, NSEC_PER_USEC);
8798
8799         return quota_us;
8800 }
8801
8802 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8803 {
8804         u64 quota, period;
8805
8806         period = (u64)cfs_period_us * NSEC_PER_USEC;
8807         quota = tg->cfs_bandwidth.quota;
8808
8809         return tg_set_cfs_bandwidth(tg, period, quota);
8810 }
8811
8812 long tg_get_cfs_period(struct task_group *tg)
8813 {
8814         u64 cfs_period_us;
8815
8816         cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8817         do_div(cfs_period_us, NSEC_PER_USEC);
8818
8819         return cfs_period_us;
8820 }
8821
8822 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
8823                                   struct cftype *cft)
8824 {
8825         return tg_get_cfs_quota(css_tg(css));
8826 }
8827
8828 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
8829                                    struct cftype *cftype, s64 cfs_quota_us)
8830 {
8831         return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
8832 }
8833
8834 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
8835                                    struct cftype *cft)
8836 {
8837         return tg_get_cfs_period(css_tg(css));
8838 }
8839
8840 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
8841                                     struct cftype *cftype, u64 cfs_period_us)
8842 {
8843         return tg_set_cfs_period(css_tg(css), cfs_period_us);
8844 }
8845
/*
 * Argument block for the tg_cfs_schedulable_down() tree walk: a proposed
 * (period, quota) pair for one group.
 */
struct cfs_schedulable_data {
        struct task_group *tg;  /* group the proposed values apply to */
        u64 period, quota;      /* proposed values, used in place of @tg's stored ones */
};
8850
8851 /*
8852  * normalize group quota/period to be quota/max_period
8853  * note: units are usecs
8854  */
8855 static u64 normalize_cfs_quota(struct task_group *tg,
8856                                struct cfs_schedulable_data *d)
8857 {
8858         u64 quota, period;
8859
8860         if (tg == d->tg) {
8861                 period = d->period;
8862                 quota = d->quota;
8863         } else {
8864                 period = tg_get_cfs_period(tg);
8865                 quota = tg_get_cfs_quota(tg);
8866         }
8867
8868         /* note: these should typically be equivalent */
8869         if (quota == RUNTIME_INF || quota == -1)
8870                 return RUNTIME_INF;
8871
8872         return to_ratio(period, quota);
8873 }
8874
8875 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8876 {
8877         struct cfs_schedulable_data *d = data;
8878         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8879         s64 quota = 0, parent_quota = -1;
8880
8881         if (!tg->parent) {
8882                 quota = RUNTIME_INF;
8883         } else {
8884                 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8885
8886                 quota = normalize_cfs_quota(tg, d);
8887                 parent_quota = parent_b->hierarchical_quota;
8888
8889                 /*
8890                  * ensure max(child_quota) <= parent_quota, inherit when no
8891                  * limit is set
8892                  */
8893                 if (quota == RUNTIME_INF)
8894                         quota = parent_quota;
8895                 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8896                         return -EINVAL;
8897         }
8898         cfs_b->hierarchical_quota = quota;
8899
8900         return 0;
8901 }
8902
8903 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
8904 {
8905         int ret;
8906         struct cfs_schedulable_data data = {
8907                 .tg = tg,
8908                 .period = period,
8909                 .quota = quota,
8910         };
8911
8912         if (quota != RUNTIME_INF) {
8913                 do_div(data.period, NSEC_PER_USEC);
8914                 do_div(data.quota, NSEC_PER_USEC);
8915         }
8916
8917         rcu_read_lock();
8918         ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
8919         rcu_read_unlock();
8920
8921         return ret;
8922 }
8923
8924 static int cpu_stats_show(struct seq_file *sf, void *v)
8925 {
8926         struct task_group *tg = css_tg(seq_css(sf));
8927         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8928
8929         seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
8930         seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
8931         seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
8932
8933         return 0;
8934 }
8935 #endif /* CONFIG_CFS_BANDWIDTH */
8936 #endif /* CONFIG_FAIR_GROUP_SCHED */
8937
8938 #ifdef CONFIG_RT_GROUP_SCHED
8939 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
8940                                 struct cftype *cft, s64 val)
8941 {
8942         return sched_group_set_rt_runtime(css_tg(css), val);
8943 }
8944
8945 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
8946                                struct cftype *cft)
8947 {
8948         return sched_group_rt_runtime(css_tg(css));
8949 }
8950
8951 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
8952                                     struct cftype *cftype, u64 rt_period_us)
8953 {
8954         return sched_group_set_rt_period(css_tg(css), rt_period_us);
8955 }
8956
8957 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
8958                                    struct cftype *cft)
8959 {
8960         return sched_group_rt_period(css_tg(css));
8961 }
8962 #endif /* CONFIG_RT_GROUP_SCHED */
8963
8964 static struct cftype cpu_files[] = {
8965 #ifdef CONFIG_FAIR_GROUP_SCHED
8966         {
8967                 .name = "shares",
8968                 .read_u64 = cpu_shares_read_u64,
8969                 .write_u64 = cpu_shares_write_u64,
8970         },
8971 #endif
8972 #ifdef CONFIG_CFS_BANDWIDTH
8973         {
8974                 .name = "cfs_quota_us",
8975                 .read_s64 = cpu_cfs_quota_read_s64,
8976                 .write_s64 = cpu_cfs_quota_write_s64,
8977         },
8978         {
8979                 .name = "cfs_period_us",
8980                 .read_u64 = cpu_cfs_period_read_u64,
8981                 .write_u64 = cpu_cfs_period_write_u64,
8982         },
8983         {
8984                 .name = "stat",
8985                 .seq_show = cpu_stats_show,
8986         },
8987 #endif
8988 #ifdef CONFIG_RT_GROUP_SCHED
8989         {
8990                 .name = "rt_runtime_us",
8991                 .read_s64 = cpu_rt_runtime_read,
8992                 .write_s64 = cpu_rt_runtime_write,
8993         },
8994         {
8995                 .name = "rt_period_us",
8996                 .read_u64 = cpu_rt_period_read_uint,
8997                 .write_u64 = cpu_rt_period_write_uint,
8998         },
8999 #endif
9000         { }     /* terminate */
9001 };
9002
9003 struct cgroup_subsys cpu_cgrp_subsys = {
9004         .css_alloc      = cpu_cgroup_css_alloc,
9005         .css_online     = cpu_cgroup_css_online,
9006         .css_released   = cpu_cgroup_css_released,
9007         .css_free       = cpu_cgroup_css_free,
9008         .fork           = cpu_cgroup_fork,
9009         .can_attach     = cpu_cgroup_can_attach,
9010         .attach         = cpu_cgroup_attach,
9011         .legacy_cftypes = cpu_files,
9012         .early_init     = 1,
9013 };
9014
9015 #endif  /* CONFIG_CGROUP_SCHED */
9016
/*
 * Log a "Task dump for CPU <cpu>:" header at info level, then show the task
 * that cpu_curr() reports for @cpu via sched_show_task().
 */
void dump_cpu_task(int cpu)
{
        struct task_struct *curr;

        pr_info("Task dump for CPU %d:\n", cpu);

        /* Read the current task only after the header has been printed. */
        curr = cpu_curr(cpu);
        sched_show_task(curr);
}