kernel/sched/rt.c

   1 /*
   2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3  * policies)
   4  */
   5
   6 #include "sched.h"
   7
   8 #include <linux/interrupt.h>
   9 #include <linux/slab.h>
  10 #include <linux/irq_work.h>
  11 #include <trace/events/sched.h>
  12 #include <linux/hrtimer.h>
  13
  14 #include "tune.h"
  15
  16 int sched_rr_timeslice = RR_TIMESLICE;
  17
  18 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  19
  20 struct rt_bandwidth def_rt_bandwidth;
  21
  22 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  23 {
  24         struct rt_bandwidth *rt_b =
  25                 container_of(timer, struct rt_bandwidth, rt_period_timer);
  26         int idle = 0;
  27         int overrun;
  28
  29         raw_spin_lock(&rt_b->rt_runtime_lock);
  30         for (;;) {
  31                 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  32                 if (!overrun)
  33                         break;
  34
  35                 raw_spin_unlock(&rt_b->rt_runtime_lock);
  36                 idle = do_sched_rt_period_timer(rt_b, overrun);
  37                 raw_spin_lock(&rt_b->rt_runtime_lock);
  38         }
  39         if (idle)
  40                 rt_b->rt_period_active = 0;
  41         raw_spin_unlock(&rt_b->rt_runtime_lock);
  42
  43         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  44 }
  45
  46 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  47 {
  48         rt_b->rt_period = ns_to_ktime(period);
  49         rt_b->rt_runtime = runtime;
  50
  51         raw_spin_lock_init(&rt_b->rt_runtime_lock);
  52
  53         hrtimer_init(&rt_b->rt_period_timer,
  54                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  55         rt_b->rt_period_timer.function = sched_rt_period_timer;
  56 }
  57
  58 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  59 {
  60         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  61                 return;
  62
  63         raw_spin_lock(&rt_b->rt_runtime_lock);
  64         if (!rt_b->rt_period_active) {
  65                 rt_b->rt_period_active = 1;
  66                 hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
  67                 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  68         }
  69         raw_spin_unlock(&rt_b->rt_runtime_lock);
  70 }
  71
  72 void init_rt_rq(struct rt_rq *rt_rq)
  73 {
  74         struct rt_prio_array *array;
  75         int i;
  76
  77         array = &rt_rq->active;
  78         for (i = 0; i < MAX_RT_PRIO; i++) {
  79                 INIT_LIST_HEAD(array->queue + i);
  80                 __clear_bit(i, array->bitmap);
  81         }
  82         /* delimiter for bitsearch: */
  83         __set_bit(MAX_RT_PRIO, array->bitmap);
  84
  85 #if defined CONFIG_SMP
  86         rt_rq->highest_prio.curr = MAX_RT_PRIO;
  87         rt_rq->highest_prio.next = MAX_RT_PRIO;
  88         rt_rq->rt_nr_migratory = 0;
  89         rt_rq->overloaded = 0;
  90         plist_head_init(&rt_rq->pushable_tasks);
  91 #endif /* CONFIG_SMP */
  92         /* We start is dequeued state, because no RT tasks are queued */
  93         rt_rq->rt_queued = 0;
  94
  95         rt_rq->rt_time = 0;
  96         rt_rq->rt_throttled = 0;
  97         rt_rq->rt_runtime = 0;
  98         raw_spin_lock_init(&rt_rq->rt_runtime_lock);
  99 }
 100
 101 #ifdef CONFIG_RT_GROUP_SCHED
 102 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 103 {
 104         hrtimer_cancel(&rt_b->rt_period_timer);
 105 }
 106
 107 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 108
 109 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 110 {
 111 #ifdef CONFIG_SCHED_DEBUG
 112         WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 113 #endif
 114         return container_of(rt_se, struct task_struct, rt);
 115 }
 116
 117 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 118 {
 119         return rt_rq->rq;
 120 }
 121
 122 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 123 {
 124         return rt_se->rt_rq;
 125 }
 126
 127 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 128 {
 129         struct rt_rq *rt_rq = rt_se->rt_rq;
 130
 131         return rt_rq->rq;
 132 }
 133
 134 void free_rt_sched_group(struct task_group *tg)
 135 {
 136         int i;
 137
 138         if (tg->rt_se)
 139                 destroy_rt_bandwidth(&tg->rt_bandwidth);
 140
 141         for_each_possible_cpu(i) {
 142                 if (tg->rt_rq)
 143                         kfree(tg->rt_rq[i]);
 144                 if (tg->rt_se)
 145                         kfree(tg->rt_se[i]);
 146         }
 147
 148         kfree(tg->rt_rq);
 149         kfree(tg->rt_se);
 150 }
 151
 152 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 153                 struct sched_rt_entity *rt_se, int cpu,
 154                 struct sched_rt_entity *parent)
 155 {
 156         struct rq *rq = cpu_rq(cpu);
 157
 158         rt_rq->highest_prio.curr = MAX_RT_PRIO;
 159         rt_rq->rt_nr_boosted = 0;
 160         rt_rq->rq = rq;
 161         rt_rq->tg = tg;
 162
 163         tg->rt_rq[cpu] = rt_rq;
 164         tg->rt_se[cpu] = rt_se;
 165
 166         if (!rt_se)
 167                 return;
 168
 169         if (!parent)
 170                 rt_se->rt_rq = &rq->rt;
 171         else
 172                 rt_se->rt_rq = parent->my_q;
 173
 174         rt_se->my_q = rt_rq;
 175         rt_se->parent = parent;
 176         INIT_LIST_HEAD(&rt_se->run_list);
 177 }
 178
 179 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 180 {
 181         struct rt_rq *rt_rq;
 182         struct sched_rt_entity *rt_se;
 183         int i;
 184
 185         tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
 186         if (!tg->rt_rq)
 187                 goto err;
 188         tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
 189         if (!tg->rt_se)
 190                 goto err;
 191
 192         init_rt_bandwidth(&tg->rt_bandwidth,
 193                         ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 194
 195         for_each_possible_cpu(i) {
 196                 rt_rq = kzalloc_node(sizeof(struct rt_rq),
 197                                      GFP_KERNEL, cpu_to_node(i));
 198                 if (!rt_rq)
 199                         goto err;
 200
 201                 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 202                                      GFP_KERNEL, cpu_to_node(i));
 203                 if (!rt_se)
 204                         goto err_free_rq;
 205
 206                 init_rt_rq(rt_rq);
 207                 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 208                 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 209         }
 210
 211         return 1;
 212
 213 err_free_rq:
 214         kfree(rt_rq);
 215 err:
 216         return 0;
 217 }
 218
 219 #else /* CONFIG_RT_GROUP_SCHED */
 220
 221 #define rt_entity_is_task(rt_se) (1)
 222
 223 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 224 {
 225         return container_of(rt_se, struct task_struct, rt);
 226 }
 227
 228 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 229 {
 230         return container_of(rt_rq, struct rq, rt);
 231 }
 232
 233 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 234 {
 235         struct task_struct *p = rt_task_of(rt_se);
 236
 237         return task_rq(p);
 238 }
 239
 240 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 241 {
 242         struct rq *rq = rq_of_rt_se(rt_se);
 243
 244         return &rq->rt;
 245 }
 246
 247 void free_rt_sched_group(struct task_group *tg) { }
 248
 249 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 250 {
 251         return 1;
 252 }
 253 #endif /* CONFIG_RT_GROUP_SCHED */
 254
 255 #ifdef CONFIG_SMP
 256
 257 static void pull_rt_task(struct rq *this_rq);
 258
 259 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 260 {
 261         /*
 262          * Try to pull RT tasks here if we lower this rq's prio and cpu is not
 263          * isolated
 264          */
 265         return rq->rt.highest_prio.curr > prev->prio &&
 266                !cpu_isolated(cpu_of(rq));
 267 }
 268
 269 static inline int rt_overloaded(struct rq *rq)
 270 {
 271         return atomic_read(&rq->rd->rto_count);
 272 }
 273
 274 static inline void rt_set_overload(struct rq *rq)
 275 {
 276         if (!rq->online)
 277                 return;
 278
 279         cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 280         /*
 281          * Make sure the mask is visible before we set
 282          * the overload count. That is checked to determine
 283          * if we should look at the mask. It would be a shame
 284          * if we looked at the mask, but the mask was not
 285          * updated yet.
 286          *
 287          * Matched by the barrier in pull_rt_task().
 288          */
 289         smp_wmb();
 290         atomic_inc(&rq->rd->rto_count);
 291 }
 292
 293 static inline void rt_clear_overload(struct rq *rq)
 294 {
 295         if (!rq->online)
 296                 return;
 297
 298         /* the order here really doesn't matter */
 299         atomic_dec(&rq->rd->rto_count);
 300         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 301 }
 302
 303 static void update_rt_migration(struct rt_rq *rt_rq)
 304 {
 305         if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 306                 if (!rt_rq->overloaded) {
 307                         rt_set_overload(rq_of_rt_rq(rt_rq));
 308                         rt_rq->overloaded = 1;
 309                 }
 310         } else if (rt_rq->overloaded) {
 311                 rt_clear_overload(rq_of_rt_rq(rt_rq));
 312                 rt_rq->overloaded = 0;
 313         }
 314 }
 315
 316 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 317 {
 318         struct task_struct *p;
 319
 320         if (!rt_entity_is_task(rt_se))
 321                 return;
 322
 323         p = rt_task_of(rt_se);
 324         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 325
 326         rt_rq->rt_nr_total++;
 327         if (p->nr_cpus_allowed > 1)
 328                 rt_rq->rt_nr_migratory++;
 329
 330         update_rt_migration(rt_rq);
 331 }
 332
 333 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 334 {
 335         struct task_struct *p;
 336
 337         if (!rt_entity_is_task(rt_se))
 338                 return;
 339
 340         p = rt_task_of(rt_se);
 341         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 342
 343         rt_rq->rt_nr_total--;
 344         if (p->nr_cpus_allowed > 1)
 345                 rt_rq->rt_nr_migratory--;
 346
 347         update_rt_migration(rt_rq);
 348 }
 349
 350 static inline int has_pushable_tasks(struct rq *rq)
 351 {
 352         return !plist_head_empty(&rq->rt.pushable_tasks);
 353 }
 354
 355 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
 356 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 357
 358 static void push_rt_tasks(struct rq *);
 359 static void pull_rt_task(struct rq *);
 360
 361 static inline void queue_push_tasks(struct rq *rq)
 362 {
 363         if (!has_pushable_tasks(rq))
 364                 return;
 365
 366         queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 367 }
 368
 369 static inline void queue_pull_task(struct rq *rq)
 370 {
 371         queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 372 }
 373
 374 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 375 {
 376         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 377         plist_node_init(&p->pushable_tasks, p->prio);
 378         plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 379
 380         /* Update the highest prio pushable task */
 381         if (p->prio < rq->rt.highest_prio.next)
 382                 rq->rt.highest_prio.next = p->prio;
 383 }
 384
 385 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 386 {
 387         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 388
 389         /* Update the new highest prio pushable task */
 390         if (has_pushable_tasks(rq)) {
 391                 p = plist_first_entry(&rq->rt.pushable_tasks,
 392                                       struct task_struct, pushable_tasks);
 393                 rq->rt.highest_prio.next = p->prio;
 394         } else
 395                 rq->rt.highest_prio.next = MAX_RT_PRIO;
 396 }
 397
 398 #else
 399
 400 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 401 {
 402 }
 403
 404 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 405 {
 406 }
 407
 408 static inline
 409 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 410 {
 411 }
 412
 413 static inline
 414 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 415 {
 416 }
 417
 418 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 419 {
 420         return false;
 421 }
 422
 423 static inline void pull_rt_task(struct rq *this_rq)
 424 {
 425 }
 426
 427 static inline void queue_push_tasks(struct rq *rq)
 428 {
 429 }
 430 #endif /* CONFIG_SMP */
 431
 432 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 433 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 434
 435 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 436 {
 437         return rt_se->on_rq;
 438 }
 439
 440 #ifdef CONFIG_RT_GROUP_SCHED
 441
 442 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 443 {
 444         if (!rt_rq->tg)
 445                 return RUNTIME_INF;
 446
 447         return rt_rq->rt_runtime;
 448 }
 449
 450 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 451 {
 452         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 453 }
 454
 455 typedef struct task_group *rt_rq_iter_t;
 456
 457 static inline struct task_group *next_task_group(struct task_group *tg)
 458 {
 459         do {
 460                 tg = list_entry_rcu(tg->list.next,
 461                         typeof(struct task_group), list);
 462         } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 463
 464         if (&tg->list == &task_groups)
 465                 tg = NULL;
 466
 467         return tg;
 468 }
 469
 470 #define for_each_rt_rq(rt_rq, iter, rq)                                 \
 471         for (iter = container_of(&task_groups, typeof(*iter), list);    \
 472                 (iter = next_task_group(iter)) &&                       \
 473                 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 474
 475 #define for_each_sched_rt_entity(rt_se) \
 476         for (; rt_se; rt_se = rt_se->parent)
 477
 478 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 479 {
 480         return rt_se->my_q;
 481 }
 482
 483 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 484 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 485
 486 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 487 {
 488         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 489         struct rq *rq = rq_of_rt_rq(rt_rq);
 490         struct sched_rt_entity *rt_se;
 491
 492         int cpu = cpu_of(rq);
 493
 494         rt_se = rt_rq->tg->rt_se[cpu];
 495
 496         if (rt_rq->rt_nr_running) {
 497                 if (!rt_se)
 498                         enqueue_top_rt_rq(rt_rq);
 499                 else if (!on_rt_rq(rt_se))
 500                         enqueue_rt_entity(rt_se, 0);
 501
 502                 if (rt_rq->highest_prio.curr < curr->prio)
 503                         resched_curr(rq);
 504         }
 505 }
 506
 507 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 508 {
 509         struct sched_rt_entity *rt_se;
 510         int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 511
 512         rt_se = rt_rq->tg->rt_se[cpu];
 513
 514         if (!rt_se)
 515                 dequeue_top_rt_rq(rt_rq);
 516         else if (on_rt_rq(rt_se))
 517                 dequeue_rt_entity(rt_se, 0);
 518 }
 519
 520 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 521 {
 522         return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 523 }
 524
 525 static int rt_se_boosted(struct sched_rt_entity *rt_se)
 526 {
 527         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 528         struct task_struct *p;
 529
 530         if (rt_rq)
 531                 return !!rt_rq->rt_nr_boosted;
 532
 533         p = rt_task_of(rt_se);
 534         return p->prio != p->normal_prio;
 535 }
 536
 537 #ifdef CONFIG_SMP
 538 static inline const struct cpumask *sched_rt_period_mask(void)
 539 {
 540         return this_rq()->rd->span;
 541 }
 542 #else
 543 static inline const struct cpumask *sched_rt_period_mask(void)
 544 {
 545         return cpu_online_mask;
 546 }
 547 #endif
 548
 549 static inline
 550 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 551 {
 552         return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 553 }
 554
 555 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 556 {
 557         return &rt_rq->tg->rt_bandwidth;
 558 }
 559
 560 #else /* !CONFIG_RT_GROUP_SCHED */
 561
 562 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 563 {
 564         return rt_rq->rt_runtime;
 565 }
 566
 567 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 568 {
 569         return ktime_to_ns(def_rt_bandwidth.rt_period);
 570 }
 571
 572 typedef struct rt_rq *rt_rq_iter_t;
 573
 574 #define for_each_rt_rq(rt_rq, iter, rq) \
 575         for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 576
 577 #define for_each_sched_rt_entity(rt_se) \
 578         for (; rt_se; rt_se = NULL)
 579
 580 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 581 {
 582         return NULL;
 583 }
 584
 585 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 586 {
 587         struct rq *rq = rq_of_rt_rq(rt_rq);
 588
 589         if (!rt_rq->rt_nr_running)
 590                 return;
 591
 592         enqueue_top_rt_rq(rt_rq);
 593         resched_curr(rq);
 594 }
 595
 596 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 597 {
 598         dequeue_top_rt_rq(rt_rq);
 599 }
 600
 601 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 602 {
 603         return rt_rq->rt_throttled;
 604 }
 605
 606 static inline const struct cpumask *sched_rt_period_mask(void)
 607 {
 608         return cpu_online_mask;
 609 }
 610
 611 static inline
 612 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 613 {
 614         return &cpu_rq(cpu)->rt;
 615 }
 616
 617 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 618 {
 619         return &def_rt_bandwidth;
 620 }
 621
 622 #endif /* CONFIG_RT_GROUP_SCHED */
 623
 624 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 625 {
 626         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 627
 628         return (hrtimer_active(&rt_b->rt_period_timer) ||
 629                 rt_rq->rt_time < rt_b->rt_runtime);
 630 }
 631
 632 #ifdef CONFIG_SMP
 633 /*
 634  * We ran out of runtime, see if we can borrow some from our neighbours.
 635  */
 636 static void do_balance_runtime(struct rt_rq *rt_rq)
 637 {
 638         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 639         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 640         int i, weight;
 641         u64 rt_period;
 642
 643         weight = cpumask_weight(rd->span);
 644
 645         raw_spin_lock(&rt_b->rt_runtime_lock);
 646         rt_period = ktime_to_ns(rt_b->rt_period);
 647         for_each_cpu(i, rd->span) {
 648                 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 649                 s64 diff;
 650
 651                 if (iter == rt_rq)
 652                         continue;
 653
 654                 raw_spin_lock(&iter->rt_runtime_lock);
 655                 /*
 656                  * Either all rqs have inf runtime and there's nothing to steal
 657                  * or __disable_runtime() below sets a specific rq to inf to
 658                  * indicate its been disabled and disalow stealing.
 659                  */
 660                 if (iter->rt_runtime == RUNTIME_INF)
 661                         goto next;
 662
 663                 /*
 664                  * From runqueues with spare time, take 1/n part of their
 665                  * spare time, but no more than our period.
 666                  */
 667                 diff = iter->rt_runtime - iter->rt_time;
 668                 if (diff > 0) {
 669                         diff = div_u64((u64)diff, weight);
 670                         if (rt_rq->rt_runtime + diff > rt_period)
 671                                 diff = rt_period - rt_rq->rt_runtime;
 672                         iter->rt_runtime -= diff;
 673                         rt_rq->rt_runtime += diff;
 674                         if (rt_rq->rt_runtime == rt_period) {
 675                                 raw_spin_unlock(&iter->rt_runtime_lock);
 676                                 break;
 677                         }
 678                 }
 679 next:
 680                 raw_spin_unlock(&iter->rt_runtime_lock);
 681         }
 682         raw_spin_unlock(&rt_b->rt_runtime_lock);
 683 }
 684
 685 /*
 686  * Ensure this RQ takes back all the runtime it lend to its neighbours.
 687  */
 688 static void __disable_runtime(struct rq *rq)
 689 {
 690         struct root_domain *rd = rq->rd;
 691         rt_rq_iter_t iter;
 692         struct rt_rq *rt_rq;
 693
 694         if (unlikely(!scheduler_running))
 695                 return;
 696
 697         for_each_rt_rq(rt_rq, iter, rq) {
 698                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 699                 s64 want;
 700                 int i;
 701
 702                 raw_spin_lock(&rt_b->rt_runtime_lock);
 703                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 704                 /*
 705                  * Either we're all inf and nobody needs to borrow, or we're
 706                  * already disabled and thus have nothing to do, or we have
 707                  * exactly the right amount of runtime to take out.
 708                  */
 709                 if (rt_rq->rt_runtime == RUNTIME_INF ||
 710                                 rt_rq->rt_runtime == rt_b->rt_runtime)
 711                         goto balanced;
 712                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 713
 714                 /*
 715                  * Calculate the difference between what we started out with
 716                  * and what we current have, that's the amount of runtime
 717                  * we lend and now have to reclaim.
 718                  */
 719                 want = rt_b->rt_runtime - rt_rq->rt_runtime;
 720
 721                 /*
 722                  * Greedy reclaim, take back as much as we can.
 723                  */
 724                 for_each_cpu(i, rd->span) {
 725                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 726                         s64 diff;
 727
 728                         /*
 729                          * Can't reclaim from ourselves or disabled runqueues.
 730                          */
 731                         if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 732                                 continue;
 733
 734                         raw_spin_lock(&iter->rt_runtime_lock);
 735                         if (want > 0) {
 736                                 diff = min_t(s64, iter->rt_runtime, want);
 737                                 iter->rt_runtime -= diff;
 738                                 want -= diff;
 739                         } else {
 740                                 iter->rt_runtime -= want;
 741                                 want -= want;
 742                         }
 743                         raw_spin_unlock(&iter->rt_runtime_lock);
 744
 745                         if (!want)
 746                                 break;
 747                 }
 748
 749                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 750                 /*
 751                  * We cannot be left wanting - that would mean some runtime
 752                  * leaked out of the system.
 753                  */
 754                 BUG_ON(want);
 755 balanced:
 756                 /*
 757                  * Disable all the borrow logic by pretending we have inf
 758                  * runtime - in which case borrowing doesn't make sense.
 759                  */
 760                 rt_rq->rt_runtime = RUNTIME_INF;
 761                 rt_rq->rt_throttled = 0;
 762                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 763                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 764
 765                 /* Make rt_rq available for pick_next_task() */
 766                 sched_rt_rq_enqueue(rt_rq);
 767         }
 768 }
 769
 770 static void __enable_runtime(struct rq *rq)
 771 {
 772         rt_rq_iter_t iter;
 773         struct rt_rq *rt_rq;
 774
 775         if (unlikely(!scheduler_running))
 776                 return;
 777
 778         /*
 779          * Reset each runqueue's bandwidth settings
 780          */
 781         for_each_rt_rq(rt_rq, iter, rq) {
 782                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 783
 784                 raw_spin_lock(&rt_b->rt_runtime_lock);
 785                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 786                 rt_rq->rt_runtime = rt_b->rt_runtime;
 787                 rt_rq->rt_time = 0;
 788                 rt_rq->rt_throttled = 0;
 789                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 790                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 791         }
 792 }
 793
 794 static void balance_runtime(struct rt_rq *rt_rq)
 795 {
 796         if (!sched_feat(RT_RUNTIME_SHARE))
 797                 return;
 798
 799         if (rt_rq->rt_time > rt_rq->rt_runtime) {
 800                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 801                 do_balance_runtime(rt_rq);
 802                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 803         }
 804 }
 805 #else /* !CONFIG_SMP */
 806 static inline void balance_runtime(struct rt_rq *rt_rq) {}
 807 #endif /* CONFIG_SMP */
 808
 809 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 810 {
 811         int i, idle = 1, throttled = 0;
 812         const struct cpumask *span;
 813
 814         span = sched_rt_period_mask();
 815 #ifdef CONFIG_RT_GROUP_SCHED
 816         /*
 817          * FIXME: isolated CPUs should really leave the root task group,
 818          * whether they are isolcpus or were isolated via cpusets, lest
 819          * the timer run on a CPU which does not service all runqueues,
 820          * potentially leaving other CPUs indefinitely throttled.  If
 821          * isolation is really required, the user will turn the throttle
 822          * off to kill the perturbations it causes anyway.  Meanwhile,
 823          * this maintains functionality for boot and/or troubleshooting.
 824          */
 825         if (rt_b == &root_task_group.rt_bandwidth)
 826                 span = cpu_online_mask;
 827 #endif
 828         for_each_cpu(i, span) {
 829                 int enqueue = 0;
 830                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 831                 struct rq *rq = rq_of_rt_rq(rt_rq);
 832
 833                 raw_spin_lock(&rq->lock);
 834                 if (rt_rq->rt_time) {
 835                         u64 runtime;
 836
 837                         raw_spin_lock(&rt_rq->rt_runtime_lock);
 838                         if (rt_rq->rt_throttled)
 839                                 balance_runtime(rt_rq);
 840                         runtime = rt_rq->rt_runtime;
 841                         rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 842                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 843                                 rt_rq->rt_throttled = 0;
 844                                 enqueue = 1;
 845
 846                                 /*
 847                                  * When we're idle and a woken (rt) task is
 848                                  * throttled check_preempt_curr() will set
 849                                  * skip_update and the time between the wakeup
 850                                  * and this unthrottle will get accounted as
 851                                  * 'runtime'.
 852                                  */
 853                                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 854                                         rq_clock_skip_update(rq, false);
 855                         }
 856                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
 857                                 idle = 0;
 858                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
 859                 } else if (rt_rq->rt_nr_running) {
 860                         idle = 0;
 861                         if (!rt_rq_throttled(rt_rq))
 862                                 enqueue = 1;
 863                 }
 864                 if (rt_rq->rt_throttled)
 865                         throttled = 1;
 866
 867                 if (enqueue)
 868                         sched_rt_rq_enqueue(rt_rq);
 869                 raw_spin_unlock(&rq->lock);
 870         }
 871
 872         if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 873                 return 1;
 874
 875         return idle;
 876 }
 877
 878 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 879 {
 880 #ifdef CONFIG_RT_GROUP_SCHED
 881         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 882
 883         if (rt_rq)
 884                 return rt_rq->highest_prio.curr;
 885 #endif
 886
 887         return rt_task_of(rt_se)->prio;
 888 }
 889
 890 static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
 891 {
 892         struct rt_prio_array *array = &rt_rq->active;
 893         struct sched_rt_entity *rt_se;
 894         char buf[500];
 895         char *pos = buf;
 896         char *end = buf + sizeof(buf);
 897         int idx;
 898
 899         pos += snprintf(pos, sizeof(buf),
 900                 "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
 901                 rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
 902
 903         if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
 904                 goto out;
 905
 906         pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
 907         idx = sched_find_first_bit(array->bitmap);
 908         while (idx < MAX_RT_PRIO) {
 909                 list_for_each_entry(rt_se, array->queue + idx, run_list) {
 910                         struct task_struct *p;
 911
 912                         if (!rt_entity_is_task(rt_se))
 913                                 continue;
 914
 915                         p = rt_task_of(rt_se);
 916                         if (pos < end)
 917                                 pos += snprintf(pos, end - pos, "\t%s (%d)\n",
 918                                         p->comm, p->pid);
 919                 }
 920                 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
 921         }
 922 out:
 923 #ifdef CONFIG_PANIC_ON_RT_THROTTLING
 924         /*
 925          * Use pr_err() in the BUG() case since printk_sched() will
 926          * not get flushed and deadlock is not a concern.
 927          */
 928         pr_err("%s", buf);
 929         BUG();
 930 #else
 931         printk_deferred("%s", buf);
 932 #endif
 933 }
 934
 935 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 936 {
 937         u64 runtime = sched_rt_runtime(rt_rq);
 938
 939         if (rt_rq->rt_throttled)
 940                 return rt_rq_throttled(rt_rq);
 941
 942         if (runtime >= sched_rt_period(rt_rq))
 943                 return 0;
 944
 945         balance_runtime(rt_rq);
 946         runtime = sched_rt_runtime(rt_rq);
 947         if (runtime == RUNTIME_INF)
 948                 return 0;
 949
 950         if (rt_rq->rt_time > runtime) {
 951                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 952
 953                 /*
 954                  * Don't actually throttle groups that have no runtime assigned
 955                  * but accrue some time due to boosting.
 956                  */
 957                 if (likely(rt_b->rt_runtime)) {
 958                         static bool once = false;
 959
 960                         rt_rq->rt_throttled = 1;
 961
 962                         if (!once) {
 963                                 once = true;
 964                                 dump_throttled_rt_tasks(rt_rq);
 965                         }
 966                 } else {
 967                         /*
 968                          * In case we did anyway, make it go away,
 969                          * replenishment is a joke, since it will replenish us
 970                          * with exactly 0 ns.
 971                          */
 972                         rt_rq->rt_time = 0;
 973                 }
 974
 975                 if (rt_rq_throttled(rt_rq)) {
 976                         sched_rt_rq_dequeue(rt_rq);
 977                         return 1;
 978                 }
 979         }
 980
 981         return 0;
 982 }
 983
 984 #define RT_SCHEDTUNE_INTERVAL 50000000ULL
 985
 986 static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
 987 {
 988         struct sched_rt_entity *rt_se = container_of(timer,
 989                         struct sched_rt_entity,
 990                         schedtune_timer);
 991         struct task_struct *p = rt_task_of(rt_se);
 992         struct rq *rq = task_rq(p);
 993
 994         raw_spin_lock(&rq->lock);
 995
 996         /*
 997          * Nothing to do if:
 998          * - task has switched runqueues
 999          * - task isn't RT anymore
1000          */
1001         if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
1002                 goto out;
1003
1004         /*
1005          * If task got enqueued back during callback time, it means we raced
1006          * with the enqueue on another cpu, that's Ok, just do nothing as
1007          * enqueue path would have tried to cancel us and we shouldn't run
1008          * Also check the schedtune_enqueued flag as class-switch on a
1009          * sleeping task may have already canceled the timer and done dq
1010          */
1011         if (p->on_rq || !rt_se->schedtune_enqueued)
1012                 goto out;
1013
1014         /*
1015          * RT task is no longer active, cancel boost
1016          */
1017         rt_se->schedtune_enqueued = false;
1018         schedtune_dequeue_task(p, cpu_of(rq));
1019         cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
1020 out:
1021         raw_spin_unlock(&rq->lock);
1022
1023         /*
1024          * This can free the task_struct if no more references.
1025          */
1026         put_task_struct(p);
1027
1028         return HRTIMER_NORESTART;
1029 }
1030
1031 void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
1032 {
1033         struct hrtimer *timer = &rt_se->schedtune_timer;
1034
1035         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1036         timer->function = rt_schedtune_timer;
1037         rt_se->schedtune_enqueued = false;
1038 }
1039
1040 static void start_schedtune_timer(struct sched_rt_entity *rt_se)
1041 {
1042         struct hrtimer *timer = &rt_se->schedtune_timer;
1043
1044         hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
1045                         HRTIMER_MODE_REL_PINNED);
1046 }
1047
1048 /*
1049  * Update the current task's runtime statistics. Skip current tasks that
1050  * are not in our scheduling class.
1051  */
1052 static void update_curr_rt(struct rq *rq)
1053 {
1054         struct task_struct *curr = rq->curr;
1055         struct sched_rt_entity *rt_se = &curr->rt;
1056         u64 delta_exec;
1057
1058         if (curr->sched_class != &rt_sched_class)
1059                 return;
1060
1061         delta_exec = rq_clock_task(rq) - curr->se.exec_start;
1062         if (unlikely((s64)delta_exec <= 0))
1063                 return;
1064
1065         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1066         cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
1067
1068         schedstat_set(curr->se.statistics.exec_max,
1069                       max(curr->se.statistics.exec_max, delta_exec));
1070
1071         curr->se.sum_exec_runtime += delta_exec;
1072         account_group_exec_runtime(curr, delta_exec);
1073
1074         curr->se.exec_start = rq_clock_task(rq);
1075         cpuacct_charge(curr, delta_exec);
1076
1077         sched_rt_avg_update(rq, delta_exec);
1078
1079         if (!rt_bandwidth_enabled())
1080                 return;
1081
1082         for_each_sched_rt_entity(rt_se) {
1083                 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1084
1085                 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1086                         raw_spin_lock(&rt_rq->rt_runtime_lock);
1087                         rt_rq->rt_time += delta_exec;
1088                         if (sched_rt_runtime_exceeded(rt_rq))
1089                                 resched_curr(rq);
1090                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
1091                 }
1092         }
1093 }
1094
1095 static void
1096 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1097 {
1098         struct rq *rq = rq_of_rt_rq(rt_rq);
1099
1100         BUG_ON(&rq->rt != rt_rq);
1101
1102         if (!rt_rq->rt_queued)
1103                 return;
1104
1105         BUG_ON(!rq->nr_running);
1106
1107         sub_nr_running(rq, rt_rq->rt_nr_running);
1108         rt_rq->rt_queued = 0;
1109 }
1110
1111 static void
1112 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1113 {
1114         struct rq *rq = rq_of_rt_rq(rt_rq);
1115
1116         BUG_ON(&rq->rt != rt_rq);
1117
1118         if (rt_rq->rt_queued)
1119                 return;
1120         if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1121                 return;
1122
1123         add_nr_running(rq, rt_rq->rt_nr_running);
1124         rt_rq->rt_queued = 1;
1125 }
1126
1127 #if defined CONFIG_SMP
1128
1129 static void
1130 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1131 {
1132         struct rq *rq = rq_of_rt_rq(rt_rq);
1133
1134 #ifdef CONFIG_RT_GROUP_SCHED
1135         /*
1136          * Change rq's cpupri only if rt_rq is the top queue.
1137          */
1138         if (&rq->rt != rt_rq)
1139                 return;
1140 #endif
1141         if (rq->online && prio < prev_prio)
1142                 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1143 }
1144
1145 static void
1146 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1147 {
1148         struct rq *rq = rq_of_rt_rq(rt_rq);
1149
1150 #ifdef CONFIG_RT_GROUP_SCHED
1151         /*
1152          * Change rq's cpupri only if rt_rq is the top queue.
1153          */
1154         if (&rq->rt != rt_rq)
1155                 return;
1156 #endif
1157         if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1158                 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1159 }
1160
1161 #else /* CONFIG_SMP */
1162
1163 static inline
1164 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1165 static inline
1166 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1167
1168 #endif /* CONFIG_SMP */
1169
1170 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1171 static void
1172 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1173 {
1174         int prev_prio = rt_rq->highest_prio.curr;
1175
1176         if (prio < prev_prio)
1177                 rt_rq->highest_prio.curr = prio;
1178
1179         inc_rt_prio_smp(rt_rq, prio, prev_prio);
1180 }
1181
1182 static void
1183 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1184 {
1185         int prev_prio = rt_rq->highest_prio.curr;
1186
1187         if (rt_rq->rt_nr_running) {
1188
1189                 WARN_ON(prio < prev_prio);
1190
1191                 /*
1192                  * This may have been our highest task, and therefore
1193                  * we may have some recomputation to do
1194                  */
1195                 if (prio == prev_prio) {
1196                         struct rt_prio_array *array = &rt_rq->active;
1197
1198                         rt_rq->highest_prio.curr =
1199                                 sched_find_first_bit(array->bitmap);
1200                 }
1201
1202         } else
1203                 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1204
1205         dec_rt_prio_smp(rt_rq, prio, prev_prio);
1206 }
1207
1208 #else
1209
1210 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1211 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1212
1213 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1214
1215 #ifdef CONFIG_RT_GROUP_SCHED
1216
1217 static void
1218 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1219 {
1220         if (rt_se_boosted(rt_se))
1221                 rt_rq->rt_nr_boosted++;
1222
1223         if (rt_rq->tg)
1224                 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1225 }
1226
1227 static void
1228 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1229 {
1230         if (rt_se_boosted(rt_se))
1231                 rt_rq->rt_nr_boosted--;
1232
1233         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1234 }
1235
1236 #else /* CONFIG_RT_GROUP_SCHED */
1237
1238 static void
1239 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1240 {
1241         start_rt_bandwidth(&def_rt_bandwidth);
1242 }
1243
1244 static inline
1245 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1246
1247 #endif /* CONFIG_RT_GROUP_SCHED */
1248
1249 #ifdef CONFIG_SCHED_HMP
1250
1251 static void
1252 inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
1253 {
1254         inc_cumulative_runnable_avg(&rq->hmp_stats, p);
1255 }
1256
1257 static void
1258 dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
1259 {
1260         dec_cumulative_runnable_avg(&rq->hmp_stats, p);
1261 }
1262
1263 static void
1264 fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p,
1265                          u32 new_task_load, u32 new_pred_demand)
1266 {
1267         s64 task_load_delta = (s64)new_task_load - task_load(p);
1268         s64 pred_demand_delta = PRED_DEMAND_DELTA;
1269
1270         fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
1271                                       pred_demand_delta);
1272 }
1273
1274 #else   /* CONFIG_SCHED_HMP */
1275
1276 static inline void
1277 inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
1278
1279 static inline void
1280 dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
1281
1282 #endif  /* CONFIG_SCHED_HMP */
1283
1284 static inline
1285 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1286 {
1287         struct rt_rq *group_rq = group_rt_rq(rt_se);
1288
1289         if (group_rq)
1290                 return group_rq->rt_nr_running;
1291         else
1292                 return 1;
1293 }
1294
1295 static inline
1296 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1297 {
1298         int prio = rt_se_prio(rt_se);
1299
1300         WARN_ON(!rt_prio(prio));
1301         rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1302
1303         inc_rt_prio(rt_rq, prio);
1304         inc_rt_migration(rt_se, rt_rq);
1305         inc_rt_group(rt_se, rt_rq);
1306 }
1307
1308 static inline
1309 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1310 {
1311         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1312         WARN_ON(!rt_rq->rt_nr_running);
1313         rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1314
1315         dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1316         dec_rt_migration(rt_se, rt_rq);
1317         dec_rt_group(rt_se, rt_rq);
1318 }
1319
1320 /*
1321  * Change rt_se->run_list location unless SAVE && !MOVE
1322  *
1323  * assumes ENQUEUE/DEQUEUE flags match
1324  */
1325 static inline bool move_entity(unsigned int flags)
1326 {
1327         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1328                 return false;
1329
1330         return true;
1331 }
1332
1333 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1334 {
1335         list_del_init(&rt_se->run_list);
1336
1337         if (list_empty(array->queue + rt_se_prio(rt_se)))
1338                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1339
1340         rt_se->on_list = 0;
1341 }
1342
1343 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1344 {
1345         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1346         struct rt_prio_array *array = &rt_rq->active;
1347         struct rt_rq *group_rq = group_rt_rq(rt_se);
1348         struct list_head *queue = array->queue + rt_se_prio(rt_se);
1349
1350         /*
1351          * Don't enqueue the group if its throttled, or when empty.
1352          * The latter is a consequence of the former when a child group
1353          * get throttled and the current group doesn't have any other
1354          * active members.
1355          */
1356         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1357                 if (rt_se->on_list)
1358                         __delist_rt_entity(rt_se, array);
1359                 return;
1360         }
1361
1362         if (move_entity(flags)) {
1363                 WARN_ON_ONCE(rt_se->on_list);
1364                 if (flags & ENQUEUE_HEAD)
1365                         list_add(&rt_se->run_list, queue);
1366                 else
1367                         list_add_tail(&rt_se->run_list, queue);
1368
1369                 __set_bit(rt_se_prio(rt_se), array->bitmap);
1370                 rt_se->on_list = 1;
1371         }
1372         rt_se->on_rq = 1;
1373
1374         inc_rt_tasks(rt_se, rt_rq);
1375 }
1376
1377 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1378 {
1379         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1380         struct rt_prio_array *array = &rt_rq->active;
1381
1382         if (move_entity(flags)) {
1383                 WARN_ON_ONCE(!rt_se->on_list);
1384                 __delist_rt_entity(rt_se, array);
1385         }
1386         rt_se->on_rq = 0;
1387
1388         dec_rt_tasks(rt_se, rt_rq);
1389 }
1390
1391 /*
1392  * Because the prio of an upper entry depends on the lower
1393  * entries, we must remove entries top - down.
1394  */
1395 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1396 {
1397         struct sched_rt_entity *back = NULL;
1398
1399         for_each_sched_rt_entity(rt_se) {
1400                 rt_se->back = back;
1401                 back = rt_se;
1402         }
1403
1404         dequeue_top_rt_rq(rt_rq_of_se(back));
1405
1406         for (rt_se = back; rt_se; rt_se = rt_se->back) {
1407                 if (on_rt_rq(rt_se))
1408                         __dequeue_rt_entity(rt_se, flags);
1409         }
1410 }
1411
1412 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1413 {
1414         struct rq *rq = rq_of_rt_se(rt_se);
1415
1416         dequeue_rt_stack(rt_se, flags);
1417         for_each_sched_rt_entity(rt_se)
1418                 __enqueue_rt_entity(rt_se, flags);
1419         enqueue_top_rt_rq(&rq->rt);
1420 }
1421
1422 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1423 {
1424         struct rq *rq = rq_of_rt_se(rt_se);
1425
1426         dequeue_rt_stack(rt_se, flags);
1427
1428         for_each_sched_rt_entity(rt_se) {
1429                 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1430
1431                 if (rt_rq && rt_rq->rt_nr_running)
1432                         __enqueue_rt_entity(rt_se, flags);
1433         }
1434         enqueue_top_rt_rq(&rq->rt);
1435 }
1436
1437 /*
1438  * Adding/removing a task to/from a priority array:
1439  */
1440 static void
1441 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1442 {
1443         struct sched_rt_entity *rt_se = &p->rt;
1444
1445         if (flags & ENQUEUE_WAKEUP)
1446                 rt_se->timeout = 0;
1447
1448         enqueue_rt_entity(rt_se, flags);
1449         inc_hmp_sched_stats_rt(rq, p);
1450
1451         if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1452                 enqueue_pushable_task(rq, p);
1453
1454         if (!schedtune_task_boost(p))
1455                 return;
1456
1457         /*
1458          * If schedtune timer is active, that means a boost was already
1459          * done, just cancel the timer so that deboost doesn't happen.
1460          * Otherwise, increase the boost. If an enqueued timer was
1461          * cancelled, put the task reference.
1462          */
1463         if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
1464                 put_task_struct(p);
1465
1466         /*
1467          * schedtune_enqueued can be true in the following situation:
1468          * enqueue_task_rt grabs rq lock before timer fires
1469          *    or before its callback acquires rq lock
1470          * schedtune_enqueued can be false if timer callback is running
1471          * and timer just released rq lock, or if the timer finished
1472          * running and canceling the boost
1473          */
1474         if (rt_se->schedtune_enqueued)
1475                 return;
1476
1477         rt_se->schedtune_enqueued = true;
1478         schedtune_enqueue_task(p, cpu_of(rq));
1479         cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
1480 }
1481
1482 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1483 {
1484         struct sched_rt_entity *rt_se = &p->rt;
1485
1486         update_curr_rt(rq);
1487         dequeue_rt_entity(rt_se, flags);
1488         dec_hmp_sched_stats_rt(rq, p);
1489
1490         dequeue_pushable_task(rq, p);
1491
1492         if (!rt_se->schedtune_enqueued)
1493                 return;
1494
1495         if (flags == DEQUEUE_SLEEP) {
1496                 get_task_struct(p);
1497                 start_schedtune_timer(rt_se);
1498                 return;
1499         }
1500
1501         rt_se->schedtune_enqueued = false;
1502         schedtune_dequeue_task(p, cpu_of(rq));
1503         cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
1504 }
1505
1506 /*
1507  * Put task to the head or the end of the run list without the overhead of
1508  * dequeue followed by enqueue.
1509  */
1510 static void
1511 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1512 {
1513         if (on_rt_rq(rt_se)) {
1514                 struct rt_prio_array *array = &rt_rq->active;
1515                 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1516
1517                 if (head)
1518                         list_move(&rt_se->run_list, queue);
1519                 else
1520                         list_move_tail(&rt_se->run_list, queue);
1521         }
1522 }
1523
1524 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1525 {
1526         struct sched_rt_entity *rt_se = &p->rt;
1527         struct rt_rq *rt_rq;
1528
1529         for_each_sched_rt_entity(rt_se) {
1530                 rt_rq = rt_rq_of_se(rt_se);
1531                 requeue_rt_entity(rt_rq, rt_se, head);
1532         }
1533 }
1534
1535 static void yield_task_rt(struct rq *rq)
1536 {
1537         requeue_task_rt(rq, rq->curr, 0);
1538 }
1539
1540 #ifdef CONFIG_SMP
1541 static int find_lowest_rq(struct task_struct *task);
1542
1543 #ifdef CONFIG_SCHED_HMP
1544 static int
1545 select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
1546 {
1547         int target;
1548
1549         rcu_read_lock();
1550         target = find_lowest_rq(p);
1551         if (target != -1)
1552                 cpu = target;
1553         rcu_read_unlock();
1554
1555         return cpu;
1556 }
1557 #endif
1558
1559 /*
1560  * Return whether the task on the given cpu is currently non-preemptible
1561  * while handling a potentially long softint, or if the task is likely
1562  * to block preemptions soon because it is a ksoftirq thread that is
1563  * handling slow softints.
1564  */
1565 bool
1566 task_may_not_preempt(struct task_struct *task, int cpu)
1567 {
1568         __u32 softirqs = per_cpu(active_softirqs, cpu) |
1569                          __IRQ_STAT(cpu, __softirq_pending);
1570         struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
1571
1572         return ((softirqs & LONG_SOFTIRQ_MASK) &&
1573                 (task == cpu_ksoftirqd ||
1574                  task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
1575 }
1576
1577 /*
1578  * Perform a schedtune dequeue and cancelation of boost timers if needed.
1579  * Should be called only with the rq->lock held.
1580  */
1581 static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
1582 {
1583         struct sched_rt_entity *rt_se = &p->rt;
1584
1585         BUG_ON(!raw_spin_is_locked(&rq->lock));
1586
1587         if (!rt_se->schedtune_enqueued)
1588                 return;
1589
1590         /*
1591          * Incase of class change cancel any active timers. If an enqueued
1592          * timer was cancelled, put the task ref.
1593          */
1594         if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
1595                 put_task_struct(p);
1596
1597         /* schedtune_enqueued is true, deboost it */
1598         rt_se->schedtune_enqueued = false;
1599         schedtune_dequeue_task(p, task_cpu(p));
1600         cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
1601 }
1602
1603 static int
1604 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
1605                   int sibling_count_hint)
1606 {
1607         struct task_struct *curr;
1608         struct rq *rq;
1609         bool may_not_preempt;
1610
1611 #ifdef CONFIG_SCHED_HMP
1612         return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
1613 #endif
1614
1615         /* For anything but wake ups, just return the task_cpu */
1616         if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1617                 goto out;
1618
1619         rq = cpu_rq(cpu);
1620
1621         rcu_read_lock();
1622         curr = READ_ONCE(rq->curr); /* unlocked access */
1623
1624         /*
1625          * If the current task on @p's runqueue is a softirq task,
1626          * it may run without preemption for a time that is
1627          * ill-suited for a waiting RT task. Therefore, try to
1628          * wake this RT task on another runqueue.
1629          *
1630          * Also, if the current task on @p's runqueue is an RT task, then
1631          * it may run without preemption for a time that is
1632          * ill-suited for a waiting RT task. Therefore, try to
1633          * wake this RT task on another runqueue.
1634          *
1635          * Also, if the current task on @p's runqueue is an RT task, then
1636          * try to see if we can wake this RT task up on another
1637          * runqueue. Otherwise simply start this RT task
1638          * on its current runqueue.
1639          *
1640          * We want to avoid overloading runqueues. If the woken
1641          * task is a higher priority, then it will stay on this CPU
1642          * and the lower prio task should be moved to another CPU.
1643          * Even though this will probably make the lower prio task
1644          * lose its cache, we do not want to bounce a higher task
1645          * around just because it gave up its CPU, perhaps for a
1646          * lock?
1647          *
1648          * For equal prio tasks, we just let the scheduler sort it out.
1649          *
1650          * Otherwise, just let it ride on the affined RQ and the
1651          * post-schedule router will push the preempted task away
1652          *
1653          * This test is optimistic, if we get it wrong the load-balancer
1654          * will have to sort it out.
1655          */
1656         may_not_preempt = task_may_not_preempt(curr, cpu);
1657         if (may_not_preempt ||
1658             (unlikely(rt_task(curr)) &&
1659             (curr->nr_cpus_allowed < 2 ||
1660              curr->prio <= p->prio))) {
1661                 int target = find_lowest_rq(p);
1662
1663                 /*
1664                  * If cpu is non-preemptible, prefer remote cpu
1665                  * even if it's running a higher-prio task.
1666                  * Otherwise: Don't bother moving it if the
1667                  * destination CPU is not running a lower priority task.
1668                  */
1669                 if (target != -1 &&
1670                    (may_not_preempt ||
1671                     p->prio < cpu_rq(target)->rt.highest_prio.curr))
1672                         cpu = target;
1673         }
1674         rcu_read_unlock();
1675
1676 out:
1677         /*
1678          * If previous CPU was different, make sure to cancel any active
1679          * schedtune timers and deboost.
1680          */
1681         if (task_cpu(p) != cpu) {
1682                 unsigned long fl;
1683                 struct rq *prq = task_rq(p);
1684
1685                 raw_spin_lock_irqsave(&prq->lock, fl);
1686                 schedtune_dequeue_rt(prq, p);
1687                 raw_spin_unlock_irqrestore(&prq->lock, fl);
1688         }
1689
1690         return cpu;
1691 }
1692
1693 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1694 {
1695         /*
1696          * Current can't be migrated, useless to reschedule,
1697          * let's hope p can move out.
1698          */
1699         if (rq->curr->nr_cpus_allowed == 1 ||
1700             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1701                 return;
1702
1703         /*
1704          * p is migratable, so let's not schedule it and
1705          * see if it is pushed or pulled somewhere else.
1706          */
1707         if (p->nr_cpus_allowed != 1
1708             && cpupri_find(&rq->rd->cpupri, p, NULL))
1709                 return;
1710
1711         /*
1712          * There appears to be other cpus that can accept
1713          * current and none to run 'p', so lets reschedule
1714          * to try and push current away:
1715          */
1716         requeue_task_rt(rq, p, 1);
1717         resched_curr(rq);
1718 }
1719
1720 #endif /* CONFIG_SMP */
1721
1722 /*
1723  * Preempt the current task with a newly woken task if needed:
1724  */
1725 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1726 {
1727         if (p->prio < rq->curr->prio) {
1728                 resched_curr(rq);
1729                 return;
1730         }
1731
1732 #ifdef CONFIG_SMP
1733         /*
1734          * If:
1735          *
1736          * - the newly woken task is of equal priority to the current task
1737          * - the newly woken task is non-migratable while current is migratable
1738          * - current will be preempted on the next reschedule
1739          *
1740          * we should check to see if current can readily move to a different
1741          * cpu.  If so, we will reschedule to allow the push logic to try
1742          * to move current somewhere else, making room for our non-migratable
1743          * task.
1744          */
1745         if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1746                 check_preempt_equal_prio(rq, p);
1747 #endif
1748 }
1749
1750 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1751                                                    struct rt_rq *rt_rq)
1752 {
1753         struct rt_prio_array *array = &rt_rq->active;
1754         struct sched_rt_entity *next = NULL;
1755         struct list_head *queue;
1756         int idx;
1757
1758         idx = sched_find_first_bit(array->bitmap);
1759         BUG_ON(idx >= MAX_RT_PRIO);
1760
1761         queue = array->queue + idx;
1762         next = list_entry(queue->next, struct sched_rt_entity, run_list);
1763
1764         return next;
1765 }
1766
1767 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1768 {
1769         struct sched_rt_entity *rt_se;
1770         struct task_struct *p;
1771         struct rt_rq *rt_rq  = &rq->rt;
1772
1773         do {
1774                 rt_se = pick_next_rt_entity(rq, rt_rq);
1775                 BUG_ON(!rt_se);
1776                 rt_rq = group_rt_rq(rt_se);
1777         } while (rt_rq);
1778
1779         p = rt_task_of(rt_se);
1780         p->se.exec_start = rq_clock_task(rq);
1781
1782         return p;
1783 }
1784
1785 static struct task_struct *
1786 pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1787 {
1788         struct task_struct *p;
1789         struct rt_rq *rt_rq = &rq->rt;
1790
1791         if (need_pull_rt_task(rq, prev)) {
1792                 /*
1793                  * This is OK, because current is on_cpu, which avoids it being
1794                  * picked for load-balance and preemption/IRQs are still
1795                  * disabled avoiding further scheduler activity on it and we're
1796                  * being very careful to re-start the picking loop.
1797                  */
1798                 lockdep_unpin_lock(&rq->lock);
1799                 pull_rt_task(rq);
1800                 lockdep_pin_lock(&rq->lock);
1801                 /*
1802                  * pull_rt_task() can drop (and re-acquire) rq->lock; this
1803                  * means a dl or stop task can slip in, in which case we need
1804                  * to re-start task selection.
1805                  */
1806                 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1807                              rq->dl.dl_nr_running))
1808                         return RETRY_TASK;
1809         }
1810
1811         /*
1812          * We may dequeue prev's rt_rq in put_prev_task().
1813          * So, we update time before rt_nr_running check.
1814          */
1815         if (prev->sched_class == &rt_sched_class)
1816                 update_curr_rt(rq);
1817
1818         if (!rt_rq->rt_queued)
1819                 return NULL;
1820
1821         put_prev_task(rq, prev);
1822
1823         p = _pick_next_task_rt(rq);
1824
1825         /* The running task is never eligible for pushing */
1826         dequeue_pushable_task(rq, p);
1827
1828         queue_push_tasks(rq);
1829
1830         return p;
1831 }
1832
1833 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1834 {
1835         update_curr_rt(rq);
1836
1837         /*
1838          * The previous task needs to be made eligible for pushing
1839          * if it is still active
1840          */
1841         if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1842                 enqueue_pushable_task(rq, p);
1843 }
1844
1845 #ifdef CONFIG_SMP
1846
1847 /* Only try algorithms three times */
1848 #define RT_MAX_TRIES 3
1849
1850 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1851 {
1852         if (!task_running(rq, p) &&
1853             cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1854                 return 1;
1855         return 0;
1856 }
1857
1858 /*
1859  * Return the highest pushable rq's task, which is suitable to be executed
1860  * on the cpu, NULL otherwise
1861  */
1862 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1863 {
1864         struct plist_head *head = &rq->rt.pushable_tasks;
1865         struct task_struct *p;
1866
1867         if (!has_pushable_tasks(rq))
1868                 return NULL;
1869
1870         plist_for_each_entry(p, head, pushable_tasks) {
1871                 if (pick_rt_task(rq, p, cpu))
1872                         return p;
1873         }
1874
1875         return NULL;
1876 }
1877
1878 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1879
1880 #ifdef CONFIG_SCHED_HMP
1881
1882 static int find_lowest_rq_hmp(struct task_struct *task)
1883 {
1884         struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask);
1885         struct cpumask candidate_mask = CPU_MASK_NONE;
1886         struct sched_cluster *cluster;
1887         int best_cpu = -1;
1888         int prev_cpu = task_cpu(task);
1889         u64 cpu_load, min_load = ULLONG_MAX;
1890         int i;
1891         int restrict_cluster;
1892         int boost_on_big;
1893         int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
1894
1895         boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
1896                         sched_boost_policy() == SCHED_BOOST_ON_BIG;
1897
1898         restrict_cluster = sysctl_sched_restrict_cluster_spill;
1899
1900         /* Make sure the mask is initialized first */
1901         if (unlikely(!lowest_mask))
1902                 return best_cpu;
1903
1904         if (task->nr_cpus_allowed == 1)
1905                 return best_cpu; /* No other targets possible */
1906
1907         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1908                 return best_cpu; /* No targets found */
1909
1910         pack_task = is_short_burst_task(task);
1911
1912         /*
1913          * At this point we have built a mask of cpus representing the
1914          * lowest priority tasks in the system.  Now we want to elect
1915          * the best one based on our affinity and topology.
1916          */
1917
1918 retry:
1919         for_each_sched_cluster(cluster) {
1920                 if (boost_on_big && cluster->capacity != max_possible_capacity)
1921                         continue;
1922
1923                 cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
1924                 cpumask_andnot(&candidate_mask, &candidate_mask,
1925                                cpu_isolated_mask);
1926                 /*
1927                  * When placement boost is active, if there is no eligible CPU
1928                  * in the highest capacity cluster, we fallback to the other
1929                  * clusters. So clear the CPUs of the traversed cluster from
1930                  * the lowest_mask.
1931                  */
1932                 if (unlikely(boost_on_big))
1933                         cpumask_andnot(lowest_mask, lowest_mask,
1934                                        &cluster->cpus);
1935
1936                 if (cpumask_empty(&candidate_mask))
1937                         continue;
1938
1939                 for_each_cpu(i, &candidate_mask) {
1940                         if (sched_cpu_high_irqload(i))
1941                                 continue;
1942
1943                         cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg;
1944                         if (!restrict_cluster)
1945                                 cpu_load = scale_load_to_cpu(cpu_load, i);
1946
1947                         if (pack_task) {
1948                                 wakeup_latency = cpu_rq(i)->wakeup_latency;
1949
1950                                 if (wakeup_latency > least_wakeup_latency)
1951                                         continue;
1952
1953                                 if (wakeup_latency < least_wakeup_latency) {
1954                                         least_wakeup_latency = wakeup_latency;
1955                                         min_load = cpu_load;
1956                                         best_cpu = i;
1957                                         continue;
1958                                 }
1959                         }
1960
1961                         if (cpu_load < min_load ||
1962                                 (cpu_load == min_load &&
1963                                 (i == prev_cpu || (best_cpu != prev_cpu &&
1964                                 cpus_share_cache(prev_cpu, i))))) {
1965                                 min_load = cpu_load;
1966                                 best_cpu = i;
1967                         }
1968                 }
1969
1970                 if (restrict_cluster && best_cpu != -1)
1971                         break;
1972         }
1973
1974         if (unlikely(boost_on_big && best_cpu == -1)) {
1975                 boost_on_big = 0;
1976                 goto retry;
1977         }
1978
1979         return best_cpu;
1980 }
1981 #endif  /* CONFIG_SCHED_HMP */
1982
1983 static int find_lowest_rq(struct task_struct *task)
1984 {
1985         struct sched_domain *sd;
1986         struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1987         int this_cpu = smp_processor_id();
1988         int cpu      = task_cpu(task);
1989
1990 #ifdef CONFIG_SCHED_HMP
1991         return find_lowest_rq_hmp(task);
1992 #endif
1993
1994         /* Make sure the mask is initialized first */
1995         if (unlikely(!lowest_mask))
1996                 return -1;
1997
1998         if (task->nr_cpus_allowed == 1)
1999                 return -1; /* No other targets possible */
2000
2001         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
2002                 return -1; /* No targets found */
2003
2004         /*
2005          * At this point we have built a mask of cpus representing the
2006          * lowest priority tasks in the system.  Now we want to elect
2007          * the best one based on our affinity and topology.
2008          *
2009          * We prioritize the last cpu that the task executed on since
2010          * it is most likely cache-hot in that location.
2011          */
2012         if (cpumask_test_cpu(cpu, lowest_mask))
2013                 return cpu;
2014
2015         /*
2016          * Otherwise, we consult the sched_domains span maps to figure
2017          * out which cpu is logically closest to our hot cache data.
2018          */
2019         if (!cpumask_test_cpu(this_cpu, lowest_mask))
2020                 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
2021
2022         rcu_read_lock();
2023         for_each_domain(cpu, sd) {
2024                 if (sd->flags & SD_WAKE_AFFINE) {
2025                         int best_cpu;
2026
2027                         /*
2028                          * "this_cpu" is cheaper to preempt than a
2029                          * remote processor.
2030                          */
2031                         if (this_cpu != -1 &&
2032                             cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
2033                                 rcu_read_unlock();
2034                                 return this_cpu;
2035                         }
2036
2037                         best_cpu = cpumask_first_and(lowest_mask,
2038                                                      sched_domain_span(sd));
2039                         if (best_cpu < nr_cpu_ids) {
2040                                 rcu_read_unlock();
2041                                 return best_cpu;
2042                         }
2043                 }
2044         }
2045         rcu_read_unlock();
2046
2047         /*
2048          * And finally, if there were no matches within the domains
2049          * just give the caller *something* to work with from the compatible
2050          * locations.
2051          */
2052         if (this_cpu != -1)
2053                 return this_cpu;
2054
2055         cpu = cpumask_any(lowest_mask);
2056         if (cpu < nr_cpu_ids)
2057                 return cpu;
2058         return -1;
2059 }
2060
2061 /* Will lock the rq it finds */
2062 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
2063 {
2064         struct rq *lowest_rq = NULL;
2065         int tries;
2066         int cpu;
2067
2068         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
2069                 cpu = find_lowest_rq(task);
2070
2071                 if ((cpu == -1) || (cpu == rq->cpu))
2072                         break;
2073
2074                 lowest_rq = cpu_rq(cpu);
2075
2076                 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
2077                         /*
2078                          * Target rq has tasks of equal or higher priority,
2079                          * retrying does not release any lock and is unlikely
2080                          * to yield a different result.
2081                          */
2082                         lowest_rq = NULL;
2083                         break;
2084                 }
2085
2086                 /* if the prio of this runqueue changed, try again */
2087                 if (double_lock_balance(rq, lowest_rq)) {
2088                         /*
2089                          * We had to unlock the run queue. In
2090                          * the mean time, task could have
2091                          * migrated already or had its affinity changed.
2092                          * Also make sure that it wasn't scheduled on its rq.
2093                          */
2094                         if (unlikely(task_rq(task) != rq ||
2095                                      !cpumask_test_cpu(lowest_rq->cpu,
2096                                                        tsk_cpus_allowed(task)) ||
2097                                      task_running(rq, task) ||
2098                                      !task_on_rq_queued(task))) {
2099
2100                                 double_unlock_balance(rq, lowest_rq);
2101                                 lowest_rq = NULL;
2102                                 break;
2103                         }
2104                 }
2105
2106                 /* If this rq is still suitable use it. */
2107                 if (lowest_rq->rt.highest_prio.curr > task->prio)
2108                         break;
2109
2110                 /* try again */
2111                 double_unlock_balance(rq, lowest_rq);
2112                 lowest_rq = NULL;
2113         }
2114
2115         return lowest_rq;
2116 }
2117
2118 static struct task_struct *pick_next_pushable_task(struct rq *rq)
2119 {
2120         struct task_struct *p;
2121
2122         if (!has_pushable_tasks(rq))
2123                 return NULL;
2124
2125         p = plist_first_entry(&rq->rt.pushable_tasks,
2126                               struct task_struct, pushable_tasks);
2127
2128         BUG_ON(rq->cpu != task_cpu(p));
2129         BUG_ON(task_current(rq, p));
2130         BUG_ON(p->nr_cpus_allowed <= 1);
2131
2132         BUG_ON(!task_on_rq_queued(p));
2133         BUG_ON(!rt_task(p));
2134
2135         return p;
2136 }
2137
2138 /*
2139  * If the current CPU has more than one RT task, see if the non
2140  * running task can migrate over to a CPU that is running a task
2141  * of lesser priority.
2142  */
2143 static int push_rt_task(struct rq *rq)
2144 {
2145         struct task_struct *next_task;
2146         struct rq *lowest_rq;
2147         int ret = 0;
2148
2149         if (!rq->rt.overloaded)
2150                 return 0;
2151
2152         next_task = pick_next_pushable_task(rq);
2153         if (!next_task)
2154                 return 0;
2155
2156 retry:
2157         if (unlikely(next_task == rq->curr)) {
2158                 WARN_ON(1);
2159                 return 0;
2160         }
2161
2162         /*
2163          * It's possible that the next_task slipped in of
2164          * higher priority than current. If that's the case
2165          * just reschedule current.
2166          */
2167         if (unlikely(next_task->prio < rq->curr->prio)) {
2168                 resched_curr(rq);
2169                 return 0;
2170         }
2171
2172         /* We might release rq lock */
2173         get_task_struct(next_task);
2174
2175         /* find_lock_lowest_rq locks the rq if found */
2176         lowest_rq = find_lock_lowest_rq(next_task, rq);
2177         if (!lowest_rq) {
2178                 struct task_struct *task;
2179                 /*
2180                  * find_lock_lowest_rq releases rq->lock
2181                  * so it is possible that next_task has migrated.
2182                  *
2183                  * We need to make sure that the task is still on the same
2184                  * run-queue and is also still the next task eligible for
2185                  * pushing.
2186                  */
2187                 task = pick_next_pushable_task(rq);
2188                 if (task_cpu(next_task) == rq->cpu && task == next_task) {
2189                         /*
2190                          * The task hasn't migrated, and is still the next
2191                          * eligible task, but we failed to find a run-queue
2192                          * to push it to.  Do not retry in this case, since
2193                          * other cpus will pull from us when ready.
2194                          */
2195                         goto out;
2196                 }
2197
2198                 if (!task)
2199                         /* No more tasks, just exit */
2200                         goto out;
2201
2202                 /*
2203                  * Something has shifted, try again.
2204                  */
2205                 put_task_struct(next_task);
2206                 next_task = task;
2207                 goto retry;
2208         }
2209
2210         next_task->on_rq = TASK_ON_RQ_MIGRATING;
2211         deactivate_task(rq, next_task, 0);
2212         next_task->on_rq = TASK_ON_RQ_MIGRATING;
2213         set_task_cpu(next_task, lowest_rq->cpu);
2214         next_task->on_rq = TASK_ON_RQ_QUEUED;
2215         activate_task(lowest_rq, next_task, 0);
2216         next_task->on_rq = TASK_ON_RQ_QUEUED;
2217         ret = 1;
2218
2219         resched_curr(lowest_rq);
2220
2221         double_unlock_balance(rq, lowest_rq);
2222
2223 out:
2224         put_task_struct(next_task);
2225
2226         return ret;
2227 }
2228
2229 static void push_rt_tasks(struct rq *rq)
2230 {
2231         /* push_rt_task will return true if it moved an RT */
2232         while (push_rt_task(rq))
2233                 ;
2234 }
2235
2236 #ifdef HAVE_RT_PUSH_IPI
2237
2238 /*
2239  * When a high priority task schedules out from a CPU and a lower priority
2240  * task is scheduled in, a check is made to see if there's any RT tasks
2241  * on other CPUs that are waiting to run because a higher priority RT task
2242  * is currently running on its CPU. In this case, the CPU with multiple RT
2243  * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2244  * up that may be able to run one of its non-running queued RT tasks.
2245  *
2246  * All CPUs with overloaded RT tasks need to be notified as there is currently
2247  * no way to know which of these CPUs have the highest priority task waiting
2248  * to run. Instead of trying to take a spinlock on each of these CPUs,
2249  * which has shown to cause large latency when done on machines with many
2250  * CPUs, sending an IPI to the CPUs to have them push off the overloaded
2251  * RT tasks waiting to run.
2252  *
2253  * Just sending an IPI to each of the CPUs is also an issue, as on large
2254  * count CPU machines, this can cause an IPI storm on a CPU, especially
2255  * if its the only CPU with multiple RT tasks queued, and a large number
2256  * of CPUs scheduling a lower priority task at the same time.
2257  *
2258  * Each root domain has its own irq work function that can iterate over
2259  * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2260  * tassk must be checked if there's one or many CPUs that are lowering
2261  * their priority, there's a single irq work iterator that will try to
2262  * push off RT tasks that are waiting to run.
2263  *
2264  * When a CPU schedules a lower priority task, it will kick off the
2265  * irq work iterator that will jump to each CPU with overloaded RT tasks.
2266  * As it only takes the first CPU that schedules a lower priority task
2267  * to start the process, the rto_start variable is incremented and if
2268  * the atomic result is one, then that CPU will try to take the rto_lock.
2269  * This prevents high contention on the lock as the process handles all
2270  * CPUs scheduling lower priority tasks.
2271  *
2272  * All CPUs that are scheduling a lower priority task will increment the
2273  * rt_loop_next variable. This will make sure that the irq work iterator
2274  * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2275  * priority task, even if the iterator is in the middle of a scan. Incrementing
2276  * the rt_loop_next will cause the iterator to perform another scan.
2277  *
2278  */
2279 static int rto_next_cpu(struct rq *rq)
2280 {
2281         struct root_domain *rd = rq->rd;
2282         int next;
2283         int cpu;
2284
2285         /*
2286          * When starting the IPI RT pushing, the rto_cpu is set to -1,
2287          * rt_next_cpu() will simply return the first CPU found in
2288          * the rto_mask.
2289          *
2290          * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
2291          * will return the next CPU found in the rto_mask.
2292          *
2293          * If there are no more CPUs left in the rto_mask, then a check is made
2294          * against rto_loop and rto_loop_next. rto_loop is only updated with
2295          * the rto_lock held, but any CPU may increment the rto_loop_next
2296          * without any locking.
2297          */
2298         for (;;) {
2299
2300                 /* When rto_cpu is -1 this acts like cpumask_first() */
2301                 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2302
2303                 rd->rto_cpu = cpu;
2304
2305                 if (cpu < nr_cpu_ids)
2306                         return cpu;
2307
2308                 rd->rto_cpu = -1;
2309
2310                 /*
2311                  * ACQUIRE ensures we see the @rto_mask changes
2312                  * made prior to the @next value observed.
2313                  *
2314                  * Matches WMB in rt_set_overload().
2315                  */
2316                 next = atomic_read_acquire(&rd->rto_loop_next);
2317
2318                 if (rd->rto_loop == next)
2319                         break;
2320
2321                 rd->rto_loop = next;
2322         }
2323
2324         return -1;
2325 }
2326
2327 static inline bool rto_start_trylock(atomic_t *v)
2328 {
2329         return !atomic_cmpxchg_acquire(v, 0, 1);
2330 }
2331
2332 static inline void rto_start_unlock(atomic_t *v)
2333 {
2334         atomic_set_release(v, 0);
2335 }
2336
2337 static void tell_cpu_to_push(struct rq *rq)
2338 {
2339         int cpu = -1;
2340
2341         /* Keep the loop going if the IPI is currently active */
2342         atomic_inc(&rq->rd->rto_loop_next);
2343
2344         /* Only one CPU can initiate a loop at a time */
2345         if (!rto_start_trylock(&rq->rd->rto_loop_start))
2346                 return;
2347
2348         raw_spin_lock(&rq->rd->rto_lock);
2349
2350         /*
2351          * The rto_cpu is updated under the lock, if it has a valid cpu
2352          * then the IPI is still running and will continue due to the
2353          * update to loop_next, and nothing needs to be done here.
2354          * Otherwise it is finishing up and an ipi needs to be sent.
2355          */
2356         if (rq->rd->rto_cpu < 0)
2357                 cpu = rto_next_cpu(rq);
2358
2359         raw_spin_unlock(&rq->rd->rto_lock);
2360
2361         rto_start_unlock(&rq->rd->rto_loop_start);
2362
2363         if (cpu >= 0)
2364                 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2365 }
2366
2367 /* Called from hardirq context */
2368 void rto_push_irq_work_func(struct irq_work *work)
2369 {
2370         struct rq *rq;
2371         int cpu;
2372
2373         rq = this_rq();
2374
2375         /*
2376          * We do not need to grab the lock to check for has_pushable_tasks.
2377          * When it gets updated, a check is made if a push is possible.
2378          */
2379         if (has_pushable_tasks(rq)) {
2380                 raw_spin_lock(&rq->lock);
2381                 push_rt_tasks(rq);
2382                 raw_spin_unlock(&rq->lock);
2383         }
2384
2385         raw_spin_lock(&rq->rd->rto_lock);
2386
2387         /* Pass the IPI to the next rt overloaded queue */
2388         cpu = rto_next_cpu(rq);
2389
2390         raw_spin_unlock(&rq->rd->rto_lock);
2391
2392         if (cpu < 0)
2393                 return;
2394
2395         /* Try the next RT overloaded CPU */
2396         irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2397 }
2398 #endif /* HAVE_RT_PUSH_IPI */
2399
2400 static void pull_rt_task(struct rq *this_rq)
2401 {
2402         int this_cpu = this_rq->cpu, cpu;
2403         bool resched = false;
2404         struct task_struct *p;
2405         struct rq *src_rq;
2406         int rt_overload_count = rt_overloaded(this_rq);
2407
2408         if (likely(!rt_overload_count))
2409                 return;
2410
2411         /*
2412          * Match the barrier from rt_set_overloaded; this guarantees that if we
2413          * see overloaded we must also see the rto_mask bit.
2414          */
2415         smp_rmb();
2416
2417         /* If we are the only overloaded CPU do nothing */
2418         if (rt_overload_count == 1 &&
2419             cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2420                 return;
2421
2422 #ifdef HAVE_RT_PUSH_IPI
2423         if (sched_feat(RT_PUSH_IPI)) {
2424                 tell_cpu_to_push(this_rq);
2425                 return;
2426         }
2427 #endif
2428
2429         for_each_cpu(cpu, this_rq->rd->rto_mask) {
2430                 if (this_cpu == cpu)
2431                         continue;
2432
2433                 src_rq = cpu_rq(cpu);
2434
2435                 /*
2436                  * Don't bother taking the src_rq->lock if the next highest
2437                  * task is known to be lower-priority than our current task.
2438                  * This may look racy, but if this value is about to go
2439                  * logically higher, the src_rq will push this task away.
2440                  * And if its going logically lower, we do not care
2441                  */
2442                 if (src_rq->rt.highest_prio.next >=
2443                     this_rq->rt.highest_prio.curr)
2444                         continue;
2445
2446                 /*
2447                  * We can potentially drop this_rq's lock in
2448                  * double_lock_balance, and another CPU could
2449                  * alter this_rq
2450                  */
2451                 double_lock_balance(this_rq, src_rq);
2452
2453                 /*
2454                  * We can pull only a task, which is pushable
2455                  * on its rq, and no others.
2456                  */
2457                 p = pick_highest_pushable_task(src_rq, this_cpu);
2458
2459                 /*
2460                  * Do we have an RT task that preempts
2461                  * the to-be-scheduled task?
2462                  */
2463                 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2464                         WARN_ON(p == src_rq->curr);
2465                         WARN_ON(!task_on_rq_queued(p));
2466
2467                         /*
2468                          * There's a chance that p is higher in priority
2469                          * than what's currently running on its cpu.
2470                          * This is just that p is wakeing up and hasn't
2471                          * had a chance to schedule. We only pull
2472                          * p if it is lower in priority than the
2473                          * current task on the run queue
2474                          */
2475                         if (p->prio < src_rq->curr->prio)
2476                                 goto skip;
2477
2478                         resched = true;
2479
2480                         p->on_rq = TASK_ON_RQ_MIGRATING;
2481                         deactivate_task(src_rq, p, 0);
2482                         p->on_rq = TASK_ON_RQ_MIGRATING;
2483                         set_task_cpu(p, this_cpu);
2484                         p->on_rq = TASK_ON_RQ_QUEUED;
2485                         activate_task(this_rq, p, 0);
2486                         p->on_rq = TASK_ON_RQ_QUEUED;
2487                         /*
2488                          * We continue with the search, just in
2489                          * case there's an even higher prio task
2490                          * in another runqueue. (low likelihood
2491                          * but possible)
2492                          */
2493                 }
2494 skip:
2495                 double_unlock_balance(this_rq, src_rq);
2496         }
2497
2498         if (resched)
2499                 resched_curr(this_rq);
2500 }
2501
2502 /*
2503  * If we are not running and we are not going to reschedule soon, we should
2504  * try to push tasks away now
2505  */
2506 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2507 {
2508         if (!task_running(rq, p) &&
2509             !test_tsk_need_resched(rq->curr) &&
2510             p->nr_cpus_allowed > 1 &&
2511             (dl_task(rq->curr) || rt_task(rq->curr)) &&
2512             (rq->curr->nr_cpus_allowed < 2 ||
2513              rq->curr->prio <= p->prio))
2514                 push_rt_tasks(rq);
2515 }
2516
2517 /* Assumes rq->lock is held */
2518 static void rq_online_rt(struct rq *rq)
2519 {
2520         if (rq->rt.overloaded)
2521                 rt_set_overload(rq);
2522
2523         __enable_runtime(rq);
2524
2525         cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2526 }
2527
2528 /* Assumes rq->lock is held */
2529 static void rq_offline_rt(struct rq *rq)
2530 {
2531         if (rq->rt.overloaded)
2532                 rt_clear_overload(rq);
2533
2534         __disable_runtime(rq);
2535
2536         cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2537 }
2538
2539 /*
2540  * When switch from the rt queue, we bring ourselves to a position
2541  * that we might want to pull RT tasks from other runqueues.
2542  */
2543 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2544 {
2545         /*
2546          * On class switch from rt, always cancel active schedtune timers,
2547          * this handles the cases where we switch class for a task that is
2548          * already rt-dequeued but has a running timer.
2549          */
2550         schedtune_dequeue_rt(rq, p);
2551
2552         /*
2553          * If there are other RT tasks then we will reschedule
2554          * and the scheduling of the other RT tasks will handle
2555          * the balancing. But if we are the last RT task
2556          * we may need to handle the pulling of RT tasks
2557          * now.
2558          */
2559         if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
2560                 cpu_isolated(cpu_of(rq)))
2561                 return;
2562
2563         queue_pull_task(rq);
2564 }
2565
2566 void __init init_sched_rt_class(void)
2567 {
2568         unsigned int i;
2569
2570         for_each_possible_cpu(i) {
2571                 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2572                                         GFP_KERNEL, cpu_to_node(i));
2573         }
2574 }
2575
2576 #endif /* CONFIG_SMP */
2577
2578 /*
2579  * When switching a task to RT, we may overload the runqueue
2580  * with RT tasks. In this case we try to push them off to
2581  * other runqueues.
2582  */
2583 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2584 {
2585         /*
2586          * If we are already running, then there's nothing
2587          * that needs to be done. But if we are not running
2588          * we may need to preempt the current running task.
2589          * If that current running task is also an RT task
2590          * then see if we can move to another run queue.
2591          */
2592         if (task_on_rq_queued(p) && rq->curr != p) {
2593 #ifdef CONFIG_SMP
2594                 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2595                         queue_push_tasks(rq);
2596 #endif /* CONFIG_SMP */
2597                 if (p->prio < rq->curr->prio)
2598                         resched_curr(rq);
2599         }
2600 }
2601
2602 /*
2603  * Priority of the task has changed. This may cause
2604  * us to initiate a push or pull.
2605  */
2606 static void
2607 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2608 {
2609         if (!task_on_rq_queued(p))
2610                 return;
2611
2612         if (rq->curr == p) {
2613 #ifdef CONFIG_SMP
2614                 /*
2615                  * If our priority decreases while running, we
2616                  * may need to pull tasks to this runqueue.
2617                  */
2618                 if (oldprio < p->prio)
2619                         queue_pull_task(rq);
2620
2621                 /*
2622                  * If there's a higher priority task waiting to run
2623                  * then reschedule.
2624                  */
2625                 if (p->prio > rq->rt.highest_prio.curr)
2626                         resched_curr(rq);
2627 #else
2628                 /* For UP simply resched on drop of prio */
2629                 if (oldprio < p->prio)
2630                         resched_curr(rq);
2631 #endif /* CONFIG_SMP */
2632         } else {
2633                 /*
2634                  * This task is not running, but if it is
2635                  * greater than the current running task
2636                  * then reschedule.
2637                  */
2638                 if (p->prio < rq->curr->prio)
2639                         resched_curr(rq);
2640         }
2641 }
2642
2643 static void watchdog(struct rq *rq, struct task_struct *p)
2644 {
2645         unsigned long soft, hard;
2646
2647         /* max may change after cur was read, this will be fixed next tick */
2648         soft = task_rlimit(p, RLIMIT_RTTIME);
2649         hard = task_rlimit_max(p, RLIMIT_RTTIME);
2650
2651         if (soft != RLIM_INFINITY) {
2652                 unsigned long next;
2653
2654                 if (p->rt.watchdog_stamp != jiffies) {
2655                         p->rt.timeout++;
2656                         p->rt.watchdog_stamp = jiffies;
2657                 }
2658
2659                 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2660                 if (p->rt.timeout > next)
2661                         p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2662         }
2663 }
2664
2665 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2666 {
2667         struct sched_rt_entity *rt_se = &p->rt;
2668
2669         update_curr_rt(rq);
2670
2671         watchdog(rq, p);
2672
2673         /*
2674          * RR tasks need a special form of timeslice management.
2675          * FIFO tasks have no timeslices.
2676          */
2677         if (p->policy != SCHED_RR)
2678                 return;
2679
2680         if (--p->rt.time_slice)
2681                 return;
2682
2683         p->rt.time_slice = sched_rr_timeslice;
2684
2685         /*
2686          * Requeue to the end of queue if we (and all of our ancestors) are not
2687          * the only element on the queue
2688          */
2689         for_each_sched_rt_entity(rt_se) {
2690                 if (rt_se->run_list.prev != rt_se->run_list.next) {
2691                         requeue_task_rt(rq, p, 0);
2692                         resched_curr(rq);
2693                         return;
2694                 }
2695         }
2696 }
2697
2698 static void set_curr_task_rt(struct rq *rq)
2699 {
2700         struct task_struct *p = rq->curr;
2701
2702         p->se.exec_start = rq_clock_task(rq);
2703
2704         /* The running task is never eligible for pushing */
2705         dequeue_pushable_task(rq, p);
2706 }
2707
2708 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2709 {
2710         /*
2711          * Time slice is 0 for SCHED_FIFO tasks
2712          */
2713         if (task->policy == SCHED_RR)
2714                 return sched_rr_timeslice;
2715         else
2716                 return 0;
2717 }
2718
2719 const struct sched_class rt_sched_class = {
2720         .next                   = &fair_sched_class,
2721         .enqueue_task           = enqueue_task_rt,
2722         .dequeue_task           = dequeue_task_rt,
2723         .yield_task             = yield_task_rt,
2724
2725         .check_preempt_curr     = check_preempt_curr_rt,
2726
2727         .pick_next_task         = pick_next_task_rt,
2728         .put_prev_task          = put_prev_task_rt,
2729
2730 #ifdef CONFIG_SMP
2731         .select_task_rq         = select_task_rq_rt,
2732
2733         .set_cpus_allowed       = set_cpus_allowed_common,
2734         .rq_online              = rq_online_rt,
2735         .rq_offline             = rq_offline_rt,
2736         .task_woken             = task_woken_rt,
2737         .switched_from          = switched_from_rt,
2738 #endif
2739
2740         .set_curr_task          = set_curr_task_rt,
2741         .task_tick              = task_tick_rt,
2742
2743         .get_rr_interval        = get_rr_interval_rt,
2744
2745         .prio_changed           = prio_changed_rt,
2746         .switched_to            = switched_to_rt,
2747
2748         .update_curr            = update_curr_rt,
2749 #ifdef CONFIG_SCHED_HMP
2750         .inc_hmp_sched_stats    = inc_hmp_sched_stats_rt,
2751         .dec_hmp_sched_stats    = dec_hmp_sched_stats_rt,
2752         .fixup_hmp_sched_stats  = fixup_hmp_sched_stats_rt,
2753 #endif
2754 };
2755
2756 #ifdef CONFIG_SCHED_DEBUG
2757 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2758
2759 void print_rt_stats(struct seq_file *m, int cpu)
2760 {
2761         rt_rq_iter_t iter;
2762         struct rt_rq *rt_rq;
2763
2764         rcu_read_lock();
2765         for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2766                 print_rt_rq(m, cpu, rt_rq);
2767         rcu_read_unlock();
2768 }
2769 #endif /* CONFIG_SCHED_DEBUG */