kernel/sched/cpufreq_schedutil.c

   1 /*
   2  * CPUFreq governor based on scheduler-provided CPU utilization data.
   3  *
   4  * Copyright (C) 2016, Intel Corporation
   5  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License version 2 as
   9  * published by the Free Software Foundation.
  10  */
  11
  12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14 #include "sched.h"
  15
  16 #include <linux/sched/cpufreq.h>
  17 #include <trace/events/power.h>
  18
  19 struct sugov_tunables {	/* Sysfs-visible tunables; one instance may serve several policies via attr_set */
  20         struct gov_attr_set     attr_set;	/* kobject + list of policies using these tunables */
  21         unsigned int            up_rate_limit_us;	/* value behind the up_rate_limit_us attribute, in microseconds */
  22         unsigned int            down_rate_limit_us;	/* value behind the down_rate_limit_us attribute, in microseconds */
  23 };
  24
  25 struct sugov_policy {	/* Governor state attached to one cpufreq policy */
  26         struct cpufreq_policy   *policy;	/* policy this state belongs to */
  27
  28         struct sugov_tunables   *tunables;	/* tunables in effect (possibly the global set) */
  29         struct list_head        tunables_hook;	/* link in tunables->attr_set.policy_list */
  30
  31         raw_spinlock_t          update_lock;    /* For shared policies; also taken by the deferred-update path and sugov_work() */
  32         u64                     last_freq_update_time;	/* time of the last accepted frequency request */
  33         s64                     min_rate_limit_ns;	/* min(up_rate_delay_ns, down_rate_delay_ns) */
  34         s64                     up_rate_delay_ns;	/* minimum gap before a frequency increase */
  35         s64                     down_rate_delay_ns;	/* minimum gap before a frequency decrease */
  36         unsigned int            next_freq;	/* last frequency requested from the driver */
  37         unsigned int            cached_raw_freq;	/* raw (unresolved) frequency next_freq was derived from */
  38
  39         /* The next fields are only needed if fast switch cannot be used: */
  40         struct                  irq_work irq_work;	/* queues @work on @worker from sugov_irq_work() */
  41         struct                  kthread_work work;	/* runs sugov_work() */
  42         struct                  mutex work_lock;	/* held around __cpufreq_driver_target() in sugov_work() */
  43         struct                  kthread_worker worker;
  44         struct task_struct      *thread;	/* "sugov:%d" kthread servicing @worker */
  45         bool                    work_in_progress;	/* irq_work queued and not yet picked up by sugov_work() */
  46
  47         bool                    need_freq_update;	/* bypass the minimum rate limit; cleared by get_next_freq() */
  48 };
  49
  50 struct sugov_cpu {	/* Per-CPU governor state */
  51         struct update_util_data update_util;	/* hook handed to sugov_update_single()/sugov_update_shared() */
  52         struct sugov_policy     *sg_policy;	/* policy state this CPU belongs to */
  53         unsigned int            cpu;	/* CPU number used for cpu_rq() and capacity lookups */
  54
  55         bool                    iowait_boost_pending;	/* a boost was requested since the last sugov_iowait_apply() */
  56         unsigned int            iowait_boost;	/* current IO boost; 0 means no boost, capped at SCHED_CAPACITY_SCALE */
  57         u64                     last_update;	/* time of the last update callback for this CPU */
  58
  59         unsigned long           bw_dl;	/* cpu_bw_dl() snapshot taken in sugov_get_util() */
  60         unsigned long           min;	/* starting/floor value for iowait_boost; assigned outside the code shown here */
  61         unsigned long           max;	/* arch_scale_cpu_capacity() snapshot taken in sugov_get_util() */
  62
  63         /* The field below is for single-CPU policies only: */
  64 #ifdef CONFIG_NO_HZ_COMMON
  65         unsigned long           saved_idle_calls;	/* idle-call count seen by the previous sugov_cpu_is_busy() */
  66 #endif
  67 };
  68
  69 static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);	/* one sugov_cpu instance per CPU */
  70
  71 /************************ Governor internals ***********************/
  72
  73 static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
  74 {
  75         s64 delta_ns;
  76
  77         /*
  78          * Since cpufreq_update_util() is called with rq->lock held for
  79          * the @target_cpu, our per-CPU data is fully serialized.
  80          *
  81          * However, drivers cannot in general deal with cross-CPU
  82          * requests, so while get_next_freq() will work, our
  83          * sugov_update_commit() call may not for the fast switching platforms.
  84          *
  85          * Hence stop here for remote requests if they aren't supported
  86          * by the hardware, as calculating the frequency is pointless if
  87          * we cannot in fact act on it.
  88          *
  89          * For the slow switching platforms, the kthread is always scheduled on
  90          * the right set of CPUs and any CPU can find the next frequency and
  91          * schedule the kthread.
  92          */
  93         if (sg_policy->policy->fast_switch_enabled &&
  94             !cpufreq_this_cpu_can_update(sg_policy->policy))
  95                 return false;
  96
  97         if (unlikely(sg_policy->need_freq_update))
  98                 return true;
  99
 100         /* No need to recalculate next freq for min_rate_limit_us
 101          * at least. However we might still decide to further rate
 102          * limit once frequency change direction is decided, according
 103          * to the separate rate limits.
 104          */
 105
 106         delta_ns = time - sg_policy->last_freq_update_time;
 107         return delta_ns >= sg_policy->min_rate_limit_ns;
 108 }
 109
 110 static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
 111                                      unsigned int next_freq)
 112 {
 113         s64 delta_ns;
 114
 115         delta_ns = time - sg_policy->last_freq_update_time;
 116
 117         if (next_freq > sg_policy->next_freq &&
 118             delta_ns < sg_policy->up_rate_delay_ns)
 119                         return true;
 120
 121         if (next_freq < sg_policy->next_freq &&
 122             delta_ns < sg_policy->down_rate_delay_ns)
 123                         return true;
 124
 125         return false;
 126 }
 127
 128 static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
 129                                    unsigned int next_freq)
 130 {
 131         if (sg_policy->next_freq == next_freq)
 132                 return false;
 133
 134         if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
 135                 return false;
 136
 137         sg_policy->next_freq = next_freq;
 138         sg_policy->last_freq_update_time = time;
 139
 140         return true;
 141 }
 142
 143 static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
 144                               unsigned int next_freq)
 145 {
 146         struct cpufreq_policy *policy = sg_policy->policy;
 147
 148         if (!sugov_update_next_freq(sg_policy, time, next_freq))
 149                 return;
 150
 151         next_freq = cpufreq_driver_fast_switch(policy, next_freq);
 152         if (!next_freq)
 153                 return;
 154
 155         policy->cur = next_freq;
 156         trace_cpu_frequency(next_freq, smp_processor_id());
 157 }
 158
 159 static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
 160                                   unsigned int next_freq)
 161 {
 162         if (!sugov_update_next_freq(sg_policy, time, next_freq))
 163                 return;
 164
 165         if (!sg_policy->work_in_progress) {
 166                 sg_policy->work_in_progress = true;
 167                 irq_work_queue(&sg_policy->irq_work);
 168         }
 169 }
 170
 171 /**
 172  * get_next_freq - Compute a new frequency for a given cpufreq policy.
 173  * @sg_policy: schedutil policy object to compute the new frequency for.
 174  * @util: Current CPU utilization.
 175  * @max: CPU capacity.
 176  *
 177  * If the utilization is frequency-invariant, choose the new frequency to be
 178  * proportional to it, that is
 179  *
 180  * next_freq = C * max_freq * util / max
 181  *
 182  * Otherwise, approximate the would-be frequency-invariant utilization by
 183  * util_raw * (curr_freq / max_freq) which leads to
 184  *
 185  * next_freq = C * curr_freq * util_raw / max
 186  *
 187  * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 188  *
 189  * The lowest driver-supported frequency which is equal or greater than the raw
 190  * next_freq (as calculated above) is returned, subject to policy min/max and
 191  * cpufreq driver limitations.
 192  */
 193 static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 194                                   unsigned long util, unsigned long max)
 195 {
 196         struct cpufreq_policy *policy = sg_policy->policy;
 197         unsigned int freq = arch_scale_freq_invariant() ?
 198                                 policy->cpuinfo.max_freq : policy->cur;
 199
 200         freq = map_util_freq(util, freq, max);
 201
 202         if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
 203                 return sg_policy->next_freq;
 204
 205         sg_policy->need_freq_update = false;
 206         sg_policy->cached_raw_freq = freq;
 207         return cpufreq_driver_resolve_freq(policy, freq);
 208 }
 209
 210 /*
 211  * This function computes an effective utilization for the given CPU, to be
 212  * used for frequency selection given the linear relation: f = u * f_max.
 213  *
 214  * The scheduler tracks the following metrics:
 215  *
 216  *   cpu_util_{cfs,rt,dl,irq}()
 217  *   cpu_bw_dl()
 218  *
 219  * Where the cfs,rt and dl util numbers are tracked with the same metric and
 220  * synchronized windows and are thus directly comparable.
 221  *
 222  * The @util parameter passed to this function is assumed to be the aggregation
 223  * of RT and CFS util numbers. The cases of DL and IRQ are managed here.
 224  *
 225  * The cfs,rt,dl utilization are the running times measured with rq->clock_task
 226  * which excludes things like IRQ and steal-time. These latter are then accrued
 227  * in the irq utilization.
 228  *
 229  * The DL bandwidth number otoh is not a measured metric but a value computed
 230  * based on the task model parameters and gives the minimal utilization
 231  * required to meet deadlines.
 232  */
 233 unsigned long schedutil_freq_util(int cpu, unsigned long util,
 234                                   unsigned long max, enum schedutil_type type)
 235 {
 236         unsigned long dl_util, irq;
 237         struct rq *rq = cpu_rq(cpu);
 238
 239         if (sched_feat(SUGOV_RT_MAX_FREQ) && type == FREQUENCY_UTIL &&
 240                                                 rt_rq_is_runnable(&rq->rt))
 241                 return max;
 242
 243         /*
 244          * Early check to see if IRQ/steal time saturates the CPU, can be
 245          * because of inaccuracies in how we track these -- see
 246          * update_irq_load_avg().
 247          */
 248         irq = cpu_util_irq(rq);
 249         if (unlikely(irq >= max))
 250                 return max;
 251
 252         /*
 253          * The function is called with @util defined as the aggregation (the
 254          * sum) of RT and CFS signals, hence leaving the special case of DL
 255          * to be delt with. The exact way of doing things depend on the calling
 256          * context.
 257          */
 258         dl_util = cpu_util_dl(rq);
 259
 260         /*
 261          * For frequency selection we do not make cpu_util_dl() a permanent part
 262          * of this sum because we want to use cpu_bw_dl() later on, but we need
 263          * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
 264          * that we select f_max when there is no idle time.
 265          *
 266          * NOTE: numerical errors or stop class might cause us to not quite hit
 267          * saturation when we should -- something for later.
 268          */
 269         if (util + dl_util >= max)
 270                 return max;
 271
 272         /*
 273          * OTOH, for energy computation we need the estimated running time, so
 274          * include util_dl and ignore dl_bw.
 275          */
 276         if (type == ENERGY_UTIL)
 277                 util += dl_util;
 278
 279         /*
 280          * There is still idle time; further improve the number by using the
 281          * irq metric. Because IRQ/steal time is hidden from the task clock we
 282          * need to scale the task numbers:
 283          *
 284          *              1 - irq
 285          *   U' = irq + ------- * U
 286          *                max
 287          */
 288         util = scale_irq_capacity(util, irq, max);
 289         util += irq;
 290
 291         /*
 292          * Bandwidth required by DEADLINE must always be granted while, for
 293          * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
 294          * to gracefully reduce the frequency when no tasks show up for longer
 295          * periods of time.
 296          *
 297          * Ideally we would like to set bw_dl as min/guaranteed freq and util +
 298          * bw_dl as requested freq. However, cpufreq is not yet ready for such
 299          * an interface. So, we only do the latter for now.
 300          */
 301         if (type == FREQUENCY_UTIL)
 302                 util += cpu_bw_dl(rq);
 303
 304         return min(max, util);
 305 }
 306
 307 static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 308 {
 309         struct rq *rq = cpu_rq(sg_cpu->cpu);
 310         unsigned long util = boosted_cpu_util(sg_cpu->cpu, cpu_util_rt(rq));
 311         unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
 312
 313         sg_cpu->max = max;
 314         sg_cpu->bw_dl = cpu_bw_dl(rq);
 315
 316         return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
 317 }
 318
 319 /**
 320  * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 321  * @sg_cpu: the sugov data for the CPU to boost
 322  * @time: the update time from the caller
 323  * @set_iowait_boost: true if an IO boost has been requested
 324  *
 325  * The IO wait boost of a task is disabled after a tick since the last update
 326  * of a CPU. If a new IO wait boost is requested after more then a tick, then
 327  * we enable the boost starting from the minimum frequency, which improves
 328  * energy efficiency by ignoring sporadic wakeups from IO.
 329  */
 330 static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
 331                                bool set_iowait_boost)
 332 {
 333         s64 delta_ns = time - sg_cpu->last_update;
 334
 335         /* Reset boost only if a tick has elapsed since last request */
 336         if (delta_ns <= TICK_NSEC)
 337                 return false;
 338
 339         sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
 340         sg_cpu->iowait_boost_pending = set_iowait_boost;
 341
 342         return true;
 343 }
 344
 345 /**
 346  * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 347  * @sg_cpu: the sugov data for the CPU to boost
 348  * @time: the update time from the caller
 349  * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 350  *
 351  * Each time a task wakes up after an IO operation, the CPU utilization can be
 352  * boosted to a certain utilization which doubles at each "frequent and
 353  * successive" wakeup from IO, ranging from the utilization of the minimum
 354  * OPP to the utilization of the maximum OPP.
 355  * To keep doubling, an IO boost has to be requested at least once per tick,
 356  * otherwise we restart from the utilization of the minimum OPP.
 357  */
 358 static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 359                                unsigned int flags)
 360 {
 361         bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
 362
 363         /* Reset boost if the CPU appears to have been idle enough */
 364         if (sg_cpu->iowait_boost &&
 365             sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
 366                 return;
 367
 368         /* Boost only tasks waking up after IO */
 369         if (!set_iowait_boost)
 370                 return;
 371
 372         /* Ensure boost doubles only one time at each request */
 373         if (sg_cpu->iowait_boost_pending)
 374                 return;
 375         sg_cpu->iowait_boost_pending = true;
 376
 377         /* Double the boost at each request */
 378         if (sg_cpu->iowait_boost) {
 379                 sg_cpu->iowait_boost =
 380                         min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
 381                 return;
 382         }
 383
 384         /* First wakeup after IO: start with minimum boost */
 385         sg_cpu->iowait_boost = sg_cpu->min;
 386 }
 387
 388 /**
 389  * sugov_iowait_apply() - Apply the IO boost to a CPU.
 390  * @sg_cpu: the sugov data for the cpu to boost
 391  * @time: the update time from the caller
 392  * @util: the utilization to (eventually) boost
 393  * @max: the maximum value the utilization can be boosted to
 394  *
 395  * A CPU running a task which woken up after an IO operation can have its
 396  * utilization boosted to speed up the completion of those IO operations.
 397  * The IO boost value is increased each time a task wakes up from IO, in
 398  * sugov_iowait_apply(), and it's instead decreased by this function,
 399  * each time an increase has not been requested (!iowait_boost_pending).
 400  *
 401  * A CPU which also appears to have been idle for at least one tick has also
 402  * its IO boost utilization reset.
 403  *
 404  * This mechanism is designed to boost high frequently IO waiting tasks, while
 405  * being more conservative on tasks which does sporadic IO operations.
 406  */
 407 static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
 408                                         unsigned long util, unsigned long max)
 409 {
 410         unsigned long boost;
 411
 412         /* No boost currently required */
 413         if (!sg_cpu->iowait_boost)
 414                 return util;
 415
 416         /* Reset boost if the CPU appears to have been idle enough */
 417         if (sugov_iowait_reset(sg_cpu, time, false))
 418                 return util;
 419
 420         if (!sg_cpu->iowait_boost_pending) {
 421                 /*
 422                  * No boost pending; reduce the boost value.
 423                  */
 424                 sg_cpu->iowait_boost >>= 1;
 425                 if (sg_cpu->iowait_boost < sg_cpu->min) {
 426                         sg_cpu->iowait_boost = 0;
 427                         return util;
 428                 }
 429         }
 430
 431         sg_cpu->iowait_boost_pending = false;
 432
 433         /*
 434          * @util is already in capacity scale; convert iowait_boost
 435          * into the same scale so we can compare.
 436          */
 437         boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
 438         return max(boost, util);
 439 }
 440
 441 #ifdef CONFIG_NO_HZ_COMMON
 442 static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 443 {
 444         unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
 445         bool ret = idle_calls == sg_cpu->saved_idle_calls;
 446
 447         sg_cpu->saved_idle_calls = idle_calls;
 448         return ret;
 449 }
 450 #else
 451 static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
 452 #endif /* CONFIG_NO_HZ_COMMON */
 453
 454 /*
 455  * Make sugov_should_update_freq() ignore the rate limit when DL
 456  * has increased the utilization.
 457  */
 458 static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
 459 {
 460         if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
 461                 sg_policy->need_freq_update = true;
 462 }
 463
 464 static void sugov_update_single(struct update_util_data *hook, u64 time,
 465                                 unsigned int flags)
 466 {
 467         struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 468         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 469         unsigned long util, max;
 470         unsigned int next_f;
 471         bool busy;
 472
 473         sugov_iowait_boost(sg_cpu, time, flags);
 474         sg_cpu->last_update = time;
 475
 476         ignore_dl_rate_limit(sg_cpu, sg_policy);
 477
 478         if (!sugov_should_update_freq(sg_policy, time))
 479                 return;
 480
 481         busy = sugov_cpu_is_busy(sg_cpu);
 482
 483         util = sugov_get_util(sg_cpu);
 484         max = sg_cpu->max;
 485         util = sugov_iowait_apply(sg_cpu, time, util, max);
 486         next_f = get_next_freq(sg_policy, util, max);
 487         /*
 488          * Do not reduce the frequency if the CPU has not been idle
 489          * recently, as the reduction is likely to be premature then.
 490          */
 491         if (busy && next_f < sg_policy->next_freq) {
 492                 next_f = sg_policy->next_freq;
 493
 494                 /* Reset cached freq as next_freq has changed */
 495                 sg_policy->cached_raw_freq = 0;
 496         }
 497
 498         /*
 499          * This code runs under rq->lock for the target CPU, so it won't run
 500          * concurrently on two different CPUs for the same target and it is not
 501          * necessary to acquire the lock in the fast switch case.
 502          */
 503         if (sg_policy->policy->fast_switch_enabled) {
 504                 sugov_fast_switch(sg_policy, time, next_f);
 505         } else {
 506                 raw_spin_lock(&sg_policy->update_lock);
 507                 sugov_deferred_update(sg_policy, time, next_f);
 508                 raw_spin_unlock(&sg_policy->update_lock);
 509         }
 510 }
 511
 512 static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 513 {
 514         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 515         struct cpufreq_policy *policy = sg_policy->policy;
 516         unsigned long util = 0, max = 1;
 517         unsigned int j;
 518
 519         for_each_cpu(j, policy->cpus) {
 520                 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
 521                 unsigned long j_util, j_max;
 522
 523                 j_util = sugov_get_util(j_sg_cpu);
 524                 j_max = j_sg_cpu->max;
 525                 j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
 526
 527                 if (j_util * max > j_max * util) {
 528                         util = j_util;
 529                         max = j_max;
 530                 }
 531         }
 532
 533         return get_next_freq(sg_policy, util, max);
 534 }
 535
 536 static void
 537 sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 538 {
 539         struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 540         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 541         unsigned int next_f;
 542
 543         raw_spin_lock(&sg_policy->update_lock);
 544
 545         sugov_iowait_boost(sg_cpu, time, flags);
 546         sg_cpu->last_update = time;
 547
 548         ignore_dl_rate_limit(sg_cpu, sg_policy);
 549
 550         if (sugov_should_update_freq(sg_policy, time)) {
 551                 next_f = sugov_next_freq_shared(sg_cpu, time);
 552
 553                 if (sg_policy->policy->fast_switch_enabled)
 554                         sugov_fast_switch(sg_policy, time, next_f);
 555                 else
 556                         sugov_deferred_update(sg_policy, time, next_f);
 557         }
 558
 559         raw_spin_unlock(&sg_policy->update_lock);
 560 }
 561
 562 static void sugov_work(struct kthread_work *work)
 563 {
 564         struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
 565         unsigned int freq;
 566         unsigned long flags;
 567
 568         /*
 569          * Hold sg_policy->update_lock shortly to handle the case where:
 570          * incase sg_policy->next_freq is read here, and then updated by
 571          * sugov_deferred_update() just before work_in_progress is set to false
 572          * here, we may miss queueing the new update.
 573          *
 574          * Note: If a work was queued after the update_lock is released,
 575          * sugov_work() will just be called again by kthread_work code; and the
 576          * request will be proceed before the sugov thread sleeps.
 577          */
 578         raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
 579         freq = sg_policy->next_freq;
 580         sg_policy->work_in_progress = false;
 581         raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
 582
 583         mutex_lock(&sg_policy->work_lock);
 584         __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
 585         mutex_unlock(&sg_policy->work_lock);
 586 }
 587
 588 static void sugov_irq_work(struct irq_work *irq_work)
 589 {
 590         struct sugov_policy *sg_policy;
 591
 592         sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 593
 594         kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 595 }
 596
 597 /************************** sysfs interface ************************/
 598
 599 static struct sugov_tunables *global_tunables;
 600 static DEFINE_MUTEX(global_tunables_lock);
 601
 602 static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
 603 {
 604         return container_of(attr_set, struct sugov_tunables, attr_set);
 605 }
 606
 607 static DEFINE_MUTEX(min_rate_lock);
 608
 609 static void update_min_rate_limit_ns(struct sugov_policy *sg_policy)
 610 {
 611         mutex_lock(&min_rate_lock);
 612         sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
 613                                            sg_policy->down_rate_delay_ns);
 614         mutex_unlock(&min_rate_lock);
 615 }
 616
 617 static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 618 {
 619         struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 620
 621         return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
 622 }
 623
 624 static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 625 {
 626         struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 627
 628         return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
 629 }
 630
 631 static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
 632                                       const char *buf, size_t count)
 633 {
 634         struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 635         struct sugov_policy *sg_policy;
 636         unsigned int rate_limit_us;
 637
 638         if (kstrtouint(buf, 10, &rate_limit_us))
 639                 return -EINVAL;
 640
 641         tunables->up_rate_limit_us = rate_limit_us;
 642
 643         list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
 644                 sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
 645                 update_min_rate_limit_ns(sg_policy);
 646         }
 647
 648         return count;
 649 }
 650
 651 static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
 652                                         const char *buf, size_t count)
 653 {
 654         struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 655         struct sugov_policy *sg_policy;
 656         unsigned int rate_limit_us;
 657
 658         if (kstrtouint(buf, 10, &rate_limit_us))
 659                 return -EINVAL;
 660
 661         tunables->down_rate_limit_us = rate_limit_us;
 662
 663         list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
 664                 sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
 665                 update_min_rate_limit_ns(sg_policy);
 666         }
 667
 668         return count;
 669 }
 670
 671 static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
 672 static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
 673
 674 static struct attribute *sugov_attributes[] = {
 675         &up_rate_limit_us.attr,
 676         &down_rate_limit_us.attr,
 677         NULL
 678 };
 679
 680 static struct kobj_type sugov_tunables_ktype = {
 681         .default_attrs = sugov_attributes,
 682         .sysfs_ops = &governor_sysfs_ops,
 683 };
 684
 685 /********************** cpufreq governor interface *********************/
 686
 687 struct cpufreq_governor schedutil_gov;
 688
 689 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 690 {
 691         struct sugov_policy *sg_policy;
 692
 693         sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
 694         if (!sg_policy)
 695                 return NULL;
 696
 697         sg_policy->policy = policy;
 698         raw_spin_lock_init(&sg_policy->update_lock);
 699         return sg_policy;
 700 }
 701
 702 static void sugov_policy_free(struct sugov_policy *sg_policy)
 703 {
 704         kfree(sg_policy);
 705 }
 706
 707 static int sugov_kthread_create(struct sugov_policy *sg_policy)
 708 {
 709         struct task_struct *thread;
 710         struct sched_attr attr = {
 711                 .size           = sizeof(struct sched_attr),
 712                 .sched_policy   = SCHED_DEADLINE,
 713                 .sched_flags    = SCHED_FLAG_SUGOV,
 714                 .sched_nice     = 0,
 715                 .sched_priority = 0,
 716                 /*
 717                  * Fake (unused) bandwidth; workaround to "fix"
 718                  * priority inheritance.
 719                  */
 720                 .sched_runtime  =  1000000,
 721                 .sched_deadline = 10000000,
 722                 .sched_period   = 10000000,
 723         };
 724         struct cpufreq_policy *policy = sg_policy->policy;
 725         int ret;
 726
 727         /* kthread only required for slow path */
 728         if (policy->fast_switch_enabled)
 729                 return 0;
 730
 731         kthread_init_work(&sg_policy->work, sugov_work);
 732         kthread_init_worker(&sg_policy->worker);
 733         thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
 734                                 "sugov:%d",
 735                                 cpumask_first(policy->related_cpus));
 736         if (IS_ERR(thread)) {
 737                 pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
 738                 return PTR_ERR(thread);
 739         }
 740
 741         ret = sched_setattr_nocheck(thread, &attr);
 742         if (ret) {
 743                 kthread_stop(thread);
 744                 pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
 745                 return ret;
 746         }
 747
 748         sg_policy->thread = thread;
 749         kthread_bind_mask(thread, policy->related_cpus);
 750         init_irq_work(&sg_policy->irq_work, sugov_irq_work);
 751         mutex_init(&sg_policy->work_lock);
 752
 753         wake_up_process(thread);
 754
 755         return 0;
 756 }
 757
 758 static void sugov_kthread_stop(struct sugov_policy *sg_policy)
 759 {
 760         /* kthread only required for slow path */
 761         if (sg_policy->policy->fast_switch_enabled)
 762                 return;
 763
 764         kthread_flush_worker(&sg_policy->worker);
 765         kthread_stop(sg_policy->thread);
 766         mutex_destroy(&sg_policy->work_lock);
 767 }
 768
 769 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
 770 {
 771         struct sugov_tunables *tunables;
 772
 773         tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
 774         if (tunables) {
 775                 gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
 776                 if (!have_governor_per_policy())
 777                         global_tunables = tunables;
 778         }
 779         return tunables;
 780 }
 781
 782 static void sugov_tunables_free(struct sugov_tunables *tunables)
 783 {
 784         if (!have_governor_per_policy())
 785                 global_tunables = NULL;
 786
 787         kfree(tunables);
 788 }
 789
 790 static int sugov_init(struct cpufreq_policy *policy)
 791 {
 792         struct sugov_policy *sg_policy;
 793         struct sugov_tunables *tunables;
 794         int ret = 0;
 795
 796         /* State should be equivalent to EXIT */
 797         if (policy->governor_data)
 798                 return -EBUSY;
 799
 800         cpufreq_enable_fast_switch(policy);
 801
 802         sg_policy = sugov_policy_alloc(policy);
 803         if (!sg_policy) {
 804                 ret = -ENOMEM;
 805                 goto disable_fast_switch;
 806         }
 807
 808         ret = sugov_kthread_create(sg_policy);
 809         if (ret)
 810                 goto free_sg_policy;
 811
 812         mutex_lock(&global_tunables_lock);
 813
 814         if (global_tunables) {
 815                 if (WARN_ON(have_governor_per_policy())) {
 816                         ret = -EINVAL;
 817                         goto stop_kthread;
 818                 }
 819                 policy->governor_data = sg_policy;
 820                 sg_policy->tunables = global_tunables;
 821
 822                 gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
 823                 goto out;
 824         }
 825
 826         tunables = sugov_tunables_alloc(sg_policy);
 827         if (!tunables) {
 828                 ret = -ENOMEM;
 829                 goto stop_kthread;
 830         }
 831
 832         tunables->up_rate_limit_us = cpufreq_policy_transition_delay_us(policy);
 833         tunables->down_rate_limit_us = cpufreq_policy_transition_delay_us(policy);
 834
 835         policy->governor_data = sg_policy;
 836         sg_policy->tunables = tunables;
 837
 838         ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
 839                                    get_governor_parent_kobj(policy), "%s",
 840                                    schedutil_gov.name);
 841         if (ret)
 842                 goto fail;
 843
 844 out:
 845         mutex_unlock(&global_tunables_lock);
 846         return 0;
 847
 848 fail:
 849         kobject_put(&tunables->attr_set.kobj);
 850         policy->governor_data = NULL;
 851         sugov_tunables_free(tunables);
 852
 853 stop_kthread:
 854         sugov_kthread_stop(sg_policy);
 855         mutex_unlock(&global_tunables_lock);
 856
 857 free_sg_policy:
 858         sugov_policy_free(sg_policy);
 859
 860 disable_fast_switch:
 861         cpufreq_disable_fast_switch(policy);
 862
 863         pr_err("initialization failed (error %d)\n", ret);
 864         return ret;
 865 }
 866
 867 static void sugov_exit(struct cpufreq_policy *policy)
 868 {
 869         struct sugov_policy *sg_policy = policy->governor_data;
 870         struct sugov_tunables *tunables = sg_policy->tunables;
 871         unsigned int count;
 872
 873         mutex_lock(&global_tunables_lock);
 874
 875         count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
 876         policy->governor_data = NULL;
 877         if (!count)
 878                 sugov_tunables_free(tunables);
 879
 880         mutex_unlock(&global_tunables_lock);
 881
 882         sugov_kthread_stop(sg_policy);
 883         sugov_policy_free(sg_policy);
 884         cpufreq_disable_fast_switch(policy);
 885 }
 886
 887 static int sugov_start(struct cpufreq_policy *policy)
 888 {
 889         struct sugov_policy *sg_policy = policy->governor_data;
 890         unsigned int cpu;
 891
 892         sg_policy->up_rate_delay_ns =
 893                 sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
 894         sg_policy->down_rate_delay_ns =
 895                 sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
 896         update_min_rate_limit_ns(sg_policy);
 897         sg_policy->last_freq_update_time        = 0;
 898         sg_policy->next_freq                    = 0;
 899         sg_policy->work_in_progress             = false;
 900         sg_policy->need_freq_update             = false;
 901         sg_policy->cached_raw_freq              = 0;
 902
 903         for_each_cpu(cpu, policy->cpus) {
 904                 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 905
 906                 memset(sg_cpu, 0, sizeof(*sg_cpu));
 907                 sg_cpu->cpu                     = cpu;
 908                 sg_cpu->sg_policy               = sg_policy;
 909                 sg_cpu->min                     =
 910                         (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
 911                         policy->cpuinfo.max_freq;
 912         }
 913
 914         for_each_cpu(cpu, policy->cpus) {
 915                 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 916
 917                 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
 918                                              policy_is_shared(policy) ?
 919                                                         sugov_update_shared :
 920                                                         sugov_update_single);
 921         }
 922         return 0;
 923 }
 924
 925 static void sugov_stop(struct cpufreq_policy *policy)
 926 {
 927         struct sugov_policy *sg_policy = policy->governor_data;
 928         unsigned int cpu;
 929
 930         for_each_cpu(cpu, policy->cpus)
 931                 cpufreq_remove_update_util_hook(cpu);
 932
 933         synchronize_sched();
 934
 935         if (!policy->fast_switch_enabled) {
 936                 irq_work_sync(&sg_policy->irq_work);
 937                 kthread_cancel_work_sync(&sg_policy->work);
 938         }
 939 }
 940
 941 static void sugov_limits(struct cpufreq_policy *policy)
 942 {
 943         struct sugov_policy *sg_policy = policy->governor_data;
 944
 945         if (!policy->fast_switch_enabled) {
 946                 mutex_lock(&sg_policy->work_lock);
 947                 cpufreq_policy_apply_limits(policy);
 948                 mutex_unlock(&sg_policy->work_lock);
 949         }
 950
 951         sg_policy->need_freq_update = true;
 952 }
 953
 954 struct cpufreq_governor schedutil_gov = {
 955         .name                   = "schedutil",
 956         .owner                  = THIS_MODULE,
 957         .dynamic_switching      = true,
 958         .init                   = sugov_init,
 959         .exit                   = sugov_exit,
 960         .start                  = sugov_start,
 961         .stop                   = sugov_stop,
 962         .limits                 = sugov_limits,
 963 };
 964
 965 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
 966 struct cpufreq_governor *cpufreq_default_governor(void)
 967 {
 968         return &schedutil_gov;
 969 }
 970 #endif
 971
 972 static int __init sugov_register(void)
 973 {
 974         return cpufreq_register_governor(&schedutil_gov);
 975 }
 976 fs_initcall(sugov_register);
 977
 978 #ifdef CONFIG_ENERGY_MODEL
 979 extern bool sched_energy_update;
 980 extern struct mutex sched_energy_mutex;
 981
 982 static void rebuild_sd_workfn(struct work_struct *work)
 983 {
 984         mutex_lock(&sched_energy_mutex);
 985         sched_energy_update = true;
 986         rebuild_sched_domains();
 987         sched_energy_update = false;
 988         mutex_unlock(&sched_energy_mutex);
 989 }
 990 static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
 991
 992 /*
 993  * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 994  * on governor changes to make sure the scheduler knows about it.
 995  */
 996 void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
 997                                   struct cpufreq_governor *old_gov)
 998 {
 999         if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
1000                 /*
1001                  * When called from the cpufreq_register_driver() path, the
1002                  * cpu_hotplug_lock is already held, so use a work item to
1003                  * avoid nested locking in rebuild_sched_domains().
1004                  */
1005                 schedule_work(&rebuild_sd_work);
1006         }
1007
1008 }
1009 #endif