Merge 4.4.187 into android-4.4

[sagit-ice-cold/kernel_xiaomi_msm8998.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index ccf212f..134e2bd 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1878,6 +1878,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
         if (p->last_task_numa_placement) {
                 delta = runtime - p->last_sum_exec_runtime;
                 *period = now - p->last_task_numa_placement;
+
+               /* Avoid time going backwards, prevent potential divide error: */
+               if (unlikely((s64)*period < 0))
+                       *period = 0;
         } else {
                 delta = p->se.avg.load_sum / p->se.load.weight;
                 *period = LOAD_AVG_MAX;
@@ -2206,13 +2210,23 @@ no_join:
         return;
  }
  
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA statistics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
  {
         struct numa_group *grp = p->numa_group;
-       void *numa_faults = p->numa_faults;
+       unsigned long *numa_faults = p->numa_faults;
         unsigned long flags;
         int i;
  
+       if (!numa_faults)
+               return;
+
         if (grp) {
                 spin_lock_irqsave(&grp->lock, flags);
                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2225,8 +2239,14 @@ void task_numa_free(struct task_struct *p)
                 put_numa_group(grp);
         }
  
-       p->numa_faults = NULL;
-       kfree(numa_faults);
+       if (final) {
+               p->numa_faults = NULL;
+               kfree(numa_faults);
+       } else {
+               p->total_numa_faults = 0;
+               for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+                       numa_faults[i] = 0;
+       }
  }
  
  /*
@@ -4104,9 +4124,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
  
         /*
          * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us
+        * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+        * not running, add to the tail so that later runqueues don't get starved.
          */
-       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (cfs_b->distribute_running)
+               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       else
+               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
  
         /*
          * If we're the first throttled task, make sure the bandwidth
@@ -4249,14 +4273,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
          * in us over-using our runtime if it is all used during this loop, but
          * only by limited amounts in that extreme case.
          */
-       while (throttled && cfs_b->runtime > 0) {
+       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                 runtime = cfs_b->runtime;
+               cfs_b->distribute_running = 1;
                 raw_spin_unlock(&cfs_b->lock);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                  runtime_expires);
                 raw_spin_lock(&cfs_b->lock);
  
+               cfs_b->distribute_running = 0;
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  
                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4367,6 +4393,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->distribute_running) {
+               raw_spin_unlock(&cfs_b->lock);
+               return;
+       }
+
         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                 raw_spin_unlock(&cfs_b->lock);
                 return;
@@ -4376,6 +4407,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 runtime = cfs_b->runtime;
  
         expires = cfs_b->runtime_expires;
+       if (runtime)
+               cfs_b->distribute_running = 1;
+
         raw_spin_unlock(&cfs_b->lock);
  
         if (!runtime)
@@ -4386,6 +4420,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
+       cfs_b->distribute_running = 0;
         raw_spin_unlock(&cfs_b->lock);
  }
  
@@ -4463,12 +4498,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
         return HRTIMER_NORESTART;
  }
  
+extern const u64 max_cfs_quota_period;
+
  static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
  {
         struct cfs_bandwidth *cfs_b =
                 container_of(timer, struct cfs_bandwidth, period_timer);
         int overrun;
         int idle = 0;
+       int count = 0;
  
         raw_spin_lock(&cfs_b->lock);
         for (;;) {
@@ -4476,6 +4514,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                 if (!overrun)
                         break;
  
+               if (++count > 3) {
+                       u64 new, old = ktime_to_ns(cfs_b->period);
+
+                       new = (old * 147) / 128; /* ~115% */
+                       new = min(new, max_cfs_quota_period);
+
+                       cfs_b->period = ns_to_ktime(new);
+
+                       /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+                       cfs_b->quota *= new;
+                       cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+                       pr_warn_ratelimited(
+        "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+                               smp_processor_id(),
+                               div_u64(new, NSEC_PER_USEC),
+                                div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+                       /* reset count so we don't come right back in here */
+                       count = 0;
+               }
+
                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
         }
         if (idle)
@@ -4497,6 +4557,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         cfs_b->period_timer.function = sched_cfs_period_timer;
         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         cfs_b->slack_timer.function = sched_cfs_slack_timer;
+       cfs_b->distribute_running = 0;
  }
  
  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -7767,10 +7828,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
         if (cfs_rq->last_h_load_update == now)
                 return;
  
-       cfs_rq->h_load_next = NULL;
+       WRITE_ONCE(cfs_rq->h_load_next, NULL);
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
-               cfs_rq->h_load_next = se;
+               WRITE_ONCE(cfs_rq->h_load_next, se);
                 if (cfs_rq->last_h_load_update == now)
                         break;
         }
@@ -7780,7 +7841,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
                 cfs_rq->last_h_load_update = now;
         }
  
-       while ((se = cfs_rq->h_load_next) != NULL) {
+       while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
                 load = cfs_rq->h_load;
                 load = div64_ul(load * se->avg.load_avg,
                         cfs_rq_load_avg(cfs_rq) + 1);
@@ -10265,11 +10326,8 @@ void free_fair_sched_group(struct task_group *tg)
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
-               if (tg->se) {
-                       if (tg->se[i])
-                               remove_entity_load_avg(tg->se[i]);
+               if (tg->se)
                         kfree(tg->se[i]);
-               }
         }
  
         kfree(tg->cfs_rq);
@@ -10324,21 +10382,29 @@ err:
         return 0;
  }
  
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
  {
-       struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
+       struct rq *rq;
+       int cpu;
  
-       /*
-       * Only empty task groups can be destroyed; so we can speculatively
-       * check on_list without danger of it being re-added.
-       */
-       if (!tg->cfs_rq[cpu]->on_list)
-               return;
+       for_each_possible_cpu(cpu) {
+               if (tg->se[cpu])
+                       remove_entity_load_avg(tg->se[cpu]);
  
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+               /*
+                * Only empty task groups can be destroyed; so we can speculatively
+                * check on_list without danger of it being re-added.
+                */
+               if (!tg->cfs_rq[cpu]->on_list)
+                       continue;
+
+               rq = cpu_rq(cpu);
+
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
  }
  
  void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -10422,7 +10488,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
  }
  
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
  
  #endif /* CONFIG_FAIR_GROUP_SCHED */