sched/numa: Evaluate move once per node

author Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Wed, 20 Jun 2018 17:02:43 +0000 (22:32 +0530)

committer Ingo Molnar <mingo@kernel.org>

Wed, 25 Jul 2018 09:41:06 +0000 (11:41 +0200)
author Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Wed, 20 Jun 2018 17:02:43 +0000 (22:32 +0530)
committer Ingo Molnar <mingo@kernel.org>
Wed, 25 Jul 2018 09:41:06 +0000 (11:41 +0200)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 14c3fdd..b10e066 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1580,9 +1580,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
   * be exchanged with the source task
   */
  static void task_numa_compare(struct task_numa_env *env,
-                             long taskimp, long groupimp)
+                             long taskimp, long groupimp, bool maymove)
  {
-       struct rq *src_rq = cpu_rq(env->src_cpu);
         struct rq *dst_rq = cpu_rq(env->dst_cpu);
         struct task_struct *cur;
         long src_load, dst_load;
@@ -1603,97 +1602,73 @@ static void task_numa_compare(struct task_numa_env *env,
         if (cur == env->p)
                 goto unlock;
  
+       if (!cur) {
+               if (maymove || imp > env->best_imp)
+                       goto assign;
+               else
+                       goto unlock;
+       }
+
         /*
          * "imp" is the fault differential for the source task between the
          * source and destination node. Calculate the total differential for
          * the source task and potential destination task. The more negative
-        * the value is, the more rmeote accesses that would be expected to
+        * the value is, the more remote accesses that would be expected to
          * be incurred if the tasks were swapped.
          */
-       if (cur) {
-               /* Skip this swap candidate if cannot move to the source CPU: */
-               if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
-                       goto unlock;
+       /* Skip this swap candidate if it cannot move to the source CPU */
+       if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+               goto unlock;
  
+       /*
+        * If dst and source tasks are in the same NUMA group, or not
+        * in any group then look only at task weights.
+        */
+       if (cur->numa_group == env->p->numa_group) {
+               imp = taskimp + task_weight(cur, env->src_nid, dist) -
+                     task_weight(cur, env->dst_nid, dist);
                 /*
-                * If dst and source tasks are in the same NUMA group, or not
-                * in any group then look only at task weights.
+                * Add some hysteresis to prevent swapping the
+                * tasks within a group over tiny differences.
                  */
-               if (cur->numa_group == env->p->numa_group) {
-                       imp = taskimp + task_weight(cur, env->src_nid, dist) -
-                             task_weight(cur, env->dst_nid, dist);
-                       /*
-                        * Add some hysteresis to prevent swapping the
-                        * tasks within a group over tiny differences.
-                        */
-                       if (cur->numa_group)
-                               imp -= imp/16;
-               } else {
-                       /*
-                        * Compare the group weights. If a task is all by
-                        * itself (not part of a group), use the task weight
-                        * instead.
-                        */
-                       if (cur->numa_group)
-                               imp += group_weight(cur, env->src_nid, dist) -
-                                      group_weight(cur, env->dst_nid, dist);
-                       else
-                               imp += task_weight(cur, env->src_nid, dist) -
-                                      task_weight(cur, env->dst_nid, dist);
-               }
+               if (cur->numa_group)
+                       imp -= imp / 16;
+       } else {
+               /*
+                * Compare the group weights. If a task is all by itself
+                * (not part of a group), use the task weight instead.
+                */
+               if (cur->numa_group && env->p->numa_group)
+                       imp += group_weight(cur, env->src_nid, dist) -
+                              group_weight(cur, env->dst_nid, dist);
+               else
+                       imp += task_weight(cur, env->src_nid, dist) -
+                              task_weight(cur, env->dst_nid, dist);
         }
  
-       if (imp <= env->best_imp && moveimp <= env->best_imp)
+       if (imp <= env->best_imp)
                 goto unlock;
  
-       if (!cur) {
-               /* Is there capacity at our destination? */
-               if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
-                   !env->dst_stats.has_free_capacity)
-                       goto unlock;
-
-               goto balance;
-       }
-
-       /* Balance doesn't matter much if we're running a task per CPU: */
-       if (imp > env->best_imp && src_rq->nr_running == 1 &&
-                       dst_rq->nr_running == 1)
+       if (maymove && moveimp > imp && moveimp > env->best_imp) {
+               imp = moveimp - 1;
+               cur = NULL;
                 goto assign;
+       }
  
         /*
          * In the overloaded case, try and keep the load balanced.
          */
-balance:
-       load = task_h_load(env->p);
+       load = task_h_load(env->p) - task_h_load(cur);
+       if (!load)
+               goto assign;
+
         dst_load = env->dst_stats.load + load;
         src_load = env->src_stats.load - load;
  
-       if (moveimp > imp && moveimp > env->best_imp) {
-               /*
-                * If the improvement from just moving env->p direction is
-                * better than swapping tasks around, check if a move is
-                * possible. Store a slightly smaller score than moveimp,
-                * so an actually idle CPU will win.
-                */
-               if (!load_too_imbalanced(src_load, dst_load, env)) {
-                       imp = moveimp - 1;
-                       cur = NULL;
-                       goto assign;
-               }
-       }
-
-       if (imp <= env->best_imp)
-               goto unlock;
-
-       if (cur) {
-               load = task_h_load(cur);
-               dst_load -= load;
-               src_load += load;
-       }
-
         if (load_too_imbalanced(src_load, dst_load, env))
                 goto unlock;
  
+assign:
         /*
          * One idle CPU per node is evaluated for a task numa move.
          * Call select_idle_sibling to maybe find a better one.
@@ -1709,7 +1684,6 @@ balance:
                 local_irq_enable();
         }
  
-assign:
         task_numa_assign(env, cur, imp);
  unlock:
         rcu_read_unlock();
@@ -1718,15 +1692,27 @@ unlock:
  static void task_numa_find_cpu(struct task_numa_env *env,
                                 long taskimp, long groupimp)
  {
+       long src_load, dst_load, load;
+       bool maymove = false;
         int cpu;
  
+       load = task_h_load(env->p);
+       dst_load = env->dst_stats.load + load;
+       src_load = env->src_stats.load - load;
+
+       /*
+        * Check once per node whether moving env->p alone would leave the
+        * load too imbalanced; task_numa_compare() reuses this for each CPU.
+        */
+       maymove = !load_too_imbalanced(src_load, dst_load, env);
+
         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
                 /* Skip this CPU if the source task cannot migrate */
                 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
                         continue;
  
                 env->dst_cpu = cpu;
-               task_numa_compare(env, taskimp, groupimp);
+               task_numa_compare(env, taskimp, groupimp, maymove);
         }
  }
author	Srikar Dronamraju <srikar@linux.vnet.ibm.com>
	Wed, 20 Jun 2018 17:02:43 +0000 (22:32 +0530)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 25 Jul 2018 09:41:06 +0000 (11:41 +0200)