
rcu: Get rid of synchronize_sched_expedited()'s polling loop
author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>
           Wed, 24 Jun 2015 21:20:08 +0000 (14:20 -0700)
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>
           Fri, 17 Jul 2015 21:58:48 +0000 (14:58 -0700)
This commit gets rid of synchronize_sched_expedited()'s mutex_trylock()
polling loop in favor of a funnel-locking scheme based on the rcu_node
tree.  The work-done check is performed at each level of the tree,
allowing high-contention situations to be resolved quickly with
reasonable levels of mutex contention.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_trace.c

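The funnel lock is compact enough to model in user space. Below is a
minimal pthread sketch of the idea, not kernel code: the node_t type,
exp_seq counter, two-level tree, and thread counts are illustrative
assumptions, and CPU-hotplug exclusion, memory barriers, and the
statistics counters are omitted.

/*
 * Minimal pthread model of the funnel-locking scheme.  Each thread
 * walks from its leaf toward the root, checking at every level
 * whether another thread's expedited "grace period" already covers
 * its snapshot, and holding at most one funnel mutex at a time.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NLEAVES  4
#define NTHREADS 8

typedef struct node {
	pthread_mutex_t exp_funnel_mutex;
	struct node *parent;		/* NULL at the root */
} node_t;

static node_t root = { PTHREAD_MUTEX_INITIALIZER, NULL };
static node_t leaves[NLEAVES];
static atomic_ulong exp_seq;	/* even: idle, odd: GP in progress */

/*
 * Work-done check modeled on sync_sched_exp_wd(): if the counter has
 * already passed snapshot s, drop any held funnel mutex and report
 * that someone else did our work.  (The kernel compares with
 * ULONG_CMP_GE() so that counter wrap is handled correctly.)
 */
static bool exp_work_done(node_t *held, unsigned long s)
{
	if (atomic_load(&exp_seq) >= s) {
		if (held)
			pthread_mutex_unlock(&held->exp_funnel_mutex);
		return true;
	}
	return false;
}

static void expedited(int cpu)
{
	/* Snapshot: first even count after a full GP beyond the current one. */
	unsigned long s = (atomic_load(&exp_seq) + 3) & ~1UL;
	node_t *rnp1 = NULL;

	/* Funnel up the tree, hand over hand. */
	for (node_t *rnp0 = &leaves[cpu % NLEAVES]; rnp0; rnp0 = rnp0->parent) {
		if (exp_work_done(rnp1, s))
			return;		/* someone else did our work */
		pthread_mutex_lock(&rnp0->exp_funnel_mutex);
		if (rnp1)
			pthread_mutex_unlock(&rnp1->exp_funnel_mutex);
		rnp1 = rnp0;
	}
	if (exp_work_done(rnp1, s))	/* now holding only the root mutex */
		return;
	atomic_fetch_add(&exp_seq, 1);	/* GP starts: count goes odd */
	/* ... the real code forces quiescent states on all CPUs here ... */
	atomic_fetch_add(&exp_seq, 1);	/* GP ends: count goes even */
	pthread_mutex_unlock(&rnp1->exp_funnel_mutex);
}

static void *worker(void *arg)
{
	expedited((int)(long)arg);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];

	for (int i = 0; i < NLEAVES; i++) {
		pthread_mutex_init(&leaves[i].exp_funnel_mutex, NULL);
		leaves[i].parent = &root;
	}
	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	printf("exp_seq=%lu (always even at rest)\n", atomic_load(&exp_seq));
	return 0;
}

Build with "cc -pthread": however many threads race, exp_seq ends even,
and only threads whose snapshot is not already covered reach the root
and run a grace period of their own.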
kernel/rcu/tree.c
index 3c182fd..b310b40 100644
@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");
 
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
 
 /*
  * In order to export the rcu_state name to the tracing tools, it
@@ -103,7 +104,6 @@ struct rcu_state sname##_state = { \
        .orphan_nxttail = &sname##_state.orphan_nxtlist, \
        .orphan_donetail = &sname##_state.orphan_donelist, \
        .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
-       .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
        .name = RCU_STATE_NAME(sname), \
        .abbr = sabbr, \
 }
@@ -3272,6 +3272,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
        return 0;
 }
 
+/* Common code for synchronize_sched_expedited() work-done checking. */
+static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp,
+                             atomic_long_t *stat, unsigned long s)
+{
+       if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
+               if (rnp)
+                       mutex_unlock(&rnp->exp_funnel_mutex);
+               /* Ensure test happens before caller kfree(). */
+               smp_mb__before_atomic(); /* ^^^ */
+               atomic_long_inc(stat);
+               put_online_cpus();
+               return true;
+       }
+       return false;
+}
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3286,15 +3302,15 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * This implementation can be thought of as an application of sequence
  * locking to expedited grace periods, but using the sequence counter to
  * determine when someone else has already done the work instead of for
- * retrying readers.  We do a mutex_trylock() polling loop, but if we fail
- * too many times in a row, we fall back to synchronize_sched().
+ * retrying readers.
  */
 void synchronize_sched_expedited(void)
 {
        int cpu;
        long s;
-       int trycount = 0;
        struct rcu_state *rsp = &rcu_sched_state;
+       struct rcu_node *rnp0;
+       struct rcu_node *rnp1 = NULL;
 
        /* Take a snapshot of the sequence number.  */
        smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -3310,60 +3326,25 @@ void synchronize_sched_expedited(void)
        WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
        /*
-        * Each pass through the following loop attempts to acquire
-        * ->expedited_mutex, checking for others doing our work each time.
+        * Each pass through the following loop works its way up
+        * the rcu_node tree, returning early if others have done the
+        * work, or otherwise falling through while holding the root
+        * rcu_node's ->exp_funnel_mutex.  The mapping from CPU to
+        * rcu_node structure can be inexact, as it is just promoting
+        * locality and is not strictly needed for correctness.
         */
-       while (!mutex_trylock(&rsp->expedited_mutex)) {
-               put_online_cpus();
-               atomic_long_inc(&rsp->expedited_tryfail);
-
-               /* Check to see if someone else did our work for us. */
-               if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
-                       /* ensure test happens before caller kfree */
-                       smp_mb__before_atomic(); /* ^^^ */
-                       atomic_long_inc(&rsp->expedited_workdone1);
-                       return;
-               }
-
-               /* No joy, try again later.  Or just synchronize_sched(). */
-               if (trycount++ < 10) {
-                       udelay(trycount * num_online_cpus());
-               } else {
-                       wait_rcu_gp(call_rcu_sched);
-                       atomic_long_inc(&rsp->expedited_normal);
+       rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+       for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+               if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
                        return;
-               }
-
-               /* Recheck to see if someone else did our work for us. */
-               if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
-                       /* ensure test happens before caller kfree */
-                       smp_mb__before_atomic(); /* ^^^ */
-                       atomic_long_inc(&rsp->expedited_workdone2);
-                       return;
-               }
-
-               /*
-                * Refetching sync_sched_expedited_started allows later
-                * callers to piggyback on our grace period.  We retry
-                * after they started, so our grace period works for them,
-                * and they started after our first try, so their grace
-                * period works for us.
-                */
-               if (!try_get_online_cpus()) {
-                       /* CPU hotplug operation in flight, use normal GP. */
-                       wait_rcu_gp(call_rcu_sched);
-                       atomic_long_inc(&rsp->expedited_normal);
-                       return;
-               }
+               mutex_lock(&rnp0->exp_funnel_mutex);
+               if (rnp1)
+                       mutex_unlock(&rnp1->exp_funnel_mutex);
+               rnp1 = rnp0;
        }
-
-       /* Recheck yet again to see if someone else did our work for us. */
-       if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
-               rsp->expedited_workdone3++;
-               mutex_unlock(&rsp->expedited_mutex);
-               smp_mb(); /* ensure test happens before caller kfree */
+       rnp0 = rnp1;  /* rcu_get_root(rsp), AKA root rcu_node structure. */
+       if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
                return;
-       }
 
        WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
        smp_mb(); /* Ensure expedited GP seen after counter increment. */
@@ -3383,7 +3364,7 @@ void synchronize_sched_expedited(void)
        smp_mb(); /* Ensure expedited GP seen before counter increment. */
        WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
        WARN_ON_ONCE(rsp->expedited_sequence & 0x1);
-       mutex_unlock(&rsp->expedited_mutex);
+       mutex_unlock(&rnp0->exp_funnel_mutex);
        smp_mb(); /* ensure subsequent action seen after grace period. */
 
        put_online_cpus();
@@ -3940,6 +3921,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 {
        static const char * const buf[] = RCU_NODE_NAME_INIT;
        static const char * const fqs[] = RCU_FQS_NAME_INIT;
+       static const char * const exp[] = RCU_EXP_NAME_INIT;
        static u8 fl_mask = 0x1;
 
        int levelcnt[RCU_NUM_LVLS];             /* # nodes in each level. */
@@ -3998,6 +3980,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
                        rnp->level = i;
                        INIT_LIST_HEAD(&rnp->blkd_tasks);
                        rcu_init_one_nocb(rnp);
+                       mutex_init(&rnp->exp_funnel_mutex);
+                       lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+                                                  &rcu_exp_class[i], exp[i]);
                }
        }
 
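All of the work-done checks compare against the snapshot with
ULONG_CMP_GE() rather than a plain >=. A small standalone demonstration
of why, assuming the usual rcupdate.h definition of the macro
(reproduced here rather than included):

/*
 * Wraparound-safe sequence comparison.  A plain >= misorders a
 * snapshot taken just before expedited_sequence wraps to zero.
 */
#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long s = ULONG_MAX - 1;	/* snapshot taken just before wrap */
	unsigned long seq = 2;			/* counter has since wrapped past s */

	printf("plain >=     : %d\n", seq >= s);		/* 0: wrong */
	printf("ULONG_CMP_GE : %d\n", ULONG_CMP_GE(seq, s));	/* 1: right */
	return 0;
}

The unsigned subtraction (a) - (b) wraps to a small value whenever a is
"ahead" of b by less than half the counter space, so the comparison
keeps working across the wrap.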
kernel/rcu/tree.h
index 6a2b741..2ef036b 100644
@@ -68,6 +68,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0" }
 #elif NR_CPUS <= RCU_FANOUT_2
 #  define RCU_NUM_LVLS       2
 #  define NUM_RCU_LVL_0              1
@@ -76,6 +77,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1" }
 #elif NR_CPUS <= RCU_FANOUT_3
 #  define RCU_NUM_LVLS       3
 #  define NUM_RCU_LVL_0              1
@@ -85,6 +87,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
 #elif NR_CPUS <= RCU_FANOUT_4
 #  define RCU_NUM_LVLS       4
 #  define NUM_RCU_LVL_0              1
@@ -95,6 +98,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
 #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -237,6 +241,8 @@ struct rcu_node {
        int need_future_gp[2];
                                /* Counts of upcoming no-CB GP requests. */
        raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+       struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -480,12 +486,10 @@ struct rcu_state {
                                                /*  _rcu_barrier(). */
        /* End of fields guarded by barrier_mutex. */
 
-       struct mutex  expedited_mutex;          /* Serializes expediting. */
        unsigned long expedited_sequence;       /* Take a ticket. */
        atomic_long_t expedited_tryfail;        /* # acquisition failures. */
        atomic_long_t expedited_workdone1;      /* # done by others #1. */
        atomic_long_t expedited_workdone2;      /* # done by others #2. */
-       unsigned long expedited_workdone3;      /* # done by others #3. */
        atomic_long_t expedited_normal;         /* # fallbacks to normal. */
 
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
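The new RCU_EXP_NAME_INIT arrays grow one entry per tree level,
following the NR_CPUS ladder above. A small sketch of how that ladder
translates into a tree shape; the FANOUT and NCPUS values below are
made up, and the separate RCU_FANOUT_LEAF tuning of the leaf level is
ignored for simplicity:

/*
 * Illustrative computation of the rcu_node tree shape: how many
 * levels the NR_CPUS ladder selects, and how many nodes sit at each
 * level (NUM_RCU_LVL_0 == 1, i.e. a single root).
 */
#include <stdio.h>

#define FANOUT 16	/* stands in for RCU_FANOUT */
#define NCPUS  300	/* stands in for NR_CPUS */

int main(void)
{
	unsigned long cap = FANOUT;	/* CPUs coverable by one level */
	int levels = 1;

	while (cap < NCPUS) {		/* mirrors the RCU_FANOUT_n #elif ladder */
		cap *= FANOUT;
		levels++;
	}
	printf("%d CPUs, fanout %d -> %d level(s)\n", NCPUS, FANOUT, levels);

	for (int i = 0; i < levels; i++) {
		unsigned long span = 1;	/* CPUs covered per node at level i */

		for (int j = 0; j < levels - i; j++)
			span *= FANOUT;
		printf("level %d: %lu node(s), exp class \"rcu_node_exp_%d\"\n",
		       i, (NCPUS + span - 1) / span, i);
	}
	return 0;
}

For 300 CPUs at fanout 16 this prints a three-level tree with 1, 2, and
19 nodes, each level getting its own lockdep class name from
RCU_EXP_NAME_INIT so that the child-then-parent funnel acquisitions do
not trip lockdep's single-class nesting checks.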
kernel/rcu/tree_trace.c
index a1ab3a5..d2aab8d 100644
@@ -185,12 +185,11 @@ static int show_rcuexp(struct seq_file *m, void *v)
 {
        struct rcu_state *rsp = (struct rcu_state *)m->private;
 
-       seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n",
+       seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
                   rsp->expedited_sequence,
                   atomic_long_read(&rsp->expedited_tryfail),
                   atomic_long_read(&rsp->expedited_workdone1),
                   atomic_long_read(&rsp->expedited_workdone2),
-                  rsp->expedited_workdone3,
                   atomic_long_read(&rsp->expedited_normal),
                   rsp->expedited_sequence / 2);
        return 0;
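With expedited_workdone3 gone, the show_rcuexp() line loses its wd3
field. A user-space reproduction of the new format, with invented
counter values purely to show the shape of the output:

/*
 * Shape of the new show_rcuexp() line.  All numbers are made up;
 * only the format string is taken from the patch above.
 */
#include <stdio.h>

int main(void)
{
	unsigned long seq = 8, tryfail = 0, wd1 = 1, wd2 = 3, normal = 0;

	/*
	 * sc = seq / 2: each expedited GP bumps the sequence twice
	 * (odd at start, even at end), so half the raw count is the
	 * number of completed expedited grace periods.
	 */
	printf("t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
	       seq, tryfail, wd1, wd2, normal, seq / 2);
	return 0;
}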