kernel/locking/percpu-rwsem.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #include <linux/atomic.h>
   3 #include <linux/percpu.h>
   4 #include <linux/wait.h>
   5 #include <linux/lockdep.h>
   6 #include <linux/percpu-rwsem.h>
   7 #include <linux/rcupdate.h>
   8 #include <linux/sched.h>
   9 #include <linux/sched/task.h>
  10 #include <linux/sched/debug.h>
  11 #include <linux/errno.h>
  12
  /*
   * __percpu_init_rwsem - prepare a percpu_rw_semaphore for use.
   * @sem:  semaphore to initialise
   * @name: lock name, only consumed by lockdep_init_map() below
   * @key:  lock class key, only consumed by lockdep_init_map() below
   *
   * Allocates the per-CPU reader counter and then initialises the rcu_sync
   * state, the writer's rcuwait, the waiter queue and the block flag.
   *
   * Return: 0 on success, or -ENOMEM when the per-CPU counter cannot be
   * allocated; in that case none of the other fields has been initialised.
   */
  13 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
  14                         const char *name, struct lock_class_key *key)
  15 {
  16         sem->read_count = alloc_percpu(int);
  17         if (unlikely(!sem->read_count))
  18                 return -ENOMEM;
  19
  20         rcu_sync_init(&sem->rss);
  21         rcuwait_init(&sem->writer);
  22         init_waitqueue_head(&sem->waiters);
             /* block == 0 is the state in which the reader trylock succeeds. */
  23         atomic_set(&sem->block, 0);
  24 #ifdef CONFIG_DEBUG_LOCK_ALLOC
             /* Lock-debugging setup; @name and @key are not used otherwise. */
  25         debug_check_no_locks_freed((void *)sem, sizeof(*sem));
  26         lockdep_init_map(&sem->dep_map, name, key, 0);
  27 #endif
  28         return 0;
  29 }
  30 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
  31
  /*
   * percpu_free_rwsem - release what __percpu_init_rwsem() set up.
   * @sem: semaphore to tear down
   *
   * Destroys the rcu_sync state and frees the per-CPU reader counter. A
   * semaphore whose read_count is NULL is left untouched, which also makes a
   * repeated call a no-op because read_count is reset to NULL at the end.
   */
  32 void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
  33 {
  34         /*
  35          * XXX: temporary kludge. The error path in alloc_super()
  36          * assumes that percpu_free_rwsem() is safe after kzalloc().
  37          */
  38         if (!sem->read_count)
  39                 return;
  40
  41         rcu_sync_dtor(&sem->rss);
  42         free_percpu(sem->read_count);
  43         sem->read_count = NULL; /* catch use after free bugs */
  44 }
  45 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
  46
  /*
   * __percpu_down_read_trylock - one non-sleeping attempt to take @sem for
   * reading.
   * @sem: semaphore to acquire
   *
   * Bumps this CPU's read_count first and only afterwards looks at
   * sem->block; the ordering argument is spelled out in the body.
   *
   * Return: true if the read lock is now held. false if sem->block was set;
   * the increment has then been undone and the writer woken so that it can
   * re-run readers_active_check().
   *
   * NOTE(review): relies on preemption being disabled by the caller, as
   * __percpu_rwsem_trylock() does; callers outside this file should be
   * checked.
   */
  47 static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
  48 {
             /* Optimistically count ourselves as a reader on this CPU. */
  49         this_cpu_inc(*sem->read_count);
  50
  51         /*
  52          * Due to having preemption disabled the decrement happens on
  53          * the same CPU as the increment, avoiding the
  54          * increment-on-one-CPU-and-decrement-on-another problem.
  55          *
  56          * If the reader misses the writer's assignment of sem->block, then the
  57          * writer is guaranteed to see the reader's increment.
  58          *
  59          * Conversely, any readers that increment their sem->read_count after
  60          * the writer looks are guaranteed to see the sem->block value, which
  61          * in turn means that they are guaranteed to immediately decrement
  62          * their sem->read_count, so that it doesn't matter that the writer
  63          * missed them.
  64          */
  65
  66         smp_mb(); /* A matches D */
  67
  68         /*
  69          * If !sem->block the critical section starts here, matched by the
  70          * release in percpu_up_write().
  71          */
  72         if (likely(!atomic_read_acquire(&sem->block)))
  73                 return true;
  74
             /* A writer has set sem->block: take the optimistic count back. */
  75         this_cpu_dec(*sem->read_count);
  76
  77         /* Prod writer to re-evaluate readers_active_check() */
  78         rcuwait_wake_up(&sem->writer);
  79
  80         return false;
  81 }
  82
  /*
   * __percpu_down_write_trylock - try to claim sem->block for a writer.
   * @sem: semaphore to acquire
   *
   * The plain atomic_read() bails out early when the flag is already set, so
   * the atomic_xchg() is only issued when the lock looked free.
   *
   * Return: true only if this caller's exchange changed sem->block from 0
   * to 1; false if the flag was already set, either at the read or at the
   * exchange.
   */
  83 static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
  84 {
  85         if (atomic_read(&sem->block))
  86                 return false;
  87
             /* Old value 0 means we are the one who set the flag. */
  88         return atomic_xchg(&sem->block, 1) == 0;
  89 }
  90
  /*
   * __percpu_rwsem_trylock - trylock dispatcher shared by percpu_rwsem_wait()
   * and percpu_rwsem_wake_function().
   * @sem:    semaphore to acquire
   * @reader: true to try for the read lock, false to try for the write lock
   *
   * Return: true if the requested lock was acquired.
   */
  91 static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
  92 {
  93         if (reader) {
  94                 bool ret;
  95
                     /*
                      * Keep the per-CPU increment and the possible decrement
                      * in __percpu_down_read_trylock() on one CPU.
                      */
  96                 preempt_disable();
  97                 ret = __percpu_down_read_trylock(sem);
  98                 preempt_enable();
  99
 100                 return ret;
 101         }
 102         return __percpu_down_write_trylock(sem);
 103 }
 104
 105 /*
 106  * The return value of wait_queue_entry::func means:
 107  *
 108  *  <0 - error, wakeup is terminated and the error is returned
 109  *   0 - no wakeup, a next waiter is tried
 110  *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
 111  *
 112  * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
 113  * and play games with the return value to allow waking multiple readers.
 114  *
 115  * Specifically, we wake readers until we've woken a single writer, or until a
 116  * trylock fails.
      *
      * @wq_entry: queued waiter; WQ_FLAG_CUSTOM in ->flags marks a reader and
      *            ->private holds the waiting task until the lock is handed over
      * @mode, @wake_flags: not used by this callback
      * @key: the semaphore, passed as the last __wake_up() argument in
      *       percpu_up_write()
      *
      * The lock is acquired here, on the waiter's behalf, before it is woken.
      * Returns 1 when the trylock fails or after waking a writer, and 0 after
      * waking a reader.
      *
      * NOTE(review): presumably invoked with sem->waiters.lock held by the
      * wakeup path, which is what percpu_rwsem_wait() serialises against --
      * confirm in the wait-queue code.
 117  */
 118 static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
 119                                       unsigned int mode, int wake_flags,
 120                                       void *key)
 121 {
 122         bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
 123         struct percpu_rw_semaphore *sem = key;
 124         struct task_struct *p;
 125
 126         /* concurrent against percpu_down_write(), can get stolen */
 127         if (!__percpu_rwsem_trylock(sem, reader))
 128                 return 1;
 129
             /*
              * Take a task reference before publishing: wq_entry lives on the
              * waiter's stack, and once ->private reads as NULL the waiter
              * leaves percpu_rwsem_wait(), so the entry must not be touched
              * after the store below.
              */
 130         p = get_task_struct(wq_entry->private);
 131         list_del_init(&wq_entry->entry);
             /* Pairs with the smp_load_acquire() in percpu_rwsem_wait(). */
 132         smp_store_release(&wq_entry->private, NULL);
 133
 134         wake_up_process(p);
 135         put_task_struct(p);
 136
 137         return !reader; /* wake (readers until) 1 writer */
 138 }
 139
 /*
  * percpu_rwsem_wait - acquire @sem, sleeping until it is handed over if the
  * attempt made under the waitqueue lock fails.
  * @sem:    semaphore to acquire
  * @reader: true to acquire for reading, false for writing
  *
  * On return the lock is held: either the trylock here succeeded, or
  * percpu_rwsem_wake_function() acquired it for this task before clearing
  * wq_entry.private.
  */
 140 static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
 141 {
 142         DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
 143         bool wait;
 144
 145         spin_lock_irq(&sem->waiters.lock);
 146         /*
 147          * Serialize against the wakeup in percpu_up_write(), if we fail
 148          * the trylock, the wakeup must see us on the list.
 149          */
 150         wait = !__percpu_rwsem_trylock(sem, reader);
 151         if (wait) {
                     /*
                      * Always EXCLUSIVE; CUSTOM is added only for readers
                      * (the bool multiplies as 0 or 1) and is how the wake
                      * function tells the two kinds of waiter apart.
                      */
 152                 wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
 153                 __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
 154         }
 155         spin_unlock_irq(&sem->waiters.lock);
 156
             /*
              * Sleep until the wake function clears ->private. The task state
              * is set before ->private is tested on every iteration.
              */
 157         while (wait) {
 158                 set_current_state(TASK_UNINTERRUPTIBLE);
                     /* Pairs with smp_store_release() in the wake function. */
 159                 if (!smp_load_acquire(&wq_entry.private))
 160                         break;
 161                 schedule();
 162         }
 163         __set_current_state(TASK_RUNNING);
 164 }
 165
 /*
  * __percpu_down_read - take @sem for reading, optionally without sleeping.
  * @sem: semaphore to acquire
  * @try: when true, give up instead of sleeping
  *
  * Return: true once the read lock is held; false only when @try is set and
  * the single trylock attempt failed.
  *
  * NOTE(review): appears to expect preemption disabled on entry, since the
  * sleeping path enables it before waiting and disables it again before
  * returning -- confirm against the caller, which is not in this file.
  */
 166 bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 167 {
 168         if (__percpu_down_read_trylock(sem))
 169                 return true;
 170
 171         if (try)
 172                 return false;
 173
             /* percpu_rwsem_wait() may schedule(), so allow preemption. */
 174         preempt_enable();
 175         percpu_rwsem_wait(sem, /* .reader = */ true);
 176         preempt_disable();
 177
 178         return true;
 179 }
 180 EXPORT_SYMBOL_GPL(__percpu_down_read);
 181
 /*
  * per_cpu_sum - add up one per-CPU variable over every possible CPU.
  * @var: per-CPU lvalue expression, read through per_cpu(var, cpu)
  *
  * Evaluates to the total, in the type of @var. The macro declares locals
  * named "cpu" and "__sum", so @var must not refer to identifiers with those
  * names. The macro itself takes no lock around the reads.
  *
  * NOTE(review): compiletime_assert_atomic_type() presumably restricts @var
  * to types that can be accessed in a single load -- confirm in the compiler
  * headers.
  */
 182 #define per_cpu_sum(var)                                                \
 183 ({                                                                      \
 184         typeof(var) __sum = 0;                                          \
 185         int cpu;                                                        \
 186         compiletime_assert_atomic_type(__sum);                          \
 187         for_each_possible_cpu(cpu)                                      \
 188                 __sum += per_cpu(var, cpu);                             \
 189         __sum;                                                          \
 190 })
 191
 192 /*
 193  * Return true if the modular sum of the sem->read_count per-CPU variable is
 194  * zero.  If this sum is zero, then it is stable due to the fact that if any
 195  * newly arriving readers increment a given counter, they will immediately
 196  * decrement that same counter.
 197  *
 198  * Assumes sem->block is set.
      *
      * Only the sum is meaningful: percpu_rwsem_wake_function() takes the read
      * lock on behalf of a sleeping reader, so an increment can land on a CPU
      * other than the one where that reader presumably drops it later.
      *
      * Used as the wait condition of rcuwait_wait_event() in
      * percpu_down_write().
 199  */
 200 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 201 {
 202         if (per_cpu_sum(*sem->read_count) != 0)
 203                 return false;
 204
 205         /*
 206          * If we observed the decrement; ensure we see the entire critical
 207          * section.
 208          */
 209
             /*
              * NOTE(review): barrier B is not in this file; presumably it sits
              * on the reader unlock side -- confirm in linux/percpu-rwsem.h.
              */
 210         smp_mb(); /* C matches B */
 211
 212         return true;
 213 }
 214
 /*
  * percpu_down_write - acquire @sem for writing; may sleep.
  * @sem: semaphore to acquire
  *
  * Three steps, in order: steer readers onto the slow path with
  * rcu_sync_enter(), claim sem->block (queueing in percpu_rwsem_wait() when
  * another writer already holds it), then wait until
  * readers_active_check() reports that no reader is left.
  */
 215 void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
 216 {
 217         might_sleep();
             /* Annotation on the dep_map set up by lockdep_init_map(). */
 218         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 219
 220         /* Notify readers to take the slow path. */
 221         rcu_sync_enter(&sem->rss);
 222
 223         /*
 224          * Try set sem->block; this provides writer-writer exclusion.
 225          * Having sem->block set makes new readers block.
 226          */
 227         if (!__percpu_down_write_trylock(sem))
 228                 percpu_rwsem_wait(sem, /* .reader = */ false);
 229
 230         /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
 231
 232         /*
 233          * If they don't see our store of sem->block, then we are guaranteed to
 234          * see their sem->read_count increment, and therefore will wait for
 235          * them.
 236          */
 237
 238         /* Wait for all active readers to complete. */
 239         rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
 240 }
 241 EXPORT_SYMBOL_GPL(percpu_down_write);
 242
 /*
  * percpu_up_write - release the write lock taken by percpu_down_write().
  * @sem: semaphore to release
  *
  * Clears sem->block with release semantics, wakes queued waiters through
  * percpu_rwsem_wake_function(), and finally drops the rcu_sync reference
  * taken in percpu_down_write().
  */
 243 void percpu_up_write(struct percpu_rw_semaphore *sem)
 244 {
 245         rwsem_release(&sem->dep_map, _RET_IP_);
 246
 247         /*
 248          * Signal the writer is done, no fast path yet.
 249          *
 250          * One reason that we cannot just immediately flip to readers_fast is
 251          * that new readers might fail to see the results of this writer's
 252          * critical section.
 253          *
 254          * Therefore we force it through the slow path which guarantees an
 255          * acquire and thereby guarantees the critical section's consistency.
 256          */
 257         atomic_set_release(&sem->block, 0);
 258
 259         /*
 260          * Prod any pending reader/writer to make progress.
              *
              * nr_exclusive is 1 and @sem is handed over as the wake key; how
              * many waiters actually get the lock is decided by the return
              * value of percpu_rwsem_wake_function().
 261          */
 262         __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
 263
 264         /*
 265          * Once this completes (at least one RCU-sched grace period hence) the
 266          * reader fast path will be available again. Safe to use outside the
 267          * exclusive write lock because it is counting.
 268          */
 269         rcu_sync_exit(&sem->rss);
 270 }
 271 EXPORT_SYMBOL_GPL(percpu_up_write);