OSDN Git Service

Merge branch 'locking/core' into x86/core, to prepare for dependent patch
authorIngo Molnar <mingo@kernel.org>
Wed, 3 Jun 2015 08:07:35 +0000 (10:07 +0200)
committerIngo Molnar <mingo@kernel.org>
Wed, 3 Jun 2015 08:07:35 +0000 (10:07 +0200)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
60 files changed:
Documentation/memory-barriers.txt
arch/alpha/include/asm/cmpxchg.h
arch/arm/include/asm/barrier.h
arch/arm64/include/asm/barrier.h
arch/avr32/include/asm/cmpxchg.h
arch/hexagon/include/asm/cmpxchg.h
arch/ia64/include/asm/barrier.h
arch/ia64/include/uapi/asm/cmpxchg.h
arch/m32r/include/asm/cmpxchg.h
arch/m68k/include/asm/cmpxchg.h
arch/metag/include/asm/barrier.h
arch/metag/include/asm/cmpxchg.h
arch/mips/include/asm/barrier.h
arch/mips/include/asm/cmpxchg.h
arch/parisc/include/asm/cmpxchg.h
arch/powerpc/include/asm/barrier.h
arch/powerpc/include/asm/cmpxchg.h
arch/s390/include/asm/barrier.h
arch/s390/include/asm/cmpxchg.h
arch/score/include/asm/cmpxchg.h
arch/sh/include/asm/barrier.h
arch/sh/include/asm/cmpxchg.h
arch/sparc/include/asm/barrier_64.h
arch/sparc/include/asm/cmpxchg_32.h
arch/sparc/include/asm/cmpxchg_64.h
arch/tile/include/asm/atomic_64.h
arch/x86/Kconfig
arch/x86/include/asm/barrier.h
arch/x86/include/asm/cmpxchg.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/qspinlock.h [new file with mode: 0644]
arch/x86/include/asm/qspinlock_paravirt.h [new file with mode: 0644]
arch/x86/include/asm/spinlock.h
arch/x86/include/asm/spinlock_types.h
arch/x86/kernel/kvm.c
arch/x86/kernel/paravirt-spinlocks.c
arch/x86/kernel/paravirt_patch_32.c
arch/x86/kernel/paravirt_patch_64.c
arch/x86/um/asm/barrier.h
arch/x86/xen/spinlock.c
fs/select.c
include/asm-generic/barrier.h
include/asm-generic/cmpxchg.h
include/asm-generic/qspinlock.h [new file with mode: 0644]
include/asm-generic/qspinlock_types.h [new file with mode: 0644]
include/linux/compiler.h
include/linux/osq_lock.h
include/linux/sched.h
kernel/Kconfig.locks
kernel/futex.c
kernel/locking/Makefile
kernel/locking/lockdep.c
kernel/locking/mcs_spinlock.h
kernel/locking/qrwlock.c
kernel/locking/qspinlock.c [new file with mode: 0644]
kernel/locking/qspinlock_paravirt.h [new file with mode: 0644]
kernel/locking/rtmutex.c
kernel/locking/rwsem-xadd.c
kernel/sched/wait.c

index f957461..fe4020e 100644 (file)
@@ -1662,7 +1662,7 @@ CPU from reordering them.
 
 There are some more advanced barrier functions:
 
- (*) set_mb(var, value)
+ (*) smp_store_mb(var, value)
 
      This assigns the value to the variable and then inserts a full memory
      barrier after it, depending on the function.  It isn't guaranteed to
@@ -1975,7 +1975,7 @@ after it has altered the task state:
        CPU 1
        ===============================
        set_current_state();
-         set_mb();
+         smp_store_mb();
            STORE current->state
            <general barrier>
        LOAD event_indicated
@@ -2016,7 +2016,7 @@ between the STORE to indicate the event and the STORE to set TASK_RUNNING:
        CPU 1                           CPU 2
        =============================== ===============================
        set_current_state();            STORE event_indicated
-         set_mb();                     wake_up();
+         smp_store_mb();               wake_up();
            STORE current->state          <write barrier>
            <general barrier>             STORE current->state
        LOAD event_indicated
index 429e8cd..e511776 100644 (file)
@@ -66,6 +66,4 @@
 #undef __ASM__MB
 #undef ____cmpxchg
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #endif /* _ALPHA_CMPXCHG_H */
index d2f81e6..6c2327e 100644 (file)
@@ -81,7 +81,7 @@ do {                                                                  \
 #define read_barrier_depends()         do { } while(0)
 #define smp_read_barrier_depends()     do { } while(0)
 
-#define set_mb(var, value)     do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)       do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_mb__before_atomic()        smp_mb()
 #define smp_mb__after_atomic() smp_mb()
index 71f19c4..0fa47c4 100644 (file)
@@ -114,7 +114,7 @@ do {                                                                        \
 #define read_barrier_depends()         do { } while(0)
 #define smp_read_barrier_depends()     do { } while(0)
 
-#define set_mb(var, value)     do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)       do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define nop()          asm volatile("nop");
 
 #define smp_mb__before_atomic()        smp_mb()
index 962a6ae..366bbea 100644 (file)
@@ -70,8 +70,6 @@ extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels(
    if something tries to do an invalid cmpxchg().  */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
                                      unsigned long new, int size)
 {
index 9e78029..a6e34e2 100644 (file)
@@ -64,7 +64,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
  *  looks just like atomic_cmpxchg on our arch currently with a bunch of
  *  variable casting.
  */
-#define __HAVE_ARCH_CMPXCHG 1
 
 #define cmpxchg(ptr, old, new)                                 \
 ({                                                             \
index f6769eb..843ba43 100644 (file)
@@ -77,12 +77,7 @@ do {                                                                 \
        ___p1;                                                          \
 })
 
-/*
- * XXX check on this ---I suspect what Linus really wants here is
- * acquire vs release semantics but we can't discuss this stuff with
- * Linus just yet.  Grrr...
- */
-#define set_mb(var, value)     do { (var) = (value); mb(); } while (0)
+#define smp_store_mb(var, value)       do { WRITE_ONCE(var, value); mb(); } while (0)
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
index f35109b..a0e3620 100644 (file)
@@ -61,8 +61,6 @@ extern void ia64_xchg_called_with_bad_pointer(void);
  * indicated by comparing RETURN with OLD.
  */
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 /*
  * This function doesn't exist, so you'll get a linker error
  * if something tries to do an invalid cmpxchg().
index de651db..14bf9b7 100644 (file)
@@ -107,8 +107,6 @@ __xchg_local(unsigned long x, volatile void *ptr, int size)
        ((__typeof__(*(ptr)))__xchg_local((unsigned long)(x), (ptr),    \
                        sizeof(*(ptr))))
 
-#define __HAVE_ARCH_CMPXCHG    1
-
 static inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned int old, unsigned int new)
 {
index bc755bc..83b1df8 100644 (file)
@@ -90,7 +90,6 @@ extern unsigned long __invalid_cmpxchg_size(volatile void *,
  * indicated by comparing RETURN with OLD.
  */
 #ifdef CONFIG_RMW_INSNS
-#define __HAVE_ARCH_CMPXCHG    1
 
 static inline unsigned long __cmpxchg(volatile void *p, unsigned long old,
                                      unsigned long new, int size)
index d703d8e..5a696e5 100644 (file)
@@ -84,7 +84,7 @@ static inline void fence(void)
 #define read_barrier_depends()         do { } while (0)
 #define smp_read_barrier_depends()     do { } while (0)
 
-#define set_mb(var, value) do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_store_release(p, v)                                                \
 do {                                                                   \
index b1bc1be..be29e3e 100644 (file)
@@ -51,8 +51,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
        return old;
 }
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #define cmpxchg(ptr, o, n)                                             \
        ({                                                              \
                __typeof__(*(ptr)) _o_ = (o);                           \
index 2b8bbbc..7ecba84 100644 (file)
 #define __WEAK_LLSC_MB         "               \n"
 #endif
 
-#define set_mb(var, value) \
-       do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) \
+       do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_llsc_mb()  __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
 
index 412f945..b71ab4a 100644 (file)
@@ -138,8 +138,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
                __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))));     \
 })
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #define __cmpxchg_asm(ld, st, m, old, new)                             \
 ({                                                                     \
        __typeof(*(m)) __ret;                                           \
index dbd1335..0a90b96 100644 (file)
@@ -46,8 +46,6 @@ __xchg(unsigned long x, __volatile__ void *ptr, int size)
 #define xchg(ptr, x) \
        ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
 
-#define __HAVE_ARCH_CMPXCHG    1
-
 /* bug catcher for when unsupported size is used - won't link */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
index a3bf5be..39505d6 100644 (file)
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
-#define set_mb(var, value)     do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)       do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
index d463c68..ad6263c 100644 (file)
@@ -144,7 +144,6 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
  * Compare and exchange - if *p == old, set it to new,
  * and return the old value of *p.
  */
-#define __HAVE_ARCH_CMPXCHG    1
 
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
index 8d72471..e6f8615 100644 (file)
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()                smp_mb()
 #define smp_mb__after_atomic()         smp_mb()
 
-#define set_mb(var, value)             do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)               do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #define smp_store_release(p, v)                                                \
 do {                                                                   \
index 4eadec4..411464f 100644 (file)
@@ -32,8 +32,6 @@
        __old;                                                          \
 })
 
-#define __HAVE_ARCH_CMPXCHG
-
 #define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn)              \
 ({                                                                     \
        register __typeof__(*(p1)) __old1 asm("2") = (o1);              \
index f384839..cc3f642 100644 (file)
@@ -42,8 +42,6 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m,
                                        (unsigned long)(o),     \
                                        (unsigned long)(n)))
 
-#define __HAVE_ARCH_CMPXCHG    1
-
 #include <asm-generic/cmpxchg-local.h>
 
 #endif /* _ASM_SCORE_CMPXCHG_H */
index 4371530..bf91037 100644 (file)
@@ -32,7 +32,7 @@
 #define ctrl_barrier() __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop")
 #endif
 
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 
 #include <asm-generic/barrier.h>
 
index f6bd140..85c97b1 100644 (file)
@@ -46,8 +46,6 @@ extern void __xchg_called_with_bad_pointer(void);
  * if something tries to do an invalid cmpxchg(). */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
                unsigned long new, int size)
 {
index 7664894..809941e 100644 (file)
@@ -40,8 +40,8 @@ do {  __asm__ __volatile__("ba,pt     %%xcc, 1f\n\t" \
 #define dma_rmb()      rmb()
 #define dma_wmb()      wmb()
 
-#define set_mb(__var, __value) \
-       do { __var = __value; membar_safe("#StoreLoad"); } while(0)
+#define smp_store_mb(__var, __value) \
+       do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0)
 
 #ifdef CONFIG_SMP
 #define smp_mb()       mb()
index d38b52d..83ffb83 100644 (file)
@@ -34,7 +34,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int
  *
  * Cribbed from <asm-parisc/atomic.h>
  */
-#define __HAVE_ARCH_CMPXCHG    1
 
 /* bug catcher for when unsupported size is used - won't link */
 void __cmpxchg_called_with_bad_pointer(void);
index 0e1ed6c..faa2f61 100644 (file)
@@ -65,8 +65,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 
 #include <asm-generic/cmpxchg-local.h>
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long
 __cmpxchg_u32(volatile int *m, int old, int new)
 {
index 7b11c5f..0496970 100644 (file)
@@ -105,9 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v)       atomic64_add_unless((v), 1, 0)
 
-/* Define this to indicate that cmpxchg is an efficient operation. */
-#define __HAVE_ARCH_CMPXCHG
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_TILE_ATOMIC_64_H */
index e68408f..a708bcc 100644 (file)
@@ -127,7 +127,8 @@ config X86
        select MODULES_USE_ELF_RELA if X86_64
        select CLONE_BACKWARDS if X86_32
        select ARCH_USE_BUILTIN_BSWAP
-       select ARCH_USE_QUEUE_RWLOCK
+       select ARCH_USE_QUEUED_SPINLOCKS
+       select ARCH_USE_QUEUED_RWLOCKS
        select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
        select OLD_SIGACTION if X86_32
        select COMPAT_OLD_SIGACTION if IA32_EMULATION
@@ -666,7 +667,7 @@ config PARAVIRT_DEBUG
 config PARAVIRT_SPINLOCKS
        bool "Paravirtualization layer for spinlocks"
        depends on PARAVIRT && SMP
-       select UNINLINE_SPIN_UNLOCK
+       select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
        ---help---
          Paravirtualized spinlocks allow a pvops backend to replace the
          spinlock implementation with something virtualization-friendly
index 959e45b..e51a8f8 100644 (file)
 #define smp_mb()       mb()
 #define smp_rmb()      dma_rmb()
 #define smp_wmb()      barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #else /* !SMP */
 #define smp_mb()       barrier()
 #define smp_rmb()      barrier()
 #define smp_wmb()      barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #endif /* SMP */
 
 #define read_barrier_depends()         do { } while (0)
index 99c105d..ad19841 100644 (file)
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h> /* Provides LOCK_PREFIX */
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 /*
  * Non-existant functions to indicate usage errors at link time
  * (or compile-time if the compiler implements __compiletime_error().
index 8957810..d143bfa 100644 (file)
@@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+                                                       u32 val)
+{
+       PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+       PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+       PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+       PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
                                                        __ticket_t ticket)
 {
@@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
        PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
 
-#endif
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+#endif /* SMP && PARAVIRT_SPINLOCKS */
 
 #ifdef CONFIG_X86_32
 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
index 344c646..a6b8f9f 100644 (file)
@@ -334,9 +334,19 @@ struct arch_spinlock;
 typedef u16 __ticket_t;
 #endif
 
+struct qspinlock;
+
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUED_SPINLOCKS
+       void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+       struct paravirt_callee_save queued_spin_unlock;
+
+       void (*wait)(u8 *ptr, u8 val);
+       void (*kick)(int cpu);
+#else /* !CONFIG_QUEUED_SPINLOCKS */
        struct paravirt_callee_save lock_spinning;
        void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644 (file)
index 0000000..9d51fae
--- /dev/null
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+
+#define        queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+       smp_store_release((u8 *)lock, 0);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+       pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+       pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+       native_queued_spin_unlock(lock);
+}
+#endif
+
+#define virt_queued_spin_lock virt_queued_spin_lock
+
+static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+       if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+               return false;
+
+       while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
+               cpu_relax();
+
+       return true;
+}
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644 (file)
index 0000000..b002e71
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+
+#endif
index 64b6117..be0a059 100644 (file)
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
                cpu_relax();
        }
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 /*
  * Read-write spinlocks, allowing multiple readers
index 5f9d757..65c3e37 100644 (file)
@@ -23,6 +23,9 @@ typedef u32 __ticketpair_t;
 
 #define TICKET_SHIFT   (sizeof(__ticket_t) * 8)
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 typedef struct arch_spinlock {
        union {
                __ticketpair_t head_tail;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED      { { 0 } }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 #include <asm-generic/qrwlock_types.h>
 
index 9435620..1681504 100644 (file)
@@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu)
        kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 }
 
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+       unsigned long flags;
+
+       if (in_nmi())
+               return;
+
+       local_irq_save(flags);
+
+       if (READ_ONCE(*ptr) != val)
+               goto out;
+
+       /*
+        * halt until it's our turn and kicked. Note that we do safe halt
+        * for irq enabled case to avoid hang when lock info is overwritten
+        * in irq spinlock slowpath and no spurious interrupt occur to save us.
+        */
+       if (arch_irqs_disabled_flags(flags))
+               halt();
+       else
+               safe_halt();
+
+out:
+       local_irq_restore(flags);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
 enum kvm_contention_stat {
        TAKEN_SLOW,
        TAKEN_SLOW_PICKUP,
@@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
        }
 }
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
@@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void)
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
                return;
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+       __pv_init_lock_hash();
+       pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+       pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+       pv_lock_ops.wait = kvm_wait;
+       pv_lock_ops.kick = kvm_kick_cpu;
+#else /* !CONFIG_QUEUED_SPINLOCKS */
        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
        pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
 }
 
 static __init int kvm_spinlock_init_jump(void)
index bbb6c73..33ee3e0 100644 (file)
@@ -8,11 +8,33 @@
 
 #include <asm/paravirt.h>
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+       native_queued_spin_unlock(lock);
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+bool pv_is_native_spin_unlock(void)
+{
+       return pv_lock_ops.queued_spin_unlock.func ==
+               __raw_callee_save___native_queued_spin_unlock;
+}
+#endif
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUED_SPINLOCKS
+       .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+       .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+       .wait = paravirt_nop,
+       .kick = paravirt_nop,
+#else /* !CONFIG_QUEUED_SPINLOCKS */
        .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
        .unlock_kick = paravirt_nop,
-#endif
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
 
index d9f32e6..e1b0136 100644 (file)
@@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
        /* arg in %eax, return in %eax */
@@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
        return 0;
 }
 
+extern bool pv_is_native_spin_unlock(void);
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                      unsigned long addr, unsigned len)
 {
@@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_mmu_ops, write_cr3);
                PATCH_SITE(pv_cpu_ops, clts);
                PATCH_SITE(pv_cpu_ops, read_tsc);
-
-       patch_site:
-               ret = paravirt_patch_insns(ibuf, len, start, end);
-               break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+               case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+                       if (pv_is_native_spin_unlock()) {
+                               start = start_pv_lock_ops_queued_spin_unlock;
+                               end   = end_pv_lock_ops_queued_spin_unlock;
+                               goto patch_site;
+                       }
+#endif
 
        default:
                ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
                break;
+
+patch_site:
+               ret = paravirt_patch_insns(ibuf, len, start, end);
+               break;
        }
 #undef PATCH_SITE
        return ret;
index 0de21c6..8aa0558 100644 (file)
@@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
        return paravirt_patch_insns(insnbuf, len,
@@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
                                    start__mov64, end__mov64);
 }
 
+extern bool pv_is_native_spin_unlock(void);
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                      unsigned long addr, unsigned len)
 {
@@ -58,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_cpu_ops, clts);
                PATCH_SITE(pv_mmu_ops, flush_tlb_single);
                PATCH_SITE(pv_cpu_ops, wbinvd);
-
-       patch_site:
-               ret = paravirt_patch_insns(ibuf, len, start, end);
-               break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+               case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+                       if (pv_is_native_spin_unlock()) {
+                               start = start_pv_lock_ops_queued_spin_unlock;
+                               end   = end_pv_lock_ops_queued_spin_unlock;
+                               goto patch_site;
+                       }
+#endif
 
        default:
                ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
                break;
+
+patch_site:
+               ret = paravirt_patch_insns(ibuf, len, start, end);
+               break;
        }
 #undef PATCH_SITE
        return ret;
index 7e8a1a6..b9531d3 100644 (file)
@@ -39,7 +39,8 @@
 #define smp_mb()       barrier()
 #define smp_rmb()      barrier()
 #define smp_wmb()      barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 
 #define read_barrier_depends()         do { } while (0)
 #define smp_read_barrier_depends()     do { } while (0)
index 956374c..9e2ba5c 100644 (file)
 #include "xen-ops.h"
 #include "debugfs.h"
 
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static bool xen_pvspin = true;
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void xen_qlock_kick(int cpu)
+{
+       xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void xen_qlock_wait(u8 *byte, u8 val)
+{
+       int irq = __this_cpu_read(lock_kicker_irq);
+
+       /* If kicker interrupts not initialized yet, just spin */
+       if (irq == -1)
+               return;
+
+       /* clear pending */
+       xen_clear_irq_pending(irq);
+       barrier();
+
+       /*
+        * We check the byte value after clearing pending IRQ to make sure
+        * that we won't miss a wakeup event because of the clearing.
+        *
+        * The sync_clear_bit() call in xen_clear_irq_pending() is atomic.
+        * So it is effectively a memory barrier for x86.
+        */
+       if (READ_ONCE(*byte) != val)
+               return;
+
+       /*
+        * If an interrupt happens here, it will leave the wakeup irq
+        * pending, which will cause xen_poll_irq() to return
+        * immediately.
+        */
+
+       /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+       xen_poll_irq(irq);
+}
+
+#else /* CONFIG_QUEUED_SPINLOCKS */
+
 enum xen_contention_stat {
        TAKEN_SLOW,
        TAKEN_SLOW_PICKUP,
@@ -100,12 +150,9 @@ struct xen_lock_waiting {
        __ticket_t want;
 };
 
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 static cpumask_t waiting_cpus;
 
-static bool xen_pvspin = true;
 __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
        int irq = __this_cpu_read(lock_kicker_irq);
@@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
                }
        }
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
@@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void)
                return;
        }
        printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+#ifdef CONFIG_QUEUED_SPINLOCKS
+       __pv_init_lock_hash();
+       pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+       pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+       pv_lock_ops.wait = xen_qlock_wait;
+       pv_lock_ops.kick = xen_qlock_kick;
+#else
        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
        pv_lock_ops.unlock_kick = xen_unlock_kick;
+#endif
 }
 
 /*
@@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg)
 }
 early_param("xen_nopvspin", xen_parse_nopvspin);
 
-#ifdef CONFIG_XEN_DEBUG_FS
+#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
 
 static struct dentry *d_spin_debug;
 
index f684c75..0155473 100644 (file)
@@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
         * doesn't imply write barrier and the users expect write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-        * and is paired with set_mb() in poll_schedule_timeout.
+        * and is paired with smp_store_mb() in poll_schedule_timeout.
         */
        smp_wmb();
        pwq->triggered = 1;
@@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
        /*
         * Prepare for the next iteration.
         *
-        * The following set_mb() serves two purposes.  First, it's
+        * The following smp_store_mb() serves two purposes.  First, it's
         * the counterpart rmb of the wmb in pollwake() such that data
         * written before wake up is always visible after wake up.
         * Second, the full barrier guarantees that triggered clearing
@@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
         * this problem doesn't exist for the first iteration as
         * add_wait_queue() has full barrier semantics.
         */
-       set_mb(pwq->triggered, 0);
+       smp_store_mb(pwq->triggered, 0);
 
        return rc;
 }
index f5c40b0..e6a83d7 100644 (file)
@@ -66,8 +66,8 @@
 #define smp_read_barrier_depends()     do { } while (0)
 #endif
 
-#ifndef set_mb
-#define set_mb(var, value)  do { (var) = (value); mb(); } while (0)
+#ifndef smp_store_mb
+#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
 #endif
 
 #ifndef smp_mb__before_atomic
index 811fb1e..3766ab3 100644 (file)
@@ -86,9 +86,6 @@ unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
 
 /*
  * Atomic compare and exchange.
- *
- * Do not define __HAVE_ARCH_CMPXCHG because we want to use it to check whether
- * a cmpxchg primitive faster than repeated local irq save/restore exists.
  */
 #include <asm-generic/cmpxchg-local.h>
 
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
new file mode 100644 (file)
index 0000000..83bfb87
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_H
+#define __ASM_GENERIC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+/**
+ * queued_spin_is_locked - is the spinlock locked?
+ * @lock: Pointer to queued spinlock structure
+ * Return: 1 if it is locked, 0 otherwise
+ */
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+       return atomic_read(&lock->val);
+}
+
+/**
+ * queued_spin_value_unlocked - is the spinlock structure unlocked?
+ * @lock: queued spinlock structure
+ * Return: 1 if it is unlocked, 0 otherwise
+ *
+ * N.B. Whenever there are tasks waiting for the lock, it is considered
+ *      locked wrt the lockref code to avoid lock stealing by the lockref
+ *      code and change things underneath the lock. This also allows some
+ *      optimizations to be applied without conflict with lockref.
+ */
+static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
+{
+       return !atomic_read(&lock.val);
+}
+
+/**
+ * queued_spin_is_contended - check if the lock is contended
+ * @lock : Pointer to queued spinlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
+{
+       return atomic_read(&lock->val) & ~_Q_LOCKED_MASK;
+}
+/**
+ * queued_spin_trylock - try to acquire the queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+{
+       if (!atomic_read(&lock->val) &&
+          (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+               return 1;
+       return 0;
+}
+
+extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+/**
+ * queued_spin_lock - acquire a queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ */
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
+{
+       u32 val;
+
+       val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+       if (likely(val == 0))
+               return;
+       queued_spin_lock_slowpath(lock, val);
+}
+
+#ifndef queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ */
+static __always_inline void queued_spin_unlock(struct qspinlock *lock)
+{
+       /*
+        * smp_mb__before_atomic() in order to guarantee release semantics
+        */
+       smp_mb__before_atomic_dec();
+       atomic_sub(_Q_LOCKED_VAL, &lock->val);
+}
+#endif
+
+/**
+ * queued_spin_unlock_wait - wait until current lock holder releases the lock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * There is a very slight possibility of live-lock if the lockers keep coming
+ * and the waiter is just unfortunate enough to not see any unlock state.
+ */
+static inline void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+       while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+               cpu_relax();
+}
+
+#ifndef virt_queued_spin_lock
+static __always_inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+       return false;
+}
+#endif
+
+/*
+ * Initializier
+ */
+#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
+
+/*
+ * Remapping spinlock architecture specific functions to the corresponding
+ * queued spinlock functions.
+ */
+#define arch_spin_is_locked(l)         queued_spin_is_locked(l)
+#define arch_spin_is_contended(l)      queued_spin_is_contended(l)
+#define arch_spin_value_unlocked(l)    queued_spin_value_unlocked(l)
+#define arch_spin_lock(l)              queued_spin_lock(l)
+#define arch_spin_trylock(l)           queued_spin_trylock(l)
+#define arch_spin_unlock(l)            queued_spin_unlock(l)
+#define arch_spin_lock_flags(l, f)     queued_spin_lock(l)
+#define arch_spin_unlock_wait(l)       queued_spin_unlock_wait(l)
+
+#endif /* __ASM_GENERIC_QSPINLOCK_H */
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
new file mode 100644 (file)
index 0000000..85f888e
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_TYPES_H
+#define __ASM_GENERIC_QSPINLOCK_TYPES_H
+
+/*
+ * Including atomic.h with PARAVIRT on will cause compilation errors because
+ * of recursive header file incluson via paravirt_types.h. So don't include
+ * it if PARAVIRT is on.
+ */
+#ifndef CONFIG_PARAVIRT
+#include <linux/types.h>
+#include <linux/atomic.h>
+#endif
+
+typedef struct qspinlock {
+       atomic_t        val;
+} arch_spinlock_t;
+
+/*
+ * Bitfields in the atomic value:
+ *
+ * When NR_CPUS < 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS >= 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-10: tail index
+ * 11-31: tail cpu (+1)
+ */
+#define        _Q_SET_MASK(type)       (((1U << _Q_ ## type ## _BITS) - 1)\
+                                     << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET       0
+#define _Q_LOCKED_BITS         8
+#define _Q_LOCKED_MASK         _Q_SET_MASK(LOCKED)
+
+#define _Q_PENDING_OFFSET      (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS                8
+#else
+#define _Q_PENDING_BITS                1
+#endif
+#define _Q_PENDING_MASK                _Q_SET_MASK(PENDING)
+
+#define _Q_TAIL_IDX_OFFSET     (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
+#define _Q_TAIL_IDX_BITS       2
+#define _Q_TAIL_IDX_MASK       _Q_SET_MASK(TAIL_IDX)
+
+#define _Q_TAIL_CPU_OFFSET     (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
+#define _Q_TAIL_CPU_BITS       (32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK       _Q_SET_MASK(TAIL_CPU)
+
+#define _Q_TAIL_OFFSET         _Q_TAIL_IDX_OFFSET
+#define _Q_TAIL_MASK           (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
+#define _Q_LOCKED_VAL          (1U << _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL         (1U << _Q_PENDING_OFFSET)
+
+#endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */
index 8677225..03e227b 100644 (file)
@@ -250,7 +250,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
        ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 
 #define WRITE_ONCE(x, val) \
-       ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
+       ({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 
 #endif /* __KERNEL__ */
 
@@ -450,7 +450,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * with an explicit memory barrier or atomic instruction that provides the
  * required ordering.
  *
- * If possible use READ_ONCE/ASSIGN_ONCE instead.
+ * If possible use READ_ONCE()/WRITE_ONCE() instead.
  */
 #define __ACCESS_ONCE(x) ({ \
         __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
index 3a6490e..703ea5c 100644 (file)
@@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock)
 extern bool osq_lock(struct optimistic_spin_queue *lock);
 extern void osq_unlock(struct optimistic_spin_queue *lock);
 
+static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+       return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+}
+
 #endif
index 26a2e61..18f1972 100644 (file)
@@ -252,7 +252,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_task_state(tsk, state_value)                       \
        do {                                                    \
                (tsk)->task_state_change = _THIS_IP_;           \
-               set_mb((tsk)->state, (state_value));            \
+               smp_store_mb((tsk)->state, (state_value));              \
        } while (0)
 
 /*
@@ -274,7 +274,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_current_state(state_value)                         \
        do {                                                    \
                current->task_state_change = _THIS_IP_;         \
-               set_mb(current->state, (state_value));          \
+               smp_store_mb(current->state, (state_value));            \
        } while (0)
 
 #else
@@ -282,7 +282,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_task_state(tsk, state_value)             \
        do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)               \
-       set_mb((tsk)->state, (state_value))
+       smp_store_mb((tsk)->state, (state_value))
 
 /*
  * set_current_state() includes a barrier so that the write of current->state
@@ -298,7 +298,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_current_state(state_value)               \
        do { current->state = (state_value); } while (0)
 #define set_current_state(state_value)                 \
-       set_mb(current->state, (state_value))
+       smp_store_mb(current->state, (state_value))
 
 #endif
 
index 08561f1..ebdb004 100644 (file)
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
        def_bool y
        depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
 
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
        bool
 
-config QUEUE_RWLOCK
-       def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+       def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+       depends on SMP
+
+config ARCH_USE_QUEUED_RWLOCKS
+       bool
+
+config QUEUED_RWLOCKS
+       def_bool y if ARCH_USE_QUEUED_RWLOCKS
        depends on SMP
index 2579e40..55ca63a 100644 (file)
@@ -2055,7 +2055,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
        /*
         * The task state is guaranteed to be set before another task can
-        * wake it. set_current_state() is implemented using set_mb() and
+        * wake it. set_current_state() is implemented using smp_store_mb() and
         * queue_me() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
index de7a416..7dd5c99 100644 (file)
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
index a0831e1..a61bb1d 100644 (file)
@@ -4066,8 +4066,7 @@ void __init lockdep_info(void)
 
 #ifdef CONFIG_DEBUG_LOCKDEP
        if (lockdep_init_error) {
-               printk("WARNING: lockdep init error! lock-%s was acquired"
-                       "before lockdep_init\n", lock_init_error);
+               printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
                printk("Call stack leading to lockdep invocation was:\n");
                print_stack_trace(&lockdep_init_trace, 0);
        }
index 75e114b..fd91aaa 100644 (file)
@@ -17,6 +17,7 @@
 struct mcs_spinlock {
        struct mcs_spinlock *next;
        int locked; /* 1 if lock acquired */
+       int count;  /* nesting count, see qspinlock.c */
 };
 
 #ifndef arch_mcs_spin_lock_contended
index f956ede..00c12bb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Queue read/write lock
+ * Queued read/write locks
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644 (file)
index 0000000..38c4920
--- /dev/null
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ *          Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES      8
+#else
+#define MAX_NODES      4
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+       u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+       BUG_ON(idx > 3);
+#endif
+       tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+       tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+       return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+       int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+       int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+       return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+       union {
+               atomic_t val;
+#ifdef __LITTLE_ENDIAN
+               struct {
+                       u8      locked;
+                       u8      pending;
+               };
+               struct {
+                       u16     locked_pending;
+                       u16     tail;
+               };
+#else
+               struct {
+                       u16     tail;
+                       u16     locked_pending;
+               };
+               struct {
+                       u8      reserved[2];
+                       u8      pending;
+                       u8      locked;
+               };
+#endif
+       };
+};
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+       atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+       u32 old, new, val = atomic_read(&lock->val);
+
+       for (;;) {
+               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+               old = atomic_cmpxchg(&lock->val, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+       return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+                                          struct mcs_spinlock *node) { }
+
+#define pv_enabled()           false
+
+#define pv_init_node           __pv_init_node
+#define pv_wait_node           __pv_wait_node
+#define pv_kick_node           __pv_kick_node
+#define pv_wait_head           __pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath      native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+       struct mcs_spinlock *prev, *next, *node;
+       u32 new, old, tail;
+       int idx;
+
+       BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+       if (pv_enabled())
+               goto queue;
+
+       if (virt_queued_spin_lock(lock))
+               return;
+
+       /*
+        * wait for in-progress pending->locked hand-overs
+        *
+        * 0,1,0 -> 0,0,1
+        */
+       if (val == _Q_PENDING_VAL) {
+               while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+                       cpu_relax();
+       }
+
+       /*
+        * trylock || pending
+        *
+        * 0,0,0 -> 0,0,1 ; trylock
+        * 0,0,1 -> 0,1,1 ; pending
+        */
+       for (;;) {
+               /*
+                * If we observe any contention; queue.
+                */
+               if (val & ~_Q_LOCKED_MASK)
+                       goto queue;
+
+               new = _Q_LOCKED_VAL;
+               if (val == new)
+                       new |= _Q_PENDING_VAL;
+
+               old = atomic_cmpxchg(&lock->val, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       /*
+        * we won the trylock
+        */
+       if (new == _Q_LOCKED_VAL)
+               return;
+
+       /*
+        * we're pending, wait for the owner to go away.
+        *
+        * *,1,1 -> *,1,0
+        *
+        * this wait loop must be a load-acquire such that we match the
+        * store-release that clears the locked bit and create lock
+        * sequentiality; this is because not all clear_pending_set_locked()
+        * implementations imply full barriers.
+        */
+       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+               cpu_relax();
+
+       /*
+        * take ownership and clear the pending bit.
+        *
+        * *,1,0 -> *,0,1
+        */
+       clear_pending_set_locked(lock);
+       return;
+
+       /*
+        * End of pending bit optimistic spinning and beginning of MCS
+        * queuing.
+        */
+queue:
+       node = this_cpu_ptr(&mcs_nodes[0]);
+       idx = node->count++;
+       tail = encode_tail(smp_processor_id(), idx);
+
+       node += idx;
+       node->locked = 0;
+       node->next = NULL;
+       pv_init_node(node);
+
+       /*
+        * We touched a (possibly) cold cacheline in the per-cpu queue node;
+        * attempt the trylock once more in the hope someone let go while we
+        * weren't watching.
+        */
+       if (queued_spin_trylock(lock))
+               goto release;
+
+       /*
+        * We have already touched the queueing cacheline; don't bother with
+        * pending stuff.
+        *
+        * p,*,* -> n,*,*
+        */
+       old = xchg_tail(lock, tail);
+
+       /*
+        * if there was a previous node; link it and wait until reaching the
+        * head of the waitqueue.
+        */
+       if (old & _Q_TAIL_MASK) {
+               prev = decode_tail(old);
+               WRITE_ONCE(prev->next, node);
+
+               pv_wait_node(node);
+               arch_mcs_spin_lock_contended(&node->locked);
+       }
+
+       /*
+        * we're at the head of the waitqueue, wait for the owner & pending to
+        * go away.
+        *
+        * *,x,y -> *,0,0
+        *
+        * this wait loop must use a load-acquire such that we match the
+        * store-release that clears the locked bit and create lock
+        * sequentiality; this is because the set_locked() function below
+        * does not imply a full barrier.
+        *
+        */
+       pv_wait_head(lock, node);
+       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+               cpu_relax();
+
+       /*
+        * claim the lock:
+        *
+        * n,0,0 -> 0,0,1 : lock, uncontended
+        * *,0,0 -> *,0,1 : lock, contended
+        *
+        * If the queue head is the only one in the queue (lock value == tail),
+        * clear the tail code and grab the lock. Otherwise, we only need
+        * to grab the lock.
+        */
+       for (;;) {
+               if (val != tail) {
+                       set_locked(lock);
+                       break;
+               }
+               old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+               if (old == val)
+                       goto release;   /* No contention */
+
+               val = old;
+       }
+
+       /*
+        * contended path; wait for next, release.
+        */
+       while (!(next = READ_ONCE(node->next)))
+               cpu_relax();
+
+       arch_mcs_spin_unlock_contended(&next->locked);
+       pv_kick_node(next);
+
+release:
+       /*
+        * release the node
+        */
+       this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef  pv_enabled
+#define pv_enabled()   true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef  queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath      __pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644 (file)
index 0000000..04ab181
--- /dev/null
@@ -0,0 +1,325 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL    (3U << _Q_LOCKED_OFFSET)
+
+enum vcpu_state {
+       vcpu_running = 0,
+       vcpu_halted,
+};
+
+struct pv_node {
+       struct mcs_spinlock     mcs;
+       struct mcs_spinlock     __res[3];
+
+       int                     cpu;
+       u8                      state;
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+       struct qspinlock *lock;
+       struct pv_node   *node;
+};
+
+#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN      (PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+       int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+       if (pv_hash_size < PV_HE_MIN)
+               pv_hash_size = PV_HE_MIN;
+
+       /*
+        * Allocate space from bootmem which should be page-size aligned
+        * and hence cacheline aligned.
+        */
+       pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+                                              sizeof(struct pv_hash_entry),
+                                              pv_hash_size, 0, HASH_EARLY,
+                                              &pv_lock_hash_bits, NULL,
+                                              pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash)                                          \
+       for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;       \
+            offset < (1 << pv_lock_hash_bits);                                         \
+            offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+       unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+       struct pv_hash_entry *he;
+
+       for_each_hash_entry(he, offset, hash) {
+               if (!cmpxchg(&he->lock, NULL, lock)) {
+                       WRITE_ONCE(he->node, node);
+                       return &he->lock;
+               }
+       }
+       /*
+        * Hard assume there is a free entry for us.
+        *
+        * This is guaranteed by ensuring every blocked lock only ever consumes
+        * a single entry, and since we only have 4 nesting levels per CPU
+        * and allocated 4*nr_possible_cpus(), this must be so.
+        *
+        * The single entry is guaranteed by having the lock owner unhash
+        * before it releases.
+        */
+       BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+       unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+       struct pv_hash_entry *he;
+       struct pv_node *node;
+
+       for_each_hash_entry(he, offset, hash) {
+               if (READ_ONCE(he->lock) == lock) {
+                       node = READ_ONCE(he->node);
+                       WRITE_ONCE(he->lock, NULL);
+                       return node;
+               }
+       }
+       /*
+        * Hard assume we'll find an entry.
+        *
+        * This guarantees a limited lookup time and is itself guaranteed by
+        * having the lock owner do the unhash -- IFF the unlock sees the
+        * SLOW flag, there MUST be a hash entry.
+        */
+       BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+       struct pv_node *pn = (struct pv_node *)node;
+
+       BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+       pn->cpu = smp_processor_id();
+       pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+       struct pv_node *pn = (struct pv_node *)node;
+       int loop;
+
+       for (;;) {
+               for (loop = SPIN_THRESHOLD; loop; loop--) {
+                       if (READ_ONCE(node->locked))
+                               return;
+                       cpu_relax();
+               }
+
+               /*
+                * Order pn->state vs pn->locked thusly:
+                *
+                * [S] pn->state = vcpu_halted    [S] next->locked = 1
+                *     MB                             MB
+                * [L] pn->locked               [RmW] pn->state = vcpu_running
+                *
+                * Matches the xchg() from pv_kick_node().
+                */
+               smp_store_mb(pn->state, vcpu_halted);
+
+               if (!READ_ONCE(node->locked))
+                       pv_wait(&pn->state, vcpu_halted);
+
+               /*
+                * Reset the vCPU state to avoid unncessary CPU kicking
+                */
+               WRITE_ONCE(pn->state, vcpu_running);
+
+               /*
+                * If the locked flag is still not set after wakeup, it is a
+                * spurious wakeup and the vCPU should wait again. However,
+                * there is a pretty high overhead for CPU halting and kicking.
+                * So it is better to spin for a while in the hope that the
+                * MCS lock will be released soon.
+                */
+       }
+       /*
+        * By now our node->locked should be 1 and our caller will not actually
+        * spin-wait for it. We do however rely on our caller to do a
+        * load-acquire for us.
+        */
+}
+
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+       struct pv_node *pn = (struct pv_node *)node;
+
+       /*
+        * Note that because node->locked is already set, this actual
+        * mcs_spinlock entry could be re-used already.
+        *
+        * This should be fine however, kicking people for no reason is
+        * harmless.
+        *
+        * See the comment in pv_wait_node().
+        */
+       if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+               pv_kick(pn->cpu);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+       struct pv_node *pn = (struct pv_node *)node;
+       struct __qspinlock *l = (void *)lock;
+       struct qspinlock **lp = NULL;
+       int loop;
+
+       for (;;) {
+               for (loop = SPIN_THRESHOLD; loop; loop--) {
+                       if (!READ_ONCE(l->locked))
+                               return;
+                       cpu_relax();
+               }
+
+               WRITE_ONCE(pn->state, vcpu_halted);
+               if (!lp) { /* ONCE */
+                       lp = pv_hash(lock, pn);
+                       /*
+                        * lp must be set before setting _Q_SLOW_VAL
+                        *
+                        * [S] lp = lock                [RmW] l = l->locked = 0
+                        *     MB                             MB
+                        * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+                        *
+                        * Matches the cmpxchg() in __pv_queued_spin_unlock().
+                        */
+                       if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+                               /*
+                                * The lock is free and _Q_SLOW_VAL has never
+                                * been set. Therefore we need to unhash before
+                                * getting the lock.
+                                */
+                               WRITE_ONCE(*lp, NULL);
+                               return;
+                       }
+               }
+               pv_wait(&l->locked, _Q_SLOW_VAL);
+
+               /*
+                * The unlocker should have freed the lock before kicking the
+                * CPU. So if the lock is still not free, it is a spurious
+                * wakeup and so the vCPU should wait again after spinning for
+                * a while.
+                */
+       }
+
+       /*
+        * Lock is unlocked now; the caller will acquire it without waiting.
+        * As with pv_wait_node() we rely on the caller to do a load-acquire
+        * for us.
+        */
+}
+
+/*
+ * PV version of the unlock function to be used in stead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+       struct pv_node *node;
+
+       /*
+        * We must not unlock if SLOW, because in that case we must first
+        * unhash. Otherwise it would be possible to have multiple @lock
+        * entries, which would be BAD.
+        */
+       if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+               return;
+
+       /*
+        * Since the above failed to release, this must be the SLOW path.
+        * Therefore start by looking up the blocked node and unhashing it.
+        */
+       node = pv_unhash(lock);
+
+       /*
+        * Now that we have a reference to the (likely) blocked pv_node,
+        * release the lock.
+        */
+       smp_store_release(&l->locked, 0);
+
+       /*
+        * At this point the memory pointed at by lock can be freed/reused,
+        * however we can still use the pv_node to kick the CPU.
+        */
+       if (READ_ONCE(node->state) == vcpu_halted)
+               pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
index b025295..30ec5b4 100644 (file)
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 }
 
 /*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
  */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
 # define rt_mutex_cmpxchg(l,c,n)       (cmpxchg(&l->owner, c, n) == c)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
@@ -1443,10 +1443,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  *
  * @lock:      the rt_mutex to be locked
  *
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
  * Returns 1 on success and 0 on contention
  */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
+       if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+               return 0;
+
        return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
index 3417d01..0f18971 100644 (file)
@@ -409,11 +409,24 @@ done:
        return taken;
 }
 
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+       return osq_is_locked(&sem->osq);
+}
+
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
        return false;
 }
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+       return false;
+}
 #endif
 
 /*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
        unsigned long flags;
 
+       /*
+        * If a spinner is present, it is not necessary to do the wakeup.
+        * Try to do wakeup only if the trylock succeeds to minimize
+        * spinlock contention which may introduce too much delay in the
+        * unlock operation.
+        *
+        *    spinning writer           up_write/up_read caller
+        *    ---------------           -----------------------
+        * [S]   osq_unlock()           [L]   osq
+        *       MB                           RMB
+        * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+        *
+        * Here, it is important to make sure that there won't be a missed
+        * wakeup while the rwsem is free and the only spinning writer goes
+        * to sleep without taking the rwsem. Even when the spinning writer
+        * is just going to break out of the waiting loop, it will still do
+        * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+        * rwsem_has_spinner() is true, it will guarantee at least one
+        * trylock attempt on the rwsem later on.
+        */
+       if (rwsem_has_spinner(sem)) {
+               /*
+                * The smp_rmb() here is to make sure that the spinner
+                * state is consulted before reading the wait_lock.
+                */
+               smp_rmb();
+               if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+                       return sem;
+               goto locked;
+       }
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
 
        /* do nothing if list empty */
        if (!list_empty(&sem->wait_list))
index 852143a..9bc8232 100644 (file)
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
         * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
         * an event.
         */
-       set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+       smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
 
        return timeout;
 }
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
         * doesn't imply write barrier and the users expects write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-        * and is paired with set_mb() in wait_woken().
+        * and is paired with smp_store_mb() in wait_woken().
         */
        smp_wmb(); /* C */
        wait->flags |= WQ_FLAG_WOKEN;