diff options
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/qspinlock.h | 21 | ||||
-rw-r--r-- | arch/x86/include/asm/qspinlock_paravirt.h | 3 | ||||
-rw-r--r-- | include/asm-generic/atomic-long.h | 19 | ||||
-rw-r--r-- | include/asm-generic/barrier.h | 27 | ||||
-rw-r--r-- | include/asm-generic/qspinlock.h | 2 | ||||
-rw-r--r-- | include/asm-generic/qspinlock_types.h | 32 | ||||
-rw-r--r-- | include/linux/atomic.h | 2 | ||||
-rw-r--r-- | include/linux/delayacct.h | 2 | ||||
-rw-r--r-- | kernel/delayacct.c | 17 | ||||
-rw-r--r-- | kernel/locking/mcs_spinlock.h | 10 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 3 | ||||
-rw-r--r-- | kernel/locking/qspinlock.c | 247 | ||||
-rw-r--r-- | kernel/locking/qspinlock_paravirt.h | 49 | ||||
-rw-r--r-- | kernel/locking/qspinlock_stat.h | 9 | ||||
-rw-r--r-- | kernel/stop_machine.c | 24 |
16 files changed, 250 insertions, 218 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index b1ccabd0dbc3..05123dc4a1c3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8320,6 +8320,7 @@ F: Documentation/admin-guide/LSM/LoadPin.rst LOCKING PRIMITIVES M: Peter Zijlstra <peterz@infradead.org> M: Ingo Molnar <mingo@redhat.com> +M: Will Deacon <will.deacon@arm.com> L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core S: Maintained diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 5e16b5d40d32..3e70bed8a978 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -7,6 +7,14 @@ #include <asm-generic/qspinlock_types.h> #include <asm/paravirt.h> +#define _Q_PENDING_LOOPS (1 << 9) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + #define queued_spin_unlock queued_spin_unlock /** * queued_spin_unlock - release a queued spinlock @@ -16,15 +24,9 @@ */ static inline void native_queued_spin_unlock(struct qspinlock *lock) { - smp_store_release((u8 *)lock, 0); + smp_store_release(&lock->locked, 0); } -#ifdef CONFIG_PARAVIRT_SPINLOCKS -extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_init_lock_hash(void); -extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); - static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { pv_queued_spin_lock_slowpath(lock, val); @@ -40,11 +42,6 @@ static inline bool vcpu_is_preempted(long cpu) { return pv_vcpu_is_preempted(cpu); } -#else -static inline void queued_spin_unlock(struct qspinlock *lock) -{ - native_queued_spin_unlock(lock); -} #endif #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 923307ea11c7..9ef5ee03d2d7 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -22,8 +22,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); * * void __pv_queued_spin_unlock(struct qspinlock *lock) * { - * struct __qspinlock *l = (void *)lock; - * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); * * if (likely(lockval == _Q_LOCKED_VAL)) * return; diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h index 34a028a7bcc5..87d14476edc2 100644 --- a/include/asm-generic/atomic-long.h +++ b/include/asm-generic/atomic-long.h @@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define ATOMIC_LONG_PFX(x) atomic64 ## x +#define ATOMIC_LONG_TYPE s64 #else @@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define ATOMIC_LONG_PFX(x) atomic ## x +#define ATOMIC_LONG_TYPE int #endif @@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release) #define atomic_long_cmpxchg(l, old, new) \ (ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new))) + +#define atomic_long_try_cmpxchg_relaxed(l, old, new) \ + (ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \ + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new))) +#define atomic_long_try_cmpxchg_acquire(l, old, new) \ + (ATOMIC_LONG_PFX(_try_cmpxchg_acquire)((ATOMIC_LONG_PFX(_t) *)(l), \ + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new))) +#define atomic_long_try_cmpxchg_release(l, old, new) \ + (ATOMIC_LONG_PFX(_try_cmpxchg_release)((ATOMIC_LONG_PFX(_t) *)(l), \ + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new))) +#define atomic_long_try_cmpxchg(l, old, new) \ + (ATOMIC_LONG_PFX(_try_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), \ + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new))) + + #define atomic_long_xchg_relaxed(v, new) \ (ATOMIC_LONG_PFX(_xchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (new))) #define atomic_long_xchg_acquire(v, new) \ @@ -244,6 +261,8 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) #define atomic_long_inc_not_zero(l) \ ATOMIC_LONG_PFX(_inc_not_zero)((ATOMIC_LONG_PFX(_t) *)(l)) +#define atomic_long_cond_read_relaxed(v, c) \ + ATOMIC_LONG_PFX(_cond_read_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (c)) #define atomic_long_cond_read_acquire(v, c) \ ATOMIC_LONG_PFX(_cond_read_acquire)((ATOMIC_LONG_PFX(_t) *)(v), (c)) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 29458bbb2fa0..2cafdbb9ae4c 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -221,18 +221,17 @@ do { \ #endif /** - * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering + * smp_cond_load_relaxed() - (Spin) wait for cond with no ordering guarantees * @ptr: pointer to the variable to wait on * @cond: boolean expression to wait for * - * Equivalent to using smp_load_acquire() on the condition variable but employs - * the control dependency of the wait to reduce the barrier on many platforms. + * Equivalent to using READ_ONCE() on the condition variable. * * Due to C lacking lambda expressions we load the value of *ptr into a * pre-named variable @VAL to be used in @cond. */ -#ifndef smp_cond_load_acquire -#define smp_cond_load_acquire(ptr, cond_expr) ({ \ +#ifndef smp_cond_load_relaxed +#define smp_cond_load_relaxed(ptr, cond_expr) ({ \ typeof(ptr) __PTR = (ptr); \ typeof(*ptr) VAL; \ for (;;) { \ @@ -241,10 +240,26 @@ do { \ break; \ cpu_relax(); \ } \ - smp_acquire__after_ctrl_dep(); \ VAL; \ }) #endif +/** + * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering + * @ptr: pointer to the variable to wait on + * @cond: boolean expression to wait for + * + * Equivalent to using smp_load_acquire() on the condition variable but employs + * the control dependency of the wait to reduce the barrier on many platforms. + */ +#ifndef smp_cond_load_acquire +#define smp_cond_load_acquire(ptr, cond_expr) ({ \ + typeof(*ptr) _val; \ + _val = smp_cond_load_relaxed(ptr, cond_expr); \ + smp_acquire__after_ctrl_dep(); \ + _val; \ +}) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index b37b4ad7eb94..a8ed0a352d75 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -100,7 +100,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock) /* * unlock() needs release semantics: */ - (void)atomic_sub_return_release(_Q_LOCKED_VAL, &lock->val); + smp_store_release(&lock->locked, 0); } #endif diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 034acd0c4956..0763f065b975 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -29,13 +29,41 @@ #endif typedef struct qspinlock { - atomic_t val; + union { + atomic_t val; + + /* + * By using the whole 2nd least significant byte for the + * pending bit, we can allow better optimization of the lock + * acquisition for the pending bit holder. + */ +#ifdef __LITTLE_ENDIAN + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; +#else + struct { + u16 tail; + u16 locked_pending; + }; + struct { + u8 reserved[2]; + u8 pending; + u8 locked; + }; +#endif + }; } arch_spinlock_t; /* * Initializier */ -#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } +#define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0) } /* * Bitfields in the atomic value: diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 8b276fd9a127..01ce3997cb42 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -654,6 +654,7 @@ static inline int atomic_dec_if_positive(atomic_t *v) } #endif +#define atomic_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c)) #define atomic_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c)) #ifdef CONFIG_GENERIC_ATOMIC64 @@ -1075,6 +1076,7 @@ static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v } #endif +#define atomic64_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c)) #define atomic64_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c)) #include <asm-generic/atomic-long.h> diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 5e335b6203f4..e6c0448ebcc7 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -29,7 +29,7 @@ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { - spinlock_t lock; + raw_spinlock_t lock; unsigned int flags; /* Private per-task flags */ /* For each stat XXX, add following, aligned appropriately diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e2764d767f18..ca8ac2824f0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk) { tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) - spin_lock_init(&tsk->delays->lock); + raw_spin_lock_init(&tsk->delays->lock); } /* * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, + u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(lock, flags); + raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } } @@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - spin_lock_irqsave(&tsk->delays->lock, flags); + raw_spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; @@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; - spin_unlock_irqrestore(&tsk->delays->lock, flags); + raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; } @@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) __u64 ret; unsigned long flags; - spin_lock_irqsave(&tsk->delays->lock, flags); + raw_spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay + tsk->delays->swapin_delay); - spin_unlock_irqrestore(&tsk->delays->lock, flags); + raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f046b7ce9dd6..5e10153b4d3c 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -23,13 +23,15 @@ struct mcs_spinlock { #ifndef arch_mcs_spin_lock_contended /* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. + * Using smp_cond_load_acquire() provides the acquire semantics + * required so that subsequent operations happen after the + * lock is acquired. Additionally, some architectures such as + * ARM64 would like to do spin-waiting instead of purely + * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ do { \ - while (!(smp_load_acquire(l))) \ - cpu_relax(); \ + smp_cond_load_acquire(l, VAL); \ } while (0) #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2048359f33d2..f44f658ae629 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock) static __always_inline bool __mutex_trylock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; + unsigned long zero = 0UL; - if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; return false; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index d880296245c5..bfaeb05123ff 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -12,11 +12,11 @@ * GNU General Public License for more details. * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. - * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -33,6 +33,11 @@ #include <asm/qspinlock.h> /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the * MCS lock. The paper below provides a good description for this kind @@ -77,6 +82,18 @@ #endif /* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* * Per-CPU queue node structures; we can never have more than 4 nested * contexts: task, softirq, hardirq, nmi. * @@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail) #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) -/* - * By using the whole 2nd least significant byte for the pending bit, we - * can allow better optimization of the lock acquisition for the pending - * bit holder. +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure * - * This internal structure is also used by the set_locked function which - * is not restricted to _Q_PENDING_BITS == 8. + * *,1,* -> *,0,* */ -struct __qspinlock { - union { - atomic_t val; -#ifdef __LITTLE_ENDIAN - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; -#else - struct { - u16 tail; - u16 locked_pending; - }; - struct { - u8 reserved[2]; - u8 pending; - u8 locked; - }; -#endif - }; -}; +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} -#if _Q_PENDING_BITS == 8 /** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure @@ -159,9 +153,7 @@ struct __qspinlock { */ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); } /* @@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) */ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { - struct __qspinlock *l = (void *)lock; - /* - * Use release semantics to make sure that the MCS node is properly - * initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. */ - return (u32)xchg_release(&l->tail, + return (u32)xchg_relaxed(&lock->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ /** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure * @@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; /* - * Use release semantics to make sure that the MCS node is - * properly initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. */ - old = atomic_cmpxchg_release(&lock->val, val, new); + old = atomic_cmpxchg_relaxed(&lock->val, val, new); if (old == val) break; @@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) */ static __always_inline void set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); } @@ -294,86 +294,83 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; - u32 new, old, tail; + u32 old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); if (pv_enabled()) - goto queue; + goto pv_queue; if (virt_spin_lock(lock)) return; /* - * wait for in-progress pending->locked hand-overs + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. * * 0,1,0 -> 0,0,1 */ if (val == _Q_PENDING_VAL) { - while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) - cpu_relax(); + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); } /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* * trylock || pending * * 0,0,0 -> 0,0,1 ; trylock * 0,0,1 -> 0,1,1 ; pending */ - for (;;) { + val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); + if (!(val & ~_Q_LOCKED_MASK)) { /* - * If we observe any contention; queue. + * We're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - new = _Q_LOCKED_VAL; - if (val == new) - new |= _Q_PENDING_VAL; + if (val & _Q_LOCKED_MASK) { + atomic_cond_read_acquire(&lock->val, + !(VAL & _Q_LOCKED_MASK)); + } /* - * Acquire semantic is required here as the function may - * return immediately if the lock was free. + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 */ - old = atomic_cmpxchg_acquire(&lock->val, val, new); - if (old == val) - break; - - val = old; - } - - /* - * we won the trylock - */ - if (new == _Q_LOCKED_VAL) + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); return; + } /* - * we're pending, wait for the owner to go away. - * - * *,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all clear_pending_set_locked() - * implementations imply full barriers. - */ - smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); - - /* - * take ownership and clear the pending bit. - * - * *,1,0 -> *,0,1 + * If pending was clear but there are waiters in the queue, then + * we need to undo our setting of pending before we queue ourselves. */ - clear_pending_set_locked(lock); - return; + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); /* * End of pending bit optimistic spinning and beginning of MCS * queuing. */ queue: + qstat_inc(qstat_lock_slowpath, true); +pv_queue: node = this_cpu_ptr(&mcs_nodes[0]); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -400,12 +397,18 @@ queue: goto release; /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. * We have already touched the queueing cacheline; don't bother with * pending stuff. * * p,*,* -> n,*,* - * - * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -417,14 +420,8 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); - /* - * We must ensure that the stores to @node are observed before - * the write to prev->next. The address dependency from - * xchg_tail is not sufficient to ensure this because the read - * component of xchg_tail is unordered with respect to the - * initialisation of @node. - */ - smp_store_release(&prev->next, node); + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); @@ -453,8 +450,8 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_load_acquire() call. As the next PV queue head hasn't been - * designated yet, there is no way for the locked value to become + * atomic_cond_read_acquire() call. As the next PV queue head hasn't + * been designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. * @@ -464,44 +461,38 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); + val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* * claim the lock: * * n,0,0 -> 0,0,1 : lock, uncontended - * *,0,0 -> *,0,1 : lock, contended + * *,*,0 -> *,*,1 : lock, contended * - * If the queue head is the only one in the queue (lock value == tail), - * clear the tail code and grab the lock. Otherwise, we only need - * to grab the lock. + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. */ - for (;;) { - /* In the PV case we might already have _Q_LOCKED_VAL set */ - if ((val & _Q_TAIL_MASK) != tail) { - set_locked(lock); - break; - } - /* - * The smp_cond_load_acquire() call above has provided the - * necessary acquire semantics required for locking. At most - * two iterations of this loop may be ran. - */ - old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); - if (old == val) - goto release; /* No contention */ - val = old; - } + /* + * In the PV case we might already have _Q_LOCKED_VAL set. + * + * The atomic_cond_read_acquire() call above has provided the + * necessary acquire semantics required for locking. + */ + if (((val & _Q_TAIL_MASK) == tail) && + atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + + /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + set_locked(lock); /* * contended path; wait for next if not observed yet, release. */ - if (!next) { - while (!(next = READ_ONCE(node->next))) - cpu_relax(); - } + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 6ee477765e6c..5a0cf5f9008c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -56,11 +56,6 @@ struct pv_node { }; /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Hybrid PV queued/unfair lock * * By replacing the regular queued_spin_trylock() with the function below, @@ -87,8 +82,6 @@ struct pv_node { #define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - /* * Stay in unfair lock mode as long as queued mode waiters are * present in the MCS wait queue but the pending bit isn't set. @@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) int val = atomic_read(&lock->val); if (!(val & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) #if _Q_PENDING_BITS == 8 static __always_inline void set_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 1); -} - -static __always_inline void clear_pending(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 0); + WRITE_ONCE(lock->pending, 1); } /* @@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock) */ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - return !READ_ONCE(l->locked) && - (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + return !READ_ONCE(lock->locked) && + (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ @@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - static __always_inline int trylock_clear_pending(struct qspinlock *lock) { int val = atomic_read(&lock->val); @@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; /* * If the vCPU is indeed halted, advance its state to match that of @@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * the hash table later on at unlock time, no atomic instruction is * needed. */ - WRITE_ONCE(l->locked, _Q_SLOW_VAL); + WRITE_ONCE(lock->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); } @@ -428,7 +404,6 @@ static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; int waitcnt = 0; int loop; @@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_pv_lock_slowpath, true); + qstat_inc(qstat_lock_slowpath, true); for (;; waitcnt++) { /* @@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { + if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) { /* * The lock was free and now we own the lock. * Change the lock value back to _Q_LOCKED_VAL * and unhash the table. */ - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); goto gotlock; } @@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); - pv_wait(&l->locked, _Q_SLOW_VAL); + pv_wait(&lock->locked, _Q_SLOW_VAL); /* * Because of lock stealing, the queue head vCPU may not be @@ -518,7 +493,6 @@ gotlock: __visible void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { - struct __qspinlock *l = (void *)lock; struct pv_node *node; if (unlikely(locked != _Q_SLOW_VAL)) { @@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * Now that we have a reference to the (likely) blocked pv_node, * release the lock. */ - smp_store_release(&l->locked, 0); + smp_store_release(&lock->locked, 0); /* * At this point the memory pointed at by lock can be freed/reused, @@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #ifndef __pv_queued_spin_unlock __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; u8 locked; /* @@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 4a30ef63c607..6bd78c0740fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,13 +22,14 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node + * lock_pending - # of locking operations via pending code + * lock_slowpath - # of locking operations via MCS lock queue * * Writing to the "reset_counters" file will reset all the above counter * values. @@ -46,13 +47,14 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, - qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_early, qstat_pv_wait_head, qstat_pv_wait_node, + qstat_lock_pending, + qstat_lock_slowpath, qstat_num, /* Total number of statistical counters */ qstat_reset_cnts = qstat_num, }; @@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", + [qstat_lock_pending] = "lock_pending", + [qstat_lock_slowpath] = "lock_slowpath", [qstat_reset_cnts] = "reset_counters", }; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7591261652d..c25ba18274fb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -36,7 +36,7 @@ struct cpu_stop_done { struct cpu_stopper { struct task_struct *thread; - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ @@ -78,13 +78,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; bool enabled; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) __cpu_stop_queue_work(stopper, work); else if (work->done) cpu_stop_signal_done(work->done); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return enabled; } @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; retry: - spin_lock_irq(&stopper1->lock); - spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&stopper1->lock); + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) @@ -255,8 +255,8 @@ retry: __cpu_stop_queue_work(stopper1, work1); __cpu_stop_queue_work(stopper2, work2); unlock: - spin_unlock(&stopper2->lock); - spin_unlock_irq(&stopper1->lock); + raw_spin_unlock(&stopper2->lock); + raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { while (stop_cpus_in_progress) @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu) unsigned long flags; int run; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu) repeat: work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -541,7 +541,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } |