From 607904c357c61adf20b8fd18af765e501d61a385 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Jan 2017 10:26:52 -0500 Subject: locking/spinlocks: Remove the unused spin_lock_bh_nested() API The spin_lock_bh_nested() API is defined but is not used anywhere in the kernel. So all spin_lock_bh_nested() and related APIs are now removed. Signed-off-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1483975612-16447-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index db3ccb1dd614..4b082b5cac9e 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); -void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) -{ - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh_nested); - unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { -- cgit v1.2.3-59-g8ed1b From 75437bb304b20a2b350b9a8e9f9238d5e24e12ba Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Tue, 10 Jan 2017 02:56:46 -0500 Subject: locking/pvqspinlock: Don't wait if vCPU is preempted If prev node is not in running state or its vCPU is preempted, we can give up our vCPU slices in pv_wait_node() ASAP. Signed-off-by: Pan Xinhui Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: longman@redhat.com Link: http://lkml.kernel.org/r/1484035006-6787-1-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Fixed typos in the changelog, removed ugly linebreak from the code. ] Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e3b5520005db..e6b2f7ad3e51 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -263,7 +263,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running; + return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); } /* -- cgit v1.2.3-59-g8ed1b From 0039962a1473f07fd5c8355bd8264be1eb87eb3e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Jan 2017 13:43:11 -0800 Subject: kernel/exit: Compute 'current' directly This patch effectively replaces the tsk pointer dereference (which is obviously == current), to directly use get_current() macro. In this case, do_exit() always passes current to exit_mm(), hence we can simply get rid of the argument. This is also a performance win on some archs such as x86-64 and ppc64 -- arm64 is no longer an issue. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/exit.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..2385d434a46e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -468,12 +468,12 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct *tsk) +static void exit_mm(void) { - struct mm_struct *mm = tsk->mm; + struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(tsk, mm); + mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); @@ -491,7 +491,7 @@ static void exit_mm(struct task_struct *tsk) up_read(&mm->mmap_sem); - self.task = tsk; + self.task = current; self.next = xchg(&core_state->dumper.next, &self); /* * Implies mb(), the result of xchg() must be visible @@ -501,22 +501,22 @@ static void exit_mm(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); + BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; + task_lock(current); + current->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - task_unlock(tsk); + task_unlock(current); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) @@ -823,7 +823,7 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - exit_mm(tsk); + exit_mm(); if (group_dead) acct_process(); -- cgit v1.2.3-59-g8ed1b From d269a8b8c57523a2e328c1ff44fe791e13df3d37 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Jan 2017 13:43:13 -0800 Subject: kernel/locking: Compute 'current' directly This patch effectively replaces the tsk pointer dereference (which is obviously == current), to directly use get_current() macro. This is to make the removal of setting foreign task states smoother and painfully obvious. Performance win on some archs such as x86-64 and ppc64. On a microbenchmark that calls set_task_state() vs set_current_state() and an inode rwsem pounding benchmark doing unlink: == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 19 +++++++++---------- kernel/locking/rwsem-spinlock.c | 18 +++++++----------- kernel/locking/rwsem-xadd.c | 7 +++---- kernel/locking/semaphore.c | 7 +++---- 4 files changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..4c7d04362c95 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -622,7 +622,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { - struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; bool first = false; @@ -656,18 +655,18 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task); + debug_mutex_add_waiter(lock, &waiter, current); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; + waiter.task = current; if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); lock_contended(&lock->dep_map, ip); - set_task_state(task, state); + set_task_state(current, state); for (;;) { /* * Once we hold wait_lock, we're serialized against @@ -683,7 +682,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, task))) { + if (unlikely(signal_pending_state(state, current))) { ret = -EINTR; goto err; } @@ -702,7 +701,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } - set_task_state(task, state); + set_task_state(current, state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), @@ -716,9 +715,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } spin_lock_mutex(&lock->wait_lock, flags); acquired: - __set_task_state(task, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -736,8 +735,8 @@ skip_wait: return 0; err: - __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); + __set_task_state(current, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 1591f6b3539f..60d15d3bb2a8 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -128,7 +128,6 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) void __sched __down_read(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -140,13 +139,12 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); + get_task_struct(current); list_add_tail(&waiter.list, &sem->wait_list); @@ -158,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem) if (!waiter.task) break; schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); } - __set_task_state(tsk, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); out: ; } @@ -194,15 +192,13 @@ int __down_read_trylock(struct rw_semaphore *sem) int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); @@ -220,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) ret = -EINTR; goto out; } - set_task_state(tsk, state); + set_task_state(current, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 631506004f9e..d3b819c7f07f 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -224,10 +224,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; - struct task_struct *tsk = current; DEFINE_WAKE_Q(wake_q); - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; raw_spin_lock_irq(&sem->wait_lock); @@ -254,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index b8120abe594b..29aac9f9e5e4 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -204,19 +204,18 @@ struct semaphore_waiter { static inline int __sched __down_common(struct semaphore *sem, long state, long timeout) { - struct task_struct *task = current; struct semaphore_waiter waiter; list_add_tail(&waiter.list, &sem->wait_list); - waiter.task = task; + waiter.task = current; waiter.up = false; for (;;) { - if (signal_pending_state(state, task)) + if (signal_pending_state(state, current)) goto interrupted; if (unlikely(timeout <= 0)) goto timed_out; - __set_task_state(task, state); + __set_task_state(current, state); raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); -- cgit v1.2.3-59-g8ed1b From 642fa448ae6b3a4e5e8737054a094173405b7643 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Jan 2017 13:43:14 -0800 Subject: sched/core: Remove set_task_state() This is a nasty interface and setting the state of a foreign task must not be done. As of the following commit: be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()") ... everyone in the kernel calls set_task_state() with current, allowing the helper to be removed. However, as the comment indicates, it is still around for those archs where computing current is more expensive than using a pointer, at least in theory. An important arch that is affected is arm64, however this has been addressed now [1] and performance is up to par making no difference with either calls. Of all the callers, if any, it's the locking bits that would care most about this -- ie: we end up passing a tsk pointer to a lot of the lock slowpath, and setting ->state on that. The following numbers are based on two tests: a custom ad-hoc microbenchmark that just measures latencies (for ~65 million calls) between get_task_state() vs get_current_state(). Secondly for a higher overview, an unlink microbenchmark was used, which pounds on a single file with open, close,unlink combos with increasing thread counts (up to 4x ncpus). While the workload is quite unrealistic, it does contend a lot on the inode mutex or now rwsem. [1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching) and ppc64 (with paca) show similar improvements in the unlink microbenches. The small delta for ppc64 (2ms), does not represent the gains on the unlink runs. In the case of x86, there was a decent amount of variation in the latency runs, but always within a 20 to 50ms increase), ppc was more constant. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- arch/um/drivers/random.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-crypt.c | 4 ++-- drivers/md/persistent-data/dm-block-manager.c | 4 ++-- .../staging/lustre/lnet/libcfs/linux/linux-debug.c | 2 +- drivers/tty/tty_ldsem.c | 10 ++++---- include/linux/sched.h | 27 +--------------------- kernel/exit.c | 4 ++-- kernel/locking/mutex.c | 8 +++---- kernel/locking/rwsem-spinlock.c | 8 +++---- kernel/locking/rwsem-xadd.c | 4 ++-- kernel/locking/semaphore.c | 2 +- 12 files changed, 26 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 05523f14d7b2..57f03050c850 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -76,7 +76,7 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, add_sigio_fd(random_fd); add_wait_queue(&host_read_wait, &wait); - set_task_state(current, TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); schedule(); remove_wait_queue(&host_read_wait, &wait); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 84d2f0e4c754..d36d427a9efb 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -794,7 +794,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) DECLARE_WAITQUEUE(wait, current); add_wait_queue(&c->free_buffer_wait, &wait); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); dm_bufio_unlock(c); io_schedule(); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7c6c57216bf2..96692d13a6e4 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1210,14 +1210,14 @@ continue_locked: spin_unlock_irq(&cc->write_thread_wait.lock); if (unlikely(kthread_should_stop())) { - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); spin_lock_irq(&cc->write_thread_wait.lock); __remove_wait_queue(&cc->write_thread_wait, &wait); goto continue_locked; diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a6dde7cab458..758d90cc2733 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -120,7 +120,7 @@ static int __check_holder(struct block_lock *lock) static void __wait(struct waiter *w) { for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!w->task) break; @@ -128,7 +128,7 @@ static void __wait(struct waiter *w) schedule(); } - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); } static void __wake_waiter(struct waiter *w) diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c index 39a72e3f0c18..7035356e56b3 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c @@ -107,7 +107,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) libcfs_debug_dumplog(); if (libcfs_panic_on_lbug) panic("LBUG"); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (1) schedule(); } diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index 3e6722954f29..9229de43e19d 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -231,7 +231,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) /* wait to be given the lock */ for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -240,7 +240,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) timeout = schedule_timeout(timeout); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (!timeout) { /* lock timed out but check if this task was just @@ -289,14 +289,14 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout) waiter.task = current; - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); for (;;) { if (!timeout) break; raw_spin_unlock_irq(&sem->wait_lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->wait_lock); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); locked = writer_trylock(sem); if (locked) break; @@ -307,7 +307,7 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout) list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); /* lock wait may have timed out */ if (!locked) diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9ec61f7..f4f9d32f12b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -227,7 +227,7 @@ extern void proc_sched_set_task(struct task_struct *p); extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; -/* Convenience macros for the sake of set_task_state */ +/* Convenience macros for the sake of set_current_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) @@ -254,17 +254,6 @@ extern char ___assert_task_state[1 - 2*!!( #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -#define __set_task_state(tsk, state_value) \ - do { \ - (tsk)->task_state_change = _THIS_IP_; \ - (tsk)->state = (state_value); \ - } while (0) -#define set_task_state(tsk, state_value) \ - do { \ - (tsk)->task_state_change = _THIS_IP_; \ - smp_store_mb((tsk)->state, (state_value)); \ - } while (0) - #define __set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ @@ -277,20 +266,6 @@ extern char ___assert_task_state[1 - 2*!!( } while (0) #else - -/* - * @tsk had better be current, or you get to keep the pieces. - * - * The only reason is that computing current can be more expensive than - * using a pointer that's already available. - * - * Therefore, see set_current_state(). - */ -#define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -#define set_task_state(tsk, state_value) \ - smp_store_mb((tsk)->state, (state_value)) - /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to diff --git a/kernel/exit.c b/kernel/exit.c index 2385d434a46e..27c68653e2fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -501,12 +501,12 @@ static void exit_mm(void) complete(&core_state->startup); for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4c7d04362c95..97d142486f93 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -666,7 +666,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, lock_contended(&lock->dep_map, ip); - set_task_state(current, state); + set_current_state(state); for (;;) { /* * Once we hold wait_lock, we're serialized against @@ -701,7 +701,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } - set_task_state(current, state); + set_current_state(state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), @@ -715,7 +715,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } spin_lock_mutex(&lock->wait_lock, flags); acquired: - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) @@ -735,7 +735,7 @@ skip_wait: return 0; err: - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 60d15d3bb2a8..5eacab880f67 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -139,7 +139,7 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ waiter.task = current; @@ -156,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem) if (!waiter.task) break; schedule(); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); out: ; } @@ -216,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) ret = -EINTR; goto out; } - set_task_state(current, state); + set_current_state(state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index d3b819c7f07f..a3a381aee24b 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -253,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; schedule(); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 29aac9f9e5e4..9512e37637dc 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -215,7 +215,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, goto interrupted; if (unlikely(timeout <= 0)) goto timed_out; - __set_task_state(current, state); + __set_current_state(state); raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); -- cgit v1.2.3-59-g8ed1b From 8f95c90ceb541a38ac16fec48c05142ef1450c25 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Jan 2017 07:22:25 -0800 Subject: sched/wait, RCU: Introduce rcuwait machinery rcuwait provides support for (single) RCU-safe task wait/wake functionality, with the caveat that it must not be called after exit_notify(), such that we avoid racing with rcu delayed_put_task_struct callbacks, task_struct being rcu unaware in this context -- for which we similarly have task_rcu_dereference() magic, but with different return semantics, which can conflict with the wakeup side. The interfaces are quite straightforward: rcuwait_wait_event() rcuwait_wake_up() More details are in the comments, but it's perhaps worth mentioning at least, that users must provide proper serialization when waiting on a condition, and avoid corrupting a concurrent waiter. Also care must be taken between the task and the condition for when calling the wakeup -- we cannot miss wakeups. When porting users, this is for example, a given when using waitqueues in that everything is done under the q->lock. As such, it can remove sources of non preemptable unbounded work for realtime. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1484148146-14210-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/rcuwait.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/exit.c | 30 +++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 include/linux/rcuwait.h (limited to 'kernel') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h new file mode 100644 index 000000000000..0e93d56c7ab2 --- /dev/null +++ b/include/linux/rcuwait.h @@ -0,0 +1,63 @@ +#ifndef _LINUX_RCUWAIT_H_ +#define _LINUX_RCUWAIT_H_ + +#include + +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner; where it is forbidden to use + * after exit_notify(). task_struct is not properly rcu protected, + * unless dealing with rcu-aware lists, ie: find_task_by_*(). + * + * Alternatively we have task_rcu_dereference(), but the return + * semantics have different implications which would break the + * wakeup side. The only time @task is non-nil is when a user is + * blocked (or checking if it needs to) on a condition, and reset + * as soon as we know that the condition has succeeded and are + * awoken. + */ +struct rcuwait { + struct task_struct *task; +}; + +#define __RCUWAIT_INITIALIZER(name) \ + { .task = NULL, } + +static inline void rcuwait_init(struct rcuwait *w) +{ + w->task = NULL; +} + +extern void rcuwait_wake_up(struct rcuwait *w); + +/* + * The caller is responsible for locking around rcuwait_wait_event(), + * such that writes to @task are properly serialized. + */ +#define rcuwait_wait_event(w, condition) \ +({ \ + /* \ + * Complain if we are called after do_exit()/exit_notify(), \ + * as we cannot rely on the rcu critical region for the \ + * wakeup side. \ + */ \ + WARN_ON(current->exit_state); \ + \ + rcu_assign_pointer((w)->task, current); \ + for (;;) { \ + /* \ + * Implicit barrier (A) pairs with (B) in \ + * rcuwait_trywake(). \ + */ \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + \ + schedule(); \ + } \ + \ + WRITE_ONCE((w)->task, NULL); \ + __set_current_state(TASK_RUNNING); \ +}) + +#endif /* _LINUX_RCUWAIT_H_ */ diff --git a/kernel/exit.c b/kernel/exit.c index 27c68653e2fc..a9441da69e29 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -282,6 +283,35 @@ retry: return task; } +void rcuwait_wake_up(struct rcuwait *w) +{ + struct task_struct *task; + + rcu_read_lock(); + + /* + * Order condition vs @task, such that everything prior to the load + * of @task is visible. This is the condition as to why the user called + * rcuwait_trywake() in the first place. Pairs with set_current_state() + * barrier (A) in rcuwait_wait_event(). + * + * WAIT WAKE + * [S] tsk = current [S] cond = true + * MB (A) MB (B) + * [L] cond [L] tsk + */ + smp_rmb(); /* (B) */ + + /* + * Avoid using task_rcu_dereference() magic as long as we are careful, + * see comment in rcuwait_wait_event() regarding ->exit_state. + */ + task = rcu_dereference(w->task); + if (task) + wake_up_process(task); + rcu_read_unlock(); +} + struct task_struct *try_get_task_struct(struct task_struct **ptask) { struct task_struct *task; -- cgit v1.2.3-59-g8ed1b From 52b94129f274937e4c25dd17b76697664a3c43c9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Jan 2017 07:22:26 -0800 Subject: locking/percpu-rwsem: Replace waitqueue with rcuwait The use of any kind of wait queue is an overkill for pcpu-rwsems. While one option would be to use the less heavy simple (swait) flavor, this is still too much for what pcpu-rwsems needs. For one, we do not care about any sort of queuing in that the only (rare) time writers (and readers, for that matter) are queued is when trying to acquire the regular contended rw_sem. There cannot be any further queuing as writers are serialized by the rw_sem in the first place. Given that percpu_down_write() must not be called after exit_notify(), we can replace the bulky waitqueue with rcuwait such that a writer can wait for its turn to take the lock. As such, we can avoid the queue handling and locking overhead. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1484148146-14210-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 8 ++++---- kernel/locking/percpu-rwsem.c | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5b2e6159b744..93664f022ecf 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -4,15 +4,15 @@ #include #include #include -#include +#include #include #include struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; - struct rw_semaphore rw_sem; - wait_queue_head_t writer; + struct rw_semaphore rw_sem; /* slowpath */ + struct rcuwait writer; /* blocked writer */ int readers_block; }; @@ -22,7 +22,7 @@ static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ - .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \ + .writer = __RCUWAIT_INITIALIZER(name.writer), \ } extern int __percpu_down_read(struct percpu_rw_semaphore *, int); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index ce182599cf2e..883cf1b92d90 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); __init_rwsem(&sem->rw_sem, name, rwsem_key); - init_waitqueue_head(&sem->writer); + rcuwait_init(&sem->writer); sem->readers_block = 0; return 0; } @@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem) __this_cpu_dec(*sem->read_count); /* Prod writer to recheck readers_active */ - wake_up(&sem->writer); + rcuwait_wake_up(&sem->writer); } EXPORT_SYMBOL_GPL(__percpu_up_read); @@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) */ /* Wait for all now active readers to complete. */ - wait_event(sem->writer, readers_active_check(sem)); + rcuwait_wait_event(&sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -- cgit v1.2.3-59-g8ed1b From e274795ea7b7caa0fd74ef651594382a69e2a951 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2017 14:17:48 +0100 Subject: locking/mutex: Fix mutex handoff While reviewing the ww_mutex patches, I noticed that it was still possible to (incorrectly) succeed for (incorrect) code like: mutex_lock(&a); mutex_lock(&a); This was possible if the second mutex_lock() would block (as expected) but then receive a spurious wakeup. At that point it would find itself at the front of the queue, request a handoff and instantly claim ownership and continue, since owner would point to itself. Avoid this scenario and simplify the code by introducing a third low bit to signal handoff pickup. So once we request handoff, unlock clears the handoff bit and sets the pickup bit along with the new owner. This also removes the need for the .handoff argument to __mutex_trylock(), since that becomes superfluous with PICKUP. In order to guarantee enough low bits, ensure task_struct alignment is at least L1_CACHE_BYTES (which seems a good ideal regardless). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9d659ae14b54 ("locking/mutex: Add lock handoff to avoid starvation") Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 2 +- kernel/fork.c | 6 ++- kernel/locking/mutex.c | 108 ++++++++++++++++++++++++------------------------- 3 files changed, 57 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b97870f2debd..3e1fccb47f11 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -65,7 +65,7 @@ struct mutex { static inline struct task_struct *__mutex_owner(struct mutex *lock) { - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03); + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..a90510d0bbf8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -432,11 +432,13 @@ void __init fork_init(void) int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#define ARCH_MIN_TASKALIGN 0 #endif + int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - arch_task_struct_size, ARCH_MIN_TASKALIGN, + arch_task_struct_size, align, SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 97d142486f93..24284497c425 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -50,16 +50,17 @@ EXPORT_SYMBOL(__mutex_init); /* * @owner: contains: 'struct task_struct *' to the current lock owner, * NULL means not owned. Since task_struct pointers are aligned at - * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low - * bits to store extra state. + * at least L1_CACHE_BYTES, we have low bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. */ #define MUTEX_FLAG_WAITERS 0x01 #define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 -#define MUTEX_FLAGS 0x03 +#define MUTEX_FLAGS 0x07 static inline struct task_struct *__owner_task(unsigned long owner) { @@ -72,38 +73,29 @@ static inline unsigned long __owner_flags(unsigned long owner) } /* - * Actual trylock that will work on any unlocked state. - * - * When setting the owner field, we must preserve the low flag bits. - * - * Be careful with @handoff, only set that in a wait-loop (where you set - * HANDOFF) to avoid recursive lock attempts. + * Trylock variant that retuns the owning task on failure. */ -static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ unsigned long old, flags = __owner_flags(owner); + unsigned long task = owner & ~MUTEX_FLAGS; + + if (task) { + if (likely(task != curr)) + break; + + if (likely(!(flags & MUTEX_FLAG_PICKUP))) + break; - if (__owner_task(owner)) { - if (handoff && unlikely(__owner_task(owner) == current)) { - /* - * Provide ACQUIRE semantics for the lock-handoff. - * - * We cannot easily use load-acquire here, since - * the actual load is a failed cmpxchg, which - * doesn't imply any barriers. - * - * Also, this is a fairly unlikely scenario, and - * this contains the cost. - */ - smp_mb(); /* ACQUIRE */ - return true; - } - - return false; + flags &= ~MUTEX_FLAG_PICKUP; + } else { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); +#endif } /* @@ -111,15 +103,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) * past the point where we acquire it. This would be possible * if we (accidentally) set the bit on an unlocked mutex. */ - if (handoff) - flags &= ~MUTEX_FLAG_HANDOFF; + flags &= ~MUTEX_FLAG_HANDOFF; old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) - return true; + return NULL; owner = old; } + + return __owner_task(owner); +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + return !__mutex_trylock_or_owner(lock); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -171,9 +172,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait /* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE - * semantics like a regular unlock, the __mutex_trylock() provides matching - * ACQUIRE semantics for the handoff. + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * WAITERS. Provides RELEASE semantics like a regular unlock, the + * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) { @@ -184,10 +185,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; + if (task) + new |= MUTEX_FLAG_PICKUP; old = atomic_long_cmpxchg_release(&lock->owner, owner, new); if (old == owner) @@ -435,8 +439,6 @@ static bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx, const bool waiter) { - struct task_struct *task = current; - if (!waiter) { /* * The purpose of the mutex_can_spin_on_owner() function is @@ -476,24 +478,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, goto fail_unlock; } + /* Try to acquire the mutex... */ + owner = __mutex_trylock_or_owner(lock); + if (!owner) + break; + /* - * If there's an owner, wait for it to either + * There's an owner, wait for it to either * release the lock or go to sleep. */ - owner = __mutex_owner(lock); - if (owner) { - if (waiter && owner == task) { - smp_mb(); /* ACQUIRE */ - break; - } - - if (!mutex_spin_on_owner(lock, owner)) - goto fail_unlock; - } - - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, waiter)) - break; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; /* * The cpu_relax() call is a compiler barrier which forces @@ -637,7 +632,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock, false) || + if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); @@ -651,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock, false)) + if (__mutex_trylock(lock)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -674,7 +669,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * before testing the error conditions to make sure we pick up * the handoff. */ - if (__mutex_trylock(lock, first)) + if (__mutex_trylock(lock)) goto acquired; /* @@ -707,8 +702,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || - __mutex_trylock(lock, first)) + if (__mutex_trylock(lock) || + (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true))) break; spin_lock_mutex(&lock->wait_lock, flags); @@ -865,6 +860,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif if (owner & MUTEX_FLAG_HANDOFF) @@ -1003,7 +999,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock, false); + bool locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3-59-g8ed1b From 3822da3ed0676e01f83fe0518c333c8e9ba249bf Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:31 +0100 Subject: locking/ww_mutex: Extract stamp comparison to __ww_mutex_stamp_after() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function will be re-used in subsequent patches. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Wilson Cc: Andrew Morton Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-4-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 24284497c425..9ad03b9a5f7f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -281,6 +281,13 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, ww_ctx->acquired++; } +static inline bool __sched +__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ + return a->stamp - b->stamp <= LONG_MAX && + (a->stamp != b->stamp || a > b); +} + /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. @@ -597,8 +604,7 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && - (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { + if (__ww_ctx_stamp_after(ctx, hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(ctx->contending_lock); ctx->contending_lock = ww; -- cgit v1.2.3-59-g8ed1b From ea9e0fb8fe1bdfca81bd76052a5cce70bb053430 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:32 +0100 Subject: locking/ww_mutex: Set use_ww_ctx even when locking without a context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will add a new field to struct mutex_waiter. This field must be initialized for all waiters if any waiter uses the ww_use_ctx path. So there is a trade-off: Keep ww_mutex locking without a context on the faster non-use_ww_ctx path, at the cost of adding the initialization to all mutex locks (including non-ww_mutexes), or avoid the additional cost for non-ww_mutex locks, at the cost of adding additional checks to the use_ww_ctx path. We take the latter choice. It may be worth eliminating the users of ww_mutex_lock(lock, NULL), but there are a lot of them. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-5-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/ww_mutex.h | 11 ++--------- kernel/locking/mutex.c | 29 +++++++++++++++++------------ 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 7b0066814fa0..5f2e8379baff 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -222,11 +222,7 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, */ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - if (ctx) - return __ww_mutex_lock(lock, ctx); - - mutex_lock(&lock->base); - return 0; + return __ww_mutex_lock(lock, ctx); } /** @@ -262,10 +258,7 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - if (ctx) - return __ww_mutex_lock_interruptible(lock, ctx); - else - return mutex_lock_interruptible(&lock->base); + return __ww_mutex_lock_interruptible(lock, ctx); } /** diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9ad03b9a5f7f..44a64c0a851a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -469,7 +469,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, for (;;) { struct task_struct *owner; - if (use_ww_ctx && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -629,8 +629,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_mutex *ww; int ret; - if (use_ww_ctx) { - ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); + + if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -642,7 +643,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; @@ -688,7 +689,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; @@ -728,7 +729,7 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); spin_unlock_mutex(&lock->wait_lock, flags); @@ -816,8 +817,9 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); - if (!ret && ctx->acquired > 1) + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx, 1); + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; @@ -831,9 +833,10 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx, 1); - if (!ret && ctx->acquired > 1) + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; @@ -1021,7 +1024,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } @@ -1035,7 +1039,8 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } -- cgit v1.2.3-59-g8ed1b From c5470b22d1833241cae996d23a8a346ff8ec4d58 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:33 +0100 Subject: locking/ww_mutex: Remove the __ww_mutex_lock*() inline wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the documentation in the header file since there is no good place for it in mutex.c: there are two rather different implementations with different EXPORT_SYMBOLs for each function. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-6-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/ww_mutex.h | 18 ++++-------------- kernel/locking/mutex.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 5f2e8379baff..c07528522bbc 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -186,11 +186,6 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) #endif } -extern int __must_check __ww_mutex_lock(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx); -extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx); - /** * ww_mutex_lock - acquire the w/w mutex * @lock: the mutex to be acquired @@ -220,10 +215,8 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, * * A mutex acquired with this function must be released with ww_mutex_unlock. */ -static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - return __ww_mutex_lock(lock, ctx); -} +extern int __must_check ww_mutex_lock(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /** * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible @@ -255,11 +248,8 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct * * A mutex acquired with this function must be released with ww_mutex_unlock. */ -static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - return __ww_mutex_lock_interruptible(lock, ctx); -} +extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /** * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 44a64c0a851a..c696614a6b8b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -811,7 +811,7 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; @@ -824,10 +824,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock); +EXPORT_SYMBOL_GPL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; @@ -841,7 +841,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); #endif @@ -1019,7 +1019,7 @@ EXPORT_SYMBOL(mutex_trylock); #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); @@ -1031,10 +1031,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return __ww_mutex_lock_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock); +EXPORT_SYMBOL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); @@ -1046,7 +1046,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL(ww_mutex_lock_interruptible); #endif -- cgit v1.2.3-59-g8ed1b From 6baa5c60a97d7f1a37a4b5ab5fb3a450e56e8b06 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:34 +0100 Subject: locking/ww_mutex: Add waiters in stamp order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add regular waiters in stamp order. Keep adding waiters that have no context in FIFO order and take care not to starve them. While adding our task as a waiter, back off if we detect that there is a waiter with a lower stamp in front of us. Make sure to call lock_contended even when we back off early. For w/w mutexes, being first in the wait list is only stable when taking the lock without a context. Therefore, the purpose of the first flag is split into two: 'first' remains to indicate whether we want to spin optimistically, while 'handoff' indicates that we should be prepared to accept a handoff. For w/w locking with a context, we always accept handoffs after the first schedule(), to handle the following sequence of events: 1. Task #0 unlocks and hands off to Task #2 which is first in line 2. Task #1 adds itself in front of Task #2 3. Task #2 wakes up and must accept the handoff even though it is no longer first in line Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-7-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 3 ++ kernel/locking/mutex.c | 76 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3e1fccb47f11..e17942ffb3fc 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,8 @@ #include #include +struct ww_acquire_ctx; + /* * Simple, straightforward mutexes with strict semantics: * @@ -75,6 +77,7 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock) struct mutex_waiter { struct list_head list; struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c696614a6b8b..d0f7628b5a3d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -615,6 +615,52 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) return 0; } +static inline int __sched +__ww_mutex_add_waiter(struct mutex_waiter *waiter, + struct mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + struct list_head *pos; + + if (!ww_ctx) { + list_add_tail(&waiter->list, &lock->wait_list); + return 0; + } + + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. + */ + pos = &lock->wait_list; + list_for_each_entry_reverse(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { + /* Back off immediately if necessary. */ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + break; + } + + pos = &cur->list; + } + + list_add_tail(&waiter->list, pos); + return 0; +} + /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -659,15 +705,25 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, debug_mutex_lock_common(lock, &waiter); debug_mutex_add_waiter(lock, &waiter, current); - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); + lock_contended(&lock->dep_map, ip); + + if (!use_ww_ctx) { + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + } else { + /* Add in stamp order, waking up waiters that must back off. */ + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + if (ret) + goto err_early_backoff; + + waiter.ww_ctx = ww_ctx; + } + waiter.task = current; if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - lock_contended(&lock->dep_map, ip); - set_current_state(state); for (;;) { /* @@ -698,9 +754,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); - if (!first && __mutex_waiter_is_first(lock, &waiter)) { - first = true; - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + /* + * ww_mutex needs to always recheck its position since its waiter + * list is not FIFO ordered. + */ + if ((use_ww_ctx && ww_ctx) || !first) { + first = __mutex_waiter_is_first(lock, &waiter); + if (first) + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } set_current_state(state); @@ -739,6 +800,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); +err_early_backoff: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); -- cgit v1.2.3-59-g8ed1b From 200b1874401555e9b36010aa318e75cf57005f36 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:35 +0100 Subject: locking/ww_mutex: Notify waiters that have to back off while adding tasks to wait list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While adding our task as a waiter, detect if another task should back off because of us. With this patch, we establish the invariant that the wait list contains at most one (sleeping) waiter with ww_ctx->acquired > 0, and this waiter will be the first waiter with a context. Since only waiters with ww_ctx->acquired > 0 have to back off, this allows us to be much more economical with wakeups. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-8-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d0f7628b5a3d..66967219477d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -596,23 +596,34 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct mutex_waiter *cur; - if (!hold_ctx) - return 0; + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) + goto deadlock; - if (__ww_ctx_stamp_after(ctx, hold_ctx)) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must back off. + */ + cur = waiter; + list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { + if (cur->ww_ctx) + goto deadlock; } return 0; + +deadlock: +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; } static inline int __sched @@ -655,6 +666,15 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, } pos = &cur->list; + + /* + * Wake up the waiter so that it gets a chance to back + * off. + */ + if (cur->ww_ctx->acquired > 0) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } } list_add_tail(&waiter->list, pos); @@ -746,7 +766,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); if (ret) goto err; } -- cgit v1.2.3-59-g8ed1b From 659cf9f5824a12e6b50785e4e9cf1adf8a3adbd0 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:36 +0100 Subject: locking/ww_mutex: Optimize ww-mutexes by waking at most one waiter for backoff when acquiring the lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wait list is sorted by stamp order, and the only waiting task that may have to back off is the first waiter with a context. The regular slow path does not have to wake any other tasks at all, since all other waiters that would have to back off were either woken up when the waiter was added to the list, or detected the condition before they added themselves. Median timings taken of a contention-heavy GPU workload: Without this series: real 0m59.900s user 0m7.516s sys 2m16.076s With changes up to and including this patch: real 0m52.946s user 0m7.272s sys 1m55.964s Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-9-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 59 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 66967219477d..51e2ba557e2a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -288,6 +288,36 @@ __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) (a->stamp != b->stamp || a > b); } +/* + * Wake up any waiters that may have to back off when the lock is held by the + * given context. + * + * Due to the invariants on the wait list, this can only affect the first + * waiter with a context. + * + * The current task must not be on the wait list. + */ +static void __sched +__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + + lockdep_assert_held(&lock->wait_lock); + + list_for_each_entry(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (cur->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + + break; + } +} + /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. @@ -297,7 +327,6 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { unsigned long flags; - struct mutex_waiter *cur; ww_mutex_lock_acquired(lock, ctx); @@ -323,16 +352,15 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, * so they can see the new lock->ctx. */ spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } + __ww_mutex_wakeup_for_backoff(&lock->base, ctx); spin_unlock_mutex(&lock->base.wait_lock, flags); } /* - * After acquiring lock in the slowpath set ctx and wake up any - * waiters so they can recheck. + * After acquiring lock in the slowpath set ctx. + * + * Unlike for the fast path, the caller ensures that waiters are woken up where + * necessary. * * Callers must hold the mutex wait_lock. */ @@ -340,19 +368,8 @@ static __always_inline void ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - struct mutex_waiter *cur; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -719,8 +736,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock)) { + if (use_ww_ctx && ww_ctx) + __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + goto skip_wait; + } debug_mutex_lock_common(lock, &waiter); debug_mutex_add_waiter(lock, &waiter, current); -- cgit v1.2.3-59-g8ed1b From 427b18207a87f6306bd53a74e03dbe17392b0045 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Dec 2016 10:36:00 +0100 Subject: locking/mutex: Improve inlining Instead of inlining __mutex_lock_common() 5 times, once for each {state,ww} variant. Reduce this to two, ww and !ww. Then add __always_inline to mutex_optimistic_spin(), so that that will get inlined all 4 remaining times, for all {waiter,ww} variants. text data bss dec hex filename 6301 0 0 6301 189d defconfig-build/kernel/locking/mutex.o 4053 0 0 4053 fd5 defconfig-build/kernel/locking/mutex.o 4257 0 0 4257 10a1 defconfig-build/kernel/locking/mutex.o This reduces total text size and better separates the ww and !ww mutex code generation. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 85 ++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 51e2ba557e2a..43ff6110014c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -241,8 +241,8 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { #ifdef CONFIG_DEBUG_MUTEXES /* @@ -323,8 +323,7 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) * slowpath, set ctx and wake up any waiters so they can recheck. */ static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { unsigned long flags; @@ -365,8 +364,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, * Callers must hold the mutex wait_lock. */ static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; @@ -459,9 +457,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * with the spinner at the head of the OSQ, if present, until the owner is * changed to itself. */ -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { if (!waiter) { /* @@ -551,9 +549,9 @@ fail: return false; } #else -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } @@ -712,8 +710,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_mutex *ww; int ret; - ww = container_of(lock, struct ww_mutex, base); + might_sleep(); + ww = container_of(lock, struct ww_mutex, base); if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; @@ -849,13 +848,26 @@ err_early_backoff: return ret; } +static int __sched +__mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); +} + +static int __sched +__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -863,27 +875,21 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested); void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -919,9 +925,9 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx, 1); + ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -935,9 +941,9 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx, 1); + ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1060,37 +1066,34 @@ EXPORT_SYMBOL(mutex_lock_killable); static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) { - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } #endif -- cgit v1.2.3-59-g8ed1b From 25f13b4040b68dfc5a2a22e7234894e718e3f4c5 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:37 +0100 Subject: locking/ww_mutex: Re-check ww->ctx in the inner optimistic spin loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the following scenario, thread #1 should back off its attempt to lock ww1 and unlock ww2 (assuming the acquire context stamps are ordered accordingly). Thread #0 Thread #1 --------- --------- successfully lock ww2 set ww1->base.owner attempt to lock ww1 confirm ww1->ctx == NULL enter mutex_spin_on_owner set ww1->ctx What was likely to happen previously is: attempt to lock ww2 refuse to spin because ww2->ctx != NULL schedule() detect thread #0 is off CPU stop optimistic spin return -EDEADLK unlock ww2 wakeup thread #0 lock ww2 Now, we are more likely to see: detect ww1->ctx != NULL stop optimistic spin return -EDEADLK unlock ww2 successfully lock ww2 ... because thread #1 will stop its optimistic spin as soon as possible. The whole scenario is quite unlikely, since it requires thread #1 to get between thread #0 setting the owner and setting the ctx. But since we're idling here anyway, the additional check is basically free. Found by inspection. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-10-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 43ff6110014c..41b0406069e8 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -372,11 +372,14 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. + * Look out! "owner" is an entirely speculative pointer access and not + * reliable. + * + * "noinline" so that this function shows up on perf profiles. */ static noinline -bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, + struct ww_acquire_ctx *ww_ctx) { bool ret = true; @@ -399,6 +402,28 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } + if (ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + * + * Check this in every inner iteration because we may + * be racing against another thread's ww_mutex_lock. + */ + if (READ_ONCE(ww->ctx)) { + ret = false; + break; + } + } + cpu_relax(); } rcu_read_unlock(); @@ -484,22 +509,6 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, for (;;) { struct task_struct *owner; - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (READ_ONCE(ww->ctx)) - goto fail_unlock; - } - /* Try to acquire the mutex... */ owner = __mutex_trylock_or_owner(lock); if (!owner) @@ -509,7 +518,7 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * There's an owner, wait for it to either * release the lock or go to sleep. */ - if (!mutex_spin_on_owner(lock, owner)) + if (!mutex_spin_on_owner(lock, owner, ww_ctx)) goto fail_unlock; /* -- cgit v1.2.3-59-g8ed1b From c516df978df1471793aaa4809b390ecd40fa93c2 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:38 +0100 Subject: locking/ww_mutex: Optimize ww-mutexes by yielding to other waiters from optimistic spin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lock stealing is less beneficial for w/w mutexes since we may just end up backing off if we stole from a thread with an earlier acquire stamp that already holds another w/w mutex that we also need. So don't spin optimistically unless we are sure that there is no other waiter that might cause us to back off. Median timings taken of a contention-heavy GPU workload: Before: real 0m52.946s user 0m7.272s sys 1m55.964s After: real 0m53.086s user 0m7.360s sys 1m46.204s This particular workload still spends 20%-25% of CPU in mutex_spin_on_owner according to perf, but my attempts to further reduce this spinning based on various heuristics all lead to an increase in measured wall time despite the decrease in sys time. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-11-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 78 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 41b0406069e8..cd8598aa0426 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -371,6 +371,49 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline +bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + struct mutex_waiter *waiter) +{ + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + * + * Check this in every inner iteration because we may + * be racing against another thread's ww_mutex_lock. + */ + if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx)) + return false; + + /* + * If we aren't on the wait list yet, cancel the spin + * if there are waiters. We want to avoid stealing the + * lock from a waiter with an earlier stamp, since the + * other thread may already own a lock that we also + * need. + */ + if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS)) + return false; + + /* + * Similarly, stop spinning if we are no longer the + * first waiter. + */ + if (waiter && !__mutex_waiter_is_first(lock, waiter)) + return false; + + return true; +} + /* * Look out! "owner" is an entirely speculative pointer access and not * reliable. @@ -379,7 +422,7 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) */ static noinline bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) { bool ret = true; @@ -402,26 +445,9 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, break; } - if (ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - * - * Check this in every inner iteration because we may - * be racing against another thread's ww_mutex_lock. - */ - if (READ_ONCE(ww->ctx)) { - ret = false; - break; - } + if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) { + ret = false; + break; } cpu_relax(); @@ -484,7 +510,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) + const bool use_ww_ctx, struct mutex_waiter *waiter) { if (!waiter) { /* @@ -518,7 +544,7 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * There's an owner, wait for it to either * release the lock or go to sleep. */ - if (!mutex_spin_on_owner(lock, owner, ww_ctx)) + if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter)) goto fail_unlock; /* @@ -560,7 +586,7 @@ fail: #else static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) + const bool use_ww_ctx, struct mutex_waiter *waiter) { return false; } @@ -731,7 +757,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); if (__mutex_trylock(lock) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx && ww_ctx) @@ -820,7 +846,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * or we must see its unlock and acquire. */ if (__mutex_trylock(lock) || - (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true))) + (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) break; spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3-59-g8ed1b From 977625a693283dbb35b2a3f674bc0237f7347348 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:39 +0100 Subject: locking/mutex: Initialize mutex_waiter::ww_ctx with poison when debugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Help catch cases where mutex_lock is used directly on w/w mutexes, which otherwise result in the w/w tasks reading uninitialized data. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-12-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/poison.h | 1 + kernel/locking/mutex.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/poison.h b/include/linux/poison.h index 51334edec506..a39540326417 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -80,6 +80,7 @@ /********** kernel/mutexes **********/ #define MUTEX_DEBUG_INIT 0x11 #define MUTEX_DEBUG_FREE 0x22 +#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA) /********** lib/flex_array.c **********/ #define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cd8598aa0426..935116723a3d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -785,6 +785,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); + +#ifdef CONFIG_DEBUG_MUTEXES + waiter.ww_ctx = MUTEX_POISON_WW_CTX; +#endif } else { /* Add in stamp order, waking up waiters that must back off. */ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); -- cgit v1.2.3-59-g8ed1b From 0186a6cbdc6287fde65858e5d9c714dc167b8ace Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:05 +0000 Subject: locking/ww_mutex: Add ww_mutex to locktorture test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Although ww_mutexes degenerate into mutexes, it would be useful to torture the deadlock handling between multiple ww_mutexes in addition to torturing the regular mutexes. Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-3-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/locktorture.c | 73 ++++++++++++++++++++++ .../selftests/rcutorture/configs/lock/CFLIST | 1 + .../selftests/rcutorture/configs/lock/LOCK07 | 6 ++ .../selftests/rcutorture/configs/lock/LOCK07.boot | 1 + 4 files changed, 81 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK07 create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f8c5af52a131..9bffedd82884 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -372,6 +372,78 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#include +static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); + +static int torture_ww_mutex_lock(void) +__acquires(torture_ww_mutex_0) +__acquires(torture_ww_mutex_1) +__acquires(torture_ww_mutex_2) +{ + LIST_HEAD(list); + struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; + } locks[3], *ll, *ln; + struct ww_acquire_ctx ctx; + + locks[0].lock = &torture_ww_mutex_0; + list_add(&locks[0].link, &list); + + locks[1].lock = &torture_ww_mutex_1; + list_add(&locks[1].link, &list); + + locks[2].lock = &torture_ww_mutex_2; + list_add(&locks[2].link, &list); + + ww_acquire_init(&ctx, &torture_ww_class); + + list_for_each_entry(ll, &list, link) { + int err; + + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &list, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) + return err; + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &list); + } + + ww_acquire_fini(&ctx); + return 0; +} + +static void torture_ww_mutex_unlock(void) +__releases(torture_ww_mutex_0) +__releases(torture_ww_mutex_1) +__releases(torture_ww_mutex_2) +{ + ww_mutex_unlock(&torture_ww_mutex_0); + ww_mutex_unlock(&torture_ww_mutex_1); + ww_mutex_unlock(&torture_ww_mutex_2); +} + +static struct lock_torture_ops ww_mutex_lock_ops = { + .writelock = torture_ww_mutex_lock, + .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_ww_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "ww_mutex_lock" +}; + #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); @@ -793,6 +865,7 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, + &ww_mutex_lock_ops, #ifdef CONFIG_RT_MUTEXES &rtmutex_lock_ops, #endif diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index b9611c523723..41bae5824339 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -4,3 +4,4 @@ LOCK03 LOCK04 LOCK05 LOCK06 +LOCK07 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07 b/tools/testing/selftests/rcutorture/configs/lock/LOCK07 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot new file mode 100644 index 000000000000..97dadd1a9e45 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot @@ -0,0 +1 @@ +locktorture.torture_type=ww_mutex_lock -- cgit v1.2.3-59-g8ed1b From f2a5fec17395f259d54daa8833d81b00cceb15c3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:06 +0000 Subject: locking/ww_mutex: Begin kselftests for ww_mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-4-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 1 + kernel/locking/test-ww_mutex.c | 140 +++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 12 ++++ 3 files changed, 153 insertions(+) create mode 100644 kernel/locking/test-ww_mutex.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6f88e352cd4f..760158d9d98d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o +obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c new file mode 100644 index 000000000000..472d5b1f303f --- /dev/null +++ b/kernel/locking/test-ww_mutex.c @@ -0,0 +1,140 @@ +/* + * Module-based API test facility for ww_mutexes + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + */ + +#include + +#include +#include +#include +#include + +static DEFINE_WW_CLASS(ww_class); + +struct test_mutex { + struct work_struct work; + struct ww_mutex mutex; + struct completion ready, go, done; + unsigned int flags; +}; + +#define TEST_MTX_SPIN BIT(0) +#define TEST_MTX_TRY BIT(1) +#define TEST_MTX_CTX BIT(2) +#define __TEST_MTX_LAST BIT(3) + +static void test_mutex_work(struct work_struct *work) +{ + struct test_mutex *mtx = container_of(work, typeof(*mtx), work); + + complete(&mtx->ready); + wait_for_completion(&mtx->go); + + if (mtx->flags & TEST_MTX_TRY) { + while (!ww_mutex_trylock(&mtx->mutex)) + cpu_relax(); + } else { + ww_mutex_lock(&mtx->mutex, NULL); + } + complete(&mtx->done); + ww_mutex_unlock(&mtx->mutex); +} + +static int __test_mutex(unsigned int flags) +{ +#define TIMEOUT (HZ / 16) + struct test_mutex mtx; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mtx.mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); + init_completion(&mtx.ready); + init_completion(&mtx.go); + init_completion(&mtx.done); + mtx.flags = flags; + + schedule_work(&mtx.work); + + wait_for_completion(&mtx.ready); + ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL); + complete(&mtx.go); + if (flags & TEST_MTX_SPIN) { + unsigned long timeout = jiffies + TIMEOUT; + + ret = 0; + do { + if (completion_done(&mtx.done)) { + ret = -EINVAL; + break; + } + cpu_relax(); + } while (time_before(jiffies, timeout)); + } else { + ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); + } + ww_mutex_unlock(&mtx.mutex); + ww_acquire_fini(&ctx); + + if (ret) { + pr_err("%s(flags=%x): mutual exclusion failure\n", + __func__, flags); + ret = -EINVAL; + } + + flush_work(&mtx.work); + destroy_work_on_stack(&mtx.work); + return ret; +#undef TIMEOUT +} + +static int test_mutex(void) +{ + int ret; + int i; + + for (i = 0; i < __TEST_MTX_LAST; i++) { + ret = __test_mutex(i); + if (ret) + return ret; + } + + return 0; +} + +static int __init test_ww_mutex_init(void) +{ + int ret; + + ret = test_mutex(); + if (ret) + return ret; + + return 0; +} + +static void __exit test_ww_mutex_exit(void) +{ +} + +module_init(test_ww_mutex_init); +module_exit(test_ww_mutex_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index eb9e9a7870fa..5b37821632a2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1180,6 +1180,18 @@ config LOCK_TORTURE_TEST Say M if you want these torture tests to build as a module. Say N if you are unsure. +config WW_MUTEX_SELFTEST + tristate "Wait/wound mutex selftests" + help + This option provides a kernel module that runs tests on the + on the struct ww_mutex locking API. + + It is recommended to enable DEBUG_WW_MUTEX_SLOWPATH in conjunction + with this test harness. + + Say M if you want these self tests to build as a module. + Say N if you are unsure. + endmenu # lock debugging config TRACE_IRQFLAGS -- cgit v1.2.3-59-g8ed1b From c22fb3807fd0a3bdd98c13c4d2190ab064e6c09d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:07 +0000 Subject: locking/ww_mutex: Add kselftests for ww_mutex AA deadlock detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-5-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 472d5b1f303f..835fa7a1036e 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -118,6 +118,41 @@ static int test_mutex(void) return 0; } +static int test_aa(void) +{ + struct ww_mutex mutex; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + ww_mutex_lock(&mutex, &ctx); + + if (ww_mutex_trylock(&mutex)) { + pr_err("%s: trylocked itself!\n", __func__); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = ww_mutex_lock(&mutex, &ctx); + if (ret != -EALREADY) { + pr_err("%s: missed deadlock for recursing, ret=%d\n", + __func__, ret); + if (!ret) + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + ww_mutex_unlock(&mutex); + ww_acquire_fini(&ctx); + return ret; +} + static int __init test_ww_mutex_init(void) { int ret; @@ -126,6 +161,10 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = test_aa(); + if (ret) + return ret; + return 0; } -- cgit v1.2.3-59-g8ed1b From 70207686e492fb97c11d88ae7d8ebce7d2d080aa Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:08 +0000 Subject: locking/ww_mutex: Add kselftests for ww_mutex ABBA deadlock detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-6-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 835fa7a1036e..5a643ba2b020 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -153,6 +153,96 @@ out: return ret; } +struct test_abba { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex b_mutex; + struct completion a_ready; + struct completion b_ready; + bool resolve; + int result; +}; + +static void test_abba_work(struct work_struct *work) +{ + struct test_abba *abba = container_of(work, typeof(*abba), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba->b_mutex, &ctx); + + complete(&abba->b_ready); + wait_for_completion(&abba->a_ready); + + err = ww_mutex_lock(&abba->a_mutex, &ctx); + if (abba->resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba->b_mutex); + ww_mutex_lock_slow(&abba->a_mutex, &ctx); + err = ww_mutex_lock(&abba->b_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba->a_mutex); + ww_mutex_unlock(&abba->b_mutex); + ww_acquire_fini(&ctx); + + abba->result = err; +} + +static int test_abba(bool resolve) +{ + struct test_abba abba; + struct ww_acquire_ctx ctx; + int err, ret; + + ww_mutex_init(&abba.a_mutex, &ww_class); + ww_mutex_init(&abba.b_mutex, &ww_class); + INIT_WORK_ONSTACK(&abba.work, test_abba_work); + init_completion(&abba.a_ready); + init_completion(&abba.b_ready); + abba.resolve = resolve; + + schedule_work(&abba.work); + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba.a_mutex, &ctx); + + complete(&abba.a_ready); + wait_for_completion(&abba.b_ready); + + err = ww_mutex_lock(&abba.b_mutex, &ctx); + if (resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba.a_mutex); + ww_mutex_lock_slow(&abba.b_mutex, &ctx); + err = ww_mutex_lock(&abba.a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba.b_mutex); + ww_mutex_unlock(&abba.a_mutex); + ww_acquire_fini(&ctx); + + flush_work(&abba.work); + destroy_work_on_stack(&abba.work); + + ret = 0; + if (resolve) { + if (err || abba.result) { + pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } else { + if (err != -EDEADLK && abba.result != -EDEADLK) { + pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } + return ret; +} + static int __init test_ww_mutex_init(void) { int ret; @@ -165,6 +255,14 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = test_abba(false); + if (ret) + return ret; + + ret = test_abba(true); + if (ret) + return ret; + return 0; } -- cgit v1.2.3-59-g8ed1b From d1b42b800e5d09dcee52812b4396aca3a3696ba9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:09 +0000 Subject: locking/ww_mutex: Add kselftests for resolving ww_mutex cyclic deadlocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check that ww_mutexes can detect cyclic deadlocks (generalised ABBA cycles) and resolve them by lock reordering. Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-7-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 115 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5a643ba2b020..84da738e57d1 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -21,9 +21,11 @@ #include #include #include +#include #include static DEFINE_WW_CLASS(ww_class); +struct workqueue_struct *wq; struct test_mutex { struct work_struct work; @@ -243,10 +245,118 @@ static int test_abba(bool resolve) return ret; } +struct test_cycle { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex *b_mutex; + struct completion *a_signal; + struct completion b_signal; + int result; +}; + +static void test_cycle_work(struct work_struct *work) +{ + struct test_cycle *cycle = container_of(work, typeof(*cycle), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&cycle->a_mutex, &ctx); + + complete(cycle->a_signal); + wait_for_completion(&cycle->b_signal); + + err = ww_mutex_lock(cycle->b_mutex, &ctx); + if (err == -EDEADLK) { + ww_mutex_unlock(&cycle->a_mutex); + ww_mutex_lock_slow(cycle->b_mutex, &ctx); + err = ww_mutex_lock(&cycle->a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(cycle->b_mutex); + ww_mutex_unlock(&cycle->a_mutex); + ww_acquire_fini(&ctx); + + cycle->result = err; +} + +static int __test_cycle(unsigned int nthreads) +{ + struct test_cycle *cycles; + unsigned int n, last = nthreads - 1; + int ret; + + cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL); + if (!cycles) + return -ENOMEM; + + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + ww_mutex_init(&cycle->a_mutex, &ww_class); + if (n == last) + cycle->b_mutex = &cycles[0].a_mutex; + else + cycle->b_mutex = &cycles[n + 1].a_mutex; + + if (n == 0) + cycle->a_signal = &cycles[last].b_signal; + else + cycle->a_signal = &cycles[n - 1].b_signal; + init_completion(&cycle->b_signal); + + INIT_WORK(&cycle->work, test_cycle_work); + cycle->result = 0; + } + + for (n = 0; n < nthreads; n++) + queue_work(wq, &cycles[n].work); + + flush_workqueue(wq); + + ret = 0; + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + if (!cycle->result) + continue; + + pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + n, nthreads, cycle->result); + ret = -EINVAL; + break; + } + + for (n = 0; n < nthreads; n++) + ww_mutex_destroy(&cycles[n].a_mutex); + kfree(cycles); + return ret; +} + +static int test_cycle(unsigned int ncpus) +{ + unsigned int n; + int ret; + + for (n = 2; n <= ncpus + 1; n++) { + ret = __test_cycle(n); + if (ret) + return ret; + } + + return 0; +} + static int __init test_ww_mutex_init(void) { + int ncpus = num_online_cpus(); int ret; + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + ret = test_mutex(); if (ret) return ret; @@ -263,11 +373,16 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = test_cycle(ncpus); + if (ret) + return ret; + return 0; } static void __exit test_ww_mutex_exit(void) { + destroy_workqueue(wq); } module_init(test_ww_mutex_init); -- cgit v1.2.3-59-g8ed1b From 2a0c11282881875dc44f166a20eedf0d866dd0ef Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:10 +0000 Subject: locking/ww_mutex: Add kselftests for ww_mutex stress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-8-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 254 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 84da738e57d1..da6c9a34f62f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -19,8 +19,10 @@ #include #include +#include #include #include +#include #include #include @@ -348,6 +350,246 @@ static int test_cycle(unsigned int ncpus) return 0; } +struct stress { + struct work_struct work; + struct ww_mutex *locks; + int nlocks; + int nloops; +}; + +static int *get_random_order(int count) +{ + int *order; + int n, r, tmp; + + order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + if (!order) + return order; + + for (n = 0; n < count; n++) + order[n] = n; + + for (n = count - 1; n > 1; n--) { + r = get_random_int() % (n + 1); + if (r != n) { + tmp = order[n]; + order[n] = order[r]; + order[r] = tmp; + } + } + + return order; +} + +static void dummy_load(struct stress *stress) +{ + usleep_range(1000, 2000); +} + +static void stress_inorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *locks = stress->locks; + struct ww_acquire_ctx ctx; + int *order; + + order = get_random_order(nlocks); + if (!order) + return; + + ww_acquire_init(&ctx, &ww_class); + + do { + int contended = -1; + int n, err; + +retry: + err = 0; + for (n = 0; n < nlocks; n++) { + if (n == contended) + continue; + + err = ww_mutex_lock(&locks[order[n]], &ctx); + if (err < 0) + break; + } + if (!err) + dummy_load(stress); + + if (contended > n) + ww_mutex_unlock(&locks[order[contended]]); + contended = n; + while (n--) + ww_mutex_unlock(&locks[order[n]]); + + if (err == -EDEADLK) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } + + if (err) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (--stress->nloops); + + ww_acquire_fini(&ctx); + + kfree(order); + kfree(stress); +} + +struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; +}; + +static void stress_reorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + LIST_HEAD(locks); + struct ww_acquire_ctx ctx; + struct reorder_lock *ll, *ln; + int *order; + int n, err; + + order = get_random_order(stress->nlocks); + if (!order) + return; + + for (n = 0; n < stress->nlocks; n++) { + ll = kmalloc(sizeof(*ll), GFP_KERNEL); + if (!ll) + goto out; + + ll->lock = &stress->locks[order[n]]; + list_add(&ll->link, &locks); + } + kfree(order); + order = NULL; + + ww_acquire_init(&ctx, &ww_class); + + do { + list_for_each_entry(ll, &locks, link) { + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &locks, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &locks); /* restarts iteration */ + } + + dummy_load(stress); + list_for_each_entry(ll, &locks, link) + ww_mutex_unlock(ll->lock); + } while (--stress->nloops); + + ww_acquire_fini(&ctx); + +out: + list_for_each_entry_safe(ll, ln, &locks, link) + kfree(ll); + kfree(order); + kfree(stress); +} + +static void stress_one_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + int err; + + do { + err = ww_mutex_lock(lock, NULL); + if (!err) { + dummy_load(stress); + ww_mutex_unlock(lock); + } else { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (--stress->nloops); + + kfree(stress); +} + +#define STRESS_INORDER BIT(0) +#define STRESS_REORDER BIT(1) +#define STRESS_ONE BIT(2) +#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) + +static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) +{ + struct ww_mutex *locks; + int n; + + locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); + if (!locks) + return -ENOMEM; + + for (n = 0; n < nlocks; n++) + ww_mutex_init(&locks[n], &ww_class); + + for (n = 0; nthreads; n++) { + struct stress *stress; + void (*fn)(struct work_struct *work); + + fn = NULL; + switch (n & 3) { + case 0: + if (flags & STRESS_INORDER) + fn = stress_inorder_work; + break; + case 1: + if (flags & STRESS_REORDER) + fn = stress_reorder_work; + break; + case 2: + if (flags & STRESS_ONE) + fn = stress_one_work; + break; + } + + if (!fn) + continue; + + stress = kmalloc(sizeof(*stress), GFP_KERNEL); + if (!stress) + break; + + INIT_WORK(&stress->work, fn); + stress->locks = locks; + stress->nlocks = nlocks; + stress->nloops = nloops; + + queue_work(wq, &stress->work); + nthreads--; + } + + flush_workqueue(wq); + + for (n = 0; n < nlocks; n++) + ww_mutex_destroy(&locks[n]); + kfree(locks); + + return 0; +} + static int __init test_ww_mutex_init(void) { int ncpus = num_online_cpus(); @@ -377,6 +619,18 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); + if (ret) + return ret; + + ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); + if (ret) + return ret; + + ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + if (ret) + return ret; + return 0; } -- cgit v1.2.3-59-g8ed1b From 1e24edca0557dba6486d39d3c24c288475432bcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2016 17:12:23 +0100 Subject: locking/atomic, kref: Add KREF_INIT() Since we need to change the implementation, stop exposing internals. Provide KREF_INIT() to allow static initialization of struct kref. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/block/drbd/drbd_bitmap.c | 2 +- fs/fuse/fuse_i.h | 2 +- include/linux/kref.h | 2 ++ init/version.c | 4 +--- kernel/pid.c | 4 +--- 5 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index ab62b81c2ca7..dece26f119d4 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1070,7 +1070,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned .done = 0, .flags = flags, .error = 0, - .kref = { ATOMIC_INIT(2) }, + .kref = KREF_INIT(2), }; if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 91307940c8ac..052f8d3c41cb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,7 +256,7 @@ struct fuse_io_priv { #define FUSE_IO_PRIV_SYNC(f) \ { \ - .refcnt = { ATOMIC_INIT(1) }, \ + .refcnt = KREF_INIT(1), \ .async = 0, \ .file = f, \ } diff --git a/include/linux/kref.h b/include/linux/kref.h index e15828fd71f1..9af255ad1e2f 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -24,6 +24,8 @@ struct kref { atomic_t refcount; }; +#define KREF_INIT(n) { .refcount = ATOMIC_INIT(n), } + /** * kref_init - initialize object. * @kref: object in question. diff --git a/init/version.c b/init/version.c index fe41a63efed6..5606341e9efd 100644 --- a/init/version.c +++ b/init/version.c @@ -23,9 +23,7 @@ int version_string(LINUX_VERSION_CODE); #endif struct uts_namespace init_uts_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/kernel/pid.c b/kernel/pid.c index f66162f2359b..0291804151b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, -- cgit v1.2.3-59-g8ed1b From bcc9a76d5ac426bc45c9e863b1830347827ca77a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Jan 2017 21:33:35 -0500 Subject: locking/rwsem: Reinit wake_q after use In __rwsem_down_write_failed_common(), the same wake_q variable name is defined twice, with the inner wake_q hiding the one in outer scope. We can either use different names for the two wake_q's. Even better, we can use the same wake_q twice, if necessary. To enable the latter change, we need to define a new helper function wake_q_init() to enable reinitalization of wake_q after use. Signed-off-by: Waiman Long Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1485052415-9611-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/locking/rwsem-xadd.c | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f4f9d32f12b3..4e62b378bd65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,12 @@ struct wake_q_head { #define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } +static inline void wake_q_init(struct wake_q_head *head) +{ + head->first = WAKE_Q_TAIL; + head->lastp = &head->first; +} + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a3a381aee24b..2ad8d8dc3bb1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -502,8 +502,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - DEFINE_WAKE_Q(wake_q); - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock @@ -513,6 +511,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * for attempting rwsem_try_write_lock(). */ wake_up_q(&wake_q); + + /* + * Reinitialize wake_q after use. + */ + wake_q_init(&wake_q); } } else -- cgit v1.2.3-59-g8ed1b From 4009f4b3a9d8b74547269f293e6a920adf278996 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Jan 2017 11:32:34 -0500 Subject: locking/rtmutex: Flip unlikely() branch to likely() in __rt_mutex_slowlock() Running my likely/unlikely profiler for 3 weeks on two production machines, I discovered that the unlikely() test in __rt_mutex_slowlock() checking if state is TASK_INTERRUPTIBLE is hit 100% of the time, making it a very likely case. The reason is, on a vanilla kernel, the majority case of calling rt_mutex() is from the futex code. This code is always called as TASK_INTERRUPTIBLE. In the -rt patch, this code is commonly called when PREEMPT_RT is enabled with TASK_UNINTERRUPTIBLE. But that's not the likely scenario. The rt_mutex() code should be optimized for the common vanilla case, and that is from a futex, with TASK_INTERRUPTIBLE as the state. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170119113234.1efeedd1@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f443ed2320a..d340be3a488f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1179,7 +1179,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, * TASK_INTERRUPTIBLE checks for signals and * timeout. Ignored otherwise. */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { + if (likely(state == TASK_INTERRUPTIBLE)) { /* Signal pending? */ if (signal_pending(current)) ret = -EINTR; -- cgit v1.2.3-59-g8ed1b From b9c16a0e1f733c97e48798b2a9362c485bb3b731 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Jan 2017 16:06:09 +0100 Subject: locking/mutex: Fix lockdep_assert_held() fail In commit: 659cf9f5824a ("locking/ww_mutex: Optimize ww-mutexes by waking at most one waiter for backoff when acquiring the lock") I replaced a comment with a lockdep_assert_held(). However it turns out we hide that lock from lockdep for hysterical raisins, which results in the assertion always firing. Remove the old debug code as lockdep will easily spot the abuse it was meant to catch, which will make the lock visible to lockdep and make the assertion work as intended. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Nicolai Haehnle Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 659cf9f5824a ("locking/ww_mutex: Optimize ww-mutexes by waking at most one waiter for backoff when acquiring the lock") Link: http://lkml.kernel.org/r/20170117150609.GB32474@worktop Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.h | 17 ----------------- kernel/locking/mutex.c | 25 +++++++++++-------------- kernel/locking/mutex.h | 4 ---- 3 files changed, 11 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index a459faa48987..4174417d5309 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -26,20 +26,3 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); - -#define spin_lock_mutex(lock, flags) \ - do { \ - struct mutex *l = container_of(lock, struct mutex, wait_lock); \ - \ - DEBUG_LOCKS_WARN_ON(in_interrupt()); \ - local_irq_save(flags); \ - arch_spin_lock(&(lock)->rlock.raw_lock);\ - DEBUG_LOCKS_WARN_ON(l->magic != l); \ - } while (0) - -#define spin_unlock_mutex(lock, flags) \ - do { \ - arch_spin_unlock(&(lock)->rlock.raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 935116723a3d..705e06fe5e6c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -325,8 +325,6 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - unsigned long flags; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; @@ -350,9 +348,9 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * Uh oh, we raced in fastpath, wake up everyone in this case, * so they can see the new lock->ctx. */ - spin_lock_mutex(&lock->base.wait_lock, flags); + spin_lock(&lock->base.wait_lock); __ww_mutex_wakeup_for_backoff(&lock->base, ctx); - spin_unlock_mutex(&lock->base.wait_lock, flags); + spin_unlock(&lock->base.wait_lock); } /* @@ -740,7 +738,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct mutex_waiter waiter; - unsigned long flags; bool first = false; struct ww_mutex *ww; int ret; @@ -766,7 +763,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, return 0; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ @@ -830,7 +827,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); /* @@ -853,9 +850,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) break; - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); acquired: __set_current_state(TASK_RUNNING); @@ -872,7 +869,7 @@ skip_wait: if (use_ww_ctx && ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); preempt_enable(); return 0; @@ -880,7 +877,7 @@ err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); err_early_backoff: - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); preempt_enable(); @@ -999,8 +996,8 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { struct task_struct *next = NULL; - unsigned long owner, flags; DEFINE_WAKE_Q(wake_q); + unsigned long owner; mutex_release(&lock->dep_map, 1, ip); @@ -1035,7 +1032,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne owner = old; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -1052,7 +1049,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4410a4af42a3..6ebc1902f779 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -9,10 +9,6 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define spin_lock_mutex(lock, flags) \ - do { spin_lock(lock); (void)(flags); } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -- cgit v1.2.3-59-g8ed1b From f9af456a61ecfbef8233c5046a9e347c9b98ba05 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 13 Jan 2017 11:42:04 +0900 Subject: lockdep: Fix incorrect condition to print bug msgs for MAX_LOCKDEP_CHAIN_HLOCKS Bug messages and stack dump for MAX_LOCKDEP_CHAIN_HLOCKS should only be printed once. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1484275324-28192-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7c38f8f3d97b..bf6072524756 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2203,7 +2203,7 @@ cache_hit: * Important for check_no_collision(). */ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { - if (debug_locks_off_graph_unlock()) + if (!debug_locks_off_graph_unlock()) return 0; print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); -- cgit v1.2.3-59-g8ed1b From bc88c10d7e6900916f5e1ba3829d66a9de92b633 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 8 Feb 2017 14:46:48 -0500 Subject: locking/spinlock/debug: Remove spinlock lockup detection code The current spinlock lockup detection code can sometimes produce false positives because of the unfairness of the locking algorithm itself. So the lockup detection code is now removed. Instead, we are relying on the NMI watchdog to detect potential lockup. We won't have lockup detection if the watchdog isn't running. The commented-out read-write lock lockup detection code are also removed. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1486583208-11038-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock_debug.c | 86 +++-------------------------------------- 1 file changed, 5 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 0374a596cffa..9aa0fccd5d43 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -103,38 +103,14 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) lock->owner_cpu = -1; } -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - +/* + * We are now relying on the NMI watchdog to detect lockup instead of doing + * the detection here with an unfair lock which can cause problem of its own. + */ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); + arch_spin_lock(&lock->raw_lock); debug_spin_lock_after(lock); } @@ -172,32 +148,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg) #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - void do_raw_read_lock(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); @@ -247,32 +197,6 @@ static inline void debug_write_unlock(rwlock_t *lock) lock->owner_cpu = -1; } -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - void do_raw_write_lock(rwlock_t *lock) { debug_write_lock_before(lock); -- cgit v1.2.3-59-g8ed1b From 95cb64c1fe61e70685a95f6260c8e9cd219fe08c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 18 Feb 2017 15:26:45 +0100 Subject: fork: Fix task_struct alignment Stupid bug that wrecked the alignment of task_struct and causes WARN()s in the x86 FPU code on some platforms. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e274795ea7b7 ("locking/mutex: Fix mutex handoff") Link: http://lkml.kernel.org/r/20170218142645.GH6500@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a90510d0bbf8..ea33f8adc032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -434,7 +434,7 @@ void __init fork_init(void) #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif - int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", -- cgit v1.2.3-59-g8ed1b