From f4dbba591945dc301c302672adefba9e2ec08dc5 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Nov 2016 13:06:39 -0800 Subject: locktorture: Fix potential memory leak with rw lock test When running locktorture module with the below commands with kmemleak enabled: $ modprobe locktorture torture_type=rw_lock_irq $ rmmod locktorture The below kmemleak got caught: root@10:~# echo scan > /sys/kernel/debug/kmemleak [ 323.197029] kmemleak: 2 new suspected memory leaks (see /sys/kernel/debug/kmemleak) root@10:~# cat /sys/kernel/debug/kmemleak unreferenced object 0xffffffc07592d500 (size 128): comm "modprobe", pid 368, jiffies 4294924118 (age 205.824s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 c3 7b 02 00 00 00 00 00 .........{...... 00 00 00 00 00 00 00 00 d7 9b 02 00 00 00 00 00 ................ backtrace: [] create_object+0x110/0x288 [] kmemleak_alloc+0x58/0xa0 [] __kmalloc+0x234/0x318 [] 0xffffff80006fa130 [] do_one_initcall+0x44/0x138 [] do_init_module+0x68/0x1cc [] load_module+0x1a68/0x22e0 [] SyS_finit_module+0xe0/0xf0 [] el0_svc_naked+0x24/0x28 [] 0xffffffffffffffff unreferenced object 0xffffffc07592d480 (size 128): comm "modprobe", pid 368, jiffies 4294924118 (age 205.824s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 3b 6f 01 00 00 00 00 00 ........;o...... 00 00 00 00 00 00 00 00 23 6a 01 00 00 00 00 00 ........#j...... backtrace: [] create_object+0x110/0x288 [] kmemleak_alloc+0x58/0xa0 [] __kmalloc+0x234/0x318 [] 0xffffff80006fa22c [] do_one_initcall+0x44/0x138 [] do_init_module+0x68/0x1cc [] load_module+0x1a68/0x22e0 [] SyS_finit_module+0xe0/0xf0 [] el0_svc_naked+0x24/0x28 [] 0xffffffffffffffff It is because cxt.lwsa and cxt.lrsa don't get freed in module_exit, so free them in lock_torture_cleanup() and free writer_tasks if reader_tasks is failed at memory allocation. Signed-off-by: Yang Shi Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f8c5af52a131..d3de04b12f8c 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -780,6 +780,10 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + + kfree(cxt.lwsa); + kfree(cxt.lrsa); + end: torture_cleanup_end(); } @@ -924,6 +928,8 @@ static int __init lock_torture_init(void) GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + kfree(writer_tasks); + writer_tasks = NULL; firsterr = -ENOMEM; goto unwind; } -- cgit v1.2.3-59-g8ed1b From 6563de9d6f1336157c9861bcd9864e0b47d65f9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 13:33:57 -0700 Subject: rcu: Abstract the dynticks momentary-idle operation This commit is the first step towards full abstraction of all accesses to the ->dynticks counter, implementing the previously open-coded atomic add of two in a new rcu_dynticks_momentary_idle() function. This abstraction will ease changes to the ->dynticks counter operation. Note that this commit gets rid of the smp_mb__before_atomic() and the smp_mb__after_atomic() calls that were previously present. The reason that this is OK from a memory-ordering perspective is that the atomic operation is now atomic_add_return(), which, as a value-returning atomic, guarantees full ordering. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb4e2056ccf3..14e283c351f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -281,6 +281,19 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * Do a double-increment of the ->dynticks counter to emulate a + * momentary idle-CPU quiescent state. + */ +static void rcu_dynticks_momentary_idle(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special = atomic_add_return(2, &rdtp->dynticks); + + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & 0x1)); +} + DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); @@ -300,7 +313,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); static void rcu_momentary_dyntick_idle(void) { struct rcu_data *rdp; - struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; @@ -327,10 +339,7 @@ static void rcu_momentary_dyntick_idle(void) * quiescent state, with no need for this CPU to do anything * further. */ - rdtp = this_cpu_ptr(&rcu_dynticks); - smp_mb__before_atomic(); /* Earlier stuff before QS. */ - atomic_add(2, &rdtp->dynticks); /* QS. */ - smp_mb__after_atomic(); /* Later stuff after QS. */ + rcu_dynticks_momentary_idle(); break; } } -- cgit v1.2.3-59-g8ed1b From 8b2f63ab05eb233b2b396e133889ce3d1d30d944 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 14:12:05 -0700 Subject: rcu: Abstract the dynticks snapshot operation This commit is the second step towards full abstraction of all accesses to the ->dynticks counter, implementing the previously open-coded atomic add of zero in a new rcu_dynticks_snap() function. This abstraction will ease changes o the ->dynticks counter operation. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 19 ++++++++++++++++--- kernel/rcu/tree_exp.h | 6 ++---- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14e283c351f6..805d55ee0b2a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -281,6 +281,17 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * Snapshot the ->dynticks counter with full ordering so as to allow + * stable comparison of this counter with past and future snapshots. + */ +static int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + int snap = atomic_add_return(0, &rdtp->dynticks); + + return snap; +} + /* * Do a double-increment of the ->dynticks counter to emulate a * momentary idle-CPU quiescent state. @@ -1049,7 +1060,9 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return atomic_read(&rdtp->dynticks) & 0x1; } /** @@ -1132,7 +1145,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); @@ -1157,7 +1170,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, int *rcrmp; unsigned int snap; - curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); + curr = (unsigned int)rcu_dynticks_snap(rdp->dynticks); snap = (unsigned int)rdp->dynticks_snap; /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e59e1849b89a..011f626b2fd8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -356,10 +356,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, mask_ofl_test = 0; for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); rdp->exp_dynticks_snap = - atomic_add_return(0, &rdtp->dynticks); + rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) @@ -380,12 +379,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (atomic_add_return(0, &rdtp->dynticks) != + if (rcu_dynticks_snap(rdp->dynticks) != rdp->exp_dynticks_snap) { mask_ofl_test |= mask; continue; -- cgit v1.2.3-59-g8ed1b From 7c6094db591b320332441e5f169156a4255b2180 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 2 Nov 2016 17:30:02 +0100 Subject: rcu: update: Make RCU_EXPEDITE_BOOT be the default RCU_EXPEDITE_BOOT should speed up the boot process by enforcing synchronize_rcu_expedited() instead of synchronize_rcu() during the boot process. There should be no reason why one does not want this and there is no need worry about real time latency at this point. Therefore make it default. Note that users wishing to avoid expediting entirely, for example when bringing up new hardware possibly having flaky IPIs, can use the rcu_normal boot parameter to override boot-time expediting. Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Reworded commit log. ] Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- init/Kconfig | 13 ------------- kernel/rcu/update.c | 6 ++---- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..96e6d56acd50 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL endchoice -config RCU_EXPEDITE_BOOT - bool - default n - help - This option enables expedited grace periods at boot time, - as if rcu_expedite_gp() had been invoked early in boot. - The corresponding rcu_unexpedite_gp() is invoked from - rcu_end_inkernel_boot(), which is intended to be invoked - at the end of the kernel-only boot sequence, just before - init is exec'ed. - - Accept the default if unsure. - endmenu # "RCU Subsystem" config BUILD_BIN2C diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4f6db7e6a117..9e03db9ea9c0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); */ void rcu_end_inkernel_boot(void) { - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); } -- cgit v1.2.3-59-g8ed1b From 4d4f88fa235f7f9ef8213564dc1804144332238b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 Nov 2016 04:17:15 -0700 Subject: lockdep: Make RCU suspicious-access splats use pr_err This commit switches RCU suspicious-access splats use pr_err() instead of the current INFO printk()s. This change makes it easier to automatically classify splats. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7c38f8f3d97b..d9a698e8458f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4412,13 +4412,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ printk("\n"); - printk("===============================\n"); - printk("[ INFO: suspicious RCU usage. ]\n"); + pr_err("===============================\n"); + pr_err("[ ERR: suspicious RCU usage. ]\n"); print_kernel_ident(); - printk("-------------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_err("-------------------------------\n"); + pr_err("%s:%d %s!\n", file, line, s); + pr_err("\nother info that might help us debug this:\n\n"); + pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() -- cgit v1.2.3-59-g8ed1b From 907565337ebf998a68cb5c5b2174ce5e5da065eb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 3 Nov 2016 10:29:28 -0600 Subject: Fix: Disable sys_membarrier when nohz_full is enabled Userspace applications should be allowed to expect the membarrier system call with MEMBARRIER_CMD_SHARED command to issue memory barriers on nohz_full CPUs, but synchronize_sched() does not take those into account. Given that we do not want unrelated processes to be able to affect real-time sensitive nohz_full CPUs, simply return ENOSYS when membarrier is invoked on a kernel with enabled nohz_full CPUs. Signed-off-by: Mathieu Desnoyers CC: Josh Triplett CC: Steven Rostedt CC: [3.10+] Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Chris Metcalf Cc: Rik van Riel Acked-by: Lai Jiangshan Reviewed-by: Josh Triplett --- kernel/membarrier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/membarrier.c b/kernel/membarrier.c index 536c727a56e9..9f9284f37f8d 100644 --- a/kernel/membarrier.c +++ b/kernel/membarrier.c @@ -16,6 +16,7 @@ #include #include +#include /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, @@ -51,6 +52,9 @@ */ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) { + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -ENOSYS; if (unlikely(flags)) return -EINVAL; switch (cmd) { -- cgit v1.2.3-59-g8ed1b From c4402b27f1778fad0dbc27b2d88bb06ca22a06f0 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 9 Nov 2016 17:57:13 +0900 Subject: rcu: Only dump stalled-tasks stacks if there was a real stall The print_other_cpu_stall() function currently unconditionally invokes rcu_print_detail_task_stall(). This is OK because if there was a stall sufficient to cause print_other_cpu_stall() to be invoked, that stall is very likely to persist through the entire print_other_cpu_stall() execution. However, if the stall did not persist, the variable ndetected will be zero, and that variable is already tested in an "if" statement. Therefore, this commit moves the call to rcu_print_detail_task_stall() under that pre-existing "if" to improve readability, with a very rare reduction in overhead. Signed-off-by: Byungchul Park [ paulmck: Reworked commit log. ] Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb4e2056ccf3..6232d2f9a84e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1379,6 +1379,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(rsp); } else { if (READ_ONCE(rsp->gpnum) != gpnum || READ_ONCE(rsp->completed) == gpnum) { @@ -1395,9 +1398,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } } - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); - rcu_check_gp_kthread_starvation(rsp); panic_on_rcu_stall(); -- cgit v1.2.3-59-g8ed1b From 2535db485cad8f302196af05888721ce82528978 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Nov 2016 11:00:44 -0800 Subject: rcu: Remove unneeded rcu_process_callbacks() declarations The declarations of __rcu_process_callbacks() and rcu_process_callbacks() are not needed, as the definition of both of these functions appear before any uses. This commit therefore removes both declarations. Reported-by: "Ahmed, Iftekhar" Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tiny.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b23a4d076f3d..fa6a48d3917b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -41,8 +41,6 @@ /* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_ctrlblk *rcp); -- cgit v1.2.3-59-g8ed1b From 94060d2235cf419f2d46dbce95727e6f826b233c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 11:13:07 +0200 Subject: rcu: Remove unused but set variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 7ec99de36f40 ("rcu: Provide exact CPU-online tracking for RCU"), the variable mask in rcu_init_percpu_data is set but no longer used. Remove it to fix the following warning when building with 'W=1': kernel/rcu/tree.c: In function ‘rcu_init_percpu_data’: kernel/rcu/tree.c:3765:16: warning: variable ‘mask’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6232d2f9a84e..83bf054e194e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3765,7 +3765,6 @@ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3788,7 +3787,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) * of the next grace period. */ rnp = rdp->mynode; - mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); -- cgit v1.2.3-59-g8ed1b From 28053bc72c0e588c577557ce9e95a6cb65078471 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Dec 2016 11:31:31 -0800 Subject: rcu: Add long-term CPU kicking This commit prepares for the removal of short-term CPU kicking (in a subsequent commit). It does so by starting to invoke resched_cpu() for each holdout at each force-quiescent-state interval that is more than halfway through the stall-warning interval. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 83bf054e194e..0e61b62e3f4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1225,6 +1225,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + /* + * If more than halfway to RCU CPU stall-warning time, do + * a resched_cpu() to try to loosen things up a bit. + */ + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + resched_cpu(rdp->cpu); return 0; } -- cgit v1.2.3-59-g8ed1b From b201fa67371862229f27a1f022196423aa5c7381 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Nov 2016 00:07:23 -0800 Subject: rcu: Remove short-term CPU kicking Commit 4914950aaa12d ("rcu: Stop treating in-kernel CPU-bound workloads as errors") added a (relatively) short-timeout call to resched_cpu(). This was inspired by as issue that was fixed by b7e7ade34e61 ("sched/core: Fix remote wakeups"). But given that this issue was fixed, it is time for the current commit to remove this call to resched_cpu(). Reported-by: Byungchul Park Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0e61b62e3f4a..5a4aaad75e76 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1220,11 +1220,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } - /* And if it has been a really long time, kick the CPU as well. */ - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ /* * If more than halfway to RCU CPU stall-warning time, do * a resched_cpu() to try to loosen things up a bit. -- cgit v1.2.3-59-g8ed1b From 7aa92230c9e86b2150f718185f70e0af592e290b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Nov 2016 05:49:06 -0800 Subject: rcu: Once again use NMI-based stack traces in stall warnings This commit is for all intents and purposes a revert of bc1dce514e9b ("rcu: Don't use NMIs to dump other CPUs' stacks"). The reason to suppose that this can now safely be reverted is the presence of 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI"), which is said to have made NMI-based stack dumps safe. However, this reversion keeps one nice property of bc1dce514e9b ("rcu: Don't use NMIs to dump other CPUs' stacks"), namely that only those CPUs blocking the grace period are dumped. The new trigger_single_cpu_backtrace() is used to make this happen, as suggested by Josh Poimboeuf. Reported-by: Vince Weaver Signed-off-by: Paul E. McKenney Cc: Petr Mladek Cc: Peter Zijlstra Reviewed-by: Josh Poimboeuf Reviewed-by: Petr Mladek Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5a4aaad75e76..d7b63b88434b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1278,7 +1278,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1288,11 +1291,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } -- cgit v1.2.3-59-g8ed1b From 630c7ed9ca0608912fa7c8591d05dfc8742dc9e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Dec 2016 15:37:47 -0800 Subject: rcu: Don't wake rcuc/X kthreads on NOCB CPUs Chris Friesen notice that rcuc/X kthreads were consuming CPU even on NOCB CPUs. This makes no sense because the only purpose or these kthreads is to invoke normal (non-offloaded) callbacks, of which there will never be any on NOCB CPUs. This problem was due to a bug in cpu_has_callbacks_ready_to_invoke(), which should have been checking ->nxttail[RCU_NEXT_TAIL] for NULL, but which was instead (incorrectly) checking ->nxttail[RCU_DONE_TAIL]. Because ->nxttail[RCU_DONE_TAIL] is never NULL, the only effect is to cause the rcuc/X kthread to execute when it should not do so. This commit therefore checks ->nxttail[RCU_NEXT_TAIL], which is NULL for NOCB CPUs. Reported-by: Chris Friesen Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d7b63b88434b..be2301238a23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -611,7 +611,7 @@ static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_DONE_TAIL] != NULL; + rdp->nxttail[RCU_NEXT_TAIL] != NULL; } /* -- cgit v1.2.3-59-g8ed1b From 09e2db37ec6d22bc27e2ba94ab4889541567f52e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Dec 2016 13:31:02 -0800 Subject: rcu: Add comment headers to expedited-grace-period counter functions These functions (rcu_exp_gp_seq_start(), rcu_exp_gp_seq_end(), rcu_exp_gp_seq_snap(), and rcu_exp_gp_seq_done() seemed too obvious to comment when written, but not so much when being documented. This commit therefore adds header comments to each of them. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_exp.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e59e1849b89a..303df97bbfc5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,16 +20,26 @@ * Authors: Paul E. McKenney */ -/* Wrapper functions for expedited grace periods. */ +/* + * Record the start of an expedited grace period. + */ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) { rcu_seq_start(&rsp->expedited_sequence); } + +/* + * Record the end of an expedited grace period. + */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } + +/* + * Take a snapshot of the expedited-grace-period counter. + */ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { unsigned long s; @@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); return s; } + +/* + * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true + * if a full expedited grace period has elapsed since that snapshot + * was taken. + */ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { return rcu_seq_done(&rsp->expedited_sequence, s); -- cgit v1.2.3-59-g8ed1b From fdbb9b315ce40922f3a8d2b8352776d7bc963d84 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Dec 2016 07:17:58 -0800 Subject: rcu: Make rcu_cpu_starting() use its "cpu" argument The rcu_cpu_starting() function uses this_cpu_ptr() to locate the incoming CPU's rcu_data structure. This works for the boot CPU and for all CPUs onlined after rcu_init() executes (during very early boot). Currently, this is the full set of CPUs, so all is well. But if anyone ever parallelizes boot before rcu_init() time, it will fail. This commit therefore substitutes the rcu_cpu_starting() function's this_cpu_pointer() for per_cpu_ptr(), future-proofing the code and (arguably) improving readability. This commit inadvertently fixes a latent bug: If there ever had been more than just the boot CPU online at rcu_init() time, the old code would not initialize the non-boot CPUs, but rather would repeatedly initialize the boot CPU. Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index be2301238a23..a4b4762442bb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3873,7 +3873,7 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); + rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); -- cgit v1.2.3-59-g8ed1b From 9831ce3bb4f8b8f18f6e9719b64ba4e727d70fb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jan 2017 14:24:24 -0800 Subject: rcu: Fix comment in rcu_organize_nocb_kthreads() It used to be that the rcuo callback-offload kthreads were spawned in rcu_organize_nocb_kthreads(), and the comment before the "for" loop says as much. However, this spawning has long since moved to the CPU-hotplug code, so this commit fixes this comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..2f5541f6f031 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2366,8 +2366,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) } /* - * Each pass through this loop sets up one rcu_data structure and - * spawns one rcu_nocb_kthread(). + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); -- cgit v1.2.3-59-g8ed1b From bb4e2c08bbfa8eb032db7814f6100086aac102d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jan 2017 22:04:22 -0800 Subject: rcu: Eliminate unused expedited_normal counter Expedited grace periods no longer fall back to normal grace periods in response to lock contention, given that expedited grace periods now use the rcu_node tree so as to avoid contention. This commit therfore removes the expedited_normal counter. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 5 +---- kernel/rcu/tree.h | 1 - kernel/rcu/tree_trace.c | 3 +-- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 00a3a38b375a..6549012033f9 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,7 +237,7 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872 These fields are as follows: @@ -249,9 +249,6 @@ o "wd1", "wd2", and "wd3" are the number of times that an attempt completed an expedited grace period that satisfies the attempted request. "Our work is done." -o "n" is number of times that a concurrent CPU-hotplug operation - forced a fallback to a normal grace period. - o "enq" is the number of quiescent states still outstanding. o "sc" is the number of times that the attempt to start a diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..8f750dffb0dd 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -521,7 +521,6 @@ struct rcu_state { struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..2e932cd1da31 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -194,9 +194,8 @@ static int show_rcuexp(struct seq_file *m, void *v) s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s0, s1, s2, s3, - atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; -- cgit v1.2.3-59-g8ed1b From 8dc79888a792f6c365c2a26903e49ff919e72488 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Jan 2017 15:25:55 -0800 Subject: rcu: Add lockdep checks to synchronous expedited primitives The non-expedited synchronize_*rcu() primitives have lockdep checks, but their expedited counterparts lack these checks. This commit therefore adds these checks to the expedited synchronize_*rcu() primitives. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_exp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 303df97bbfc5..f3e214898e3a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -639,6 +639,11 @@ void synchronize_sched_expedited(void) { struct rcu_state *rsp = &rcu_sched_state; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched_expedited() in RCU read-side critical section"); + /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; @@ -708,6 +713,11 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); -- cgit v1.2.3-59-g8ed1b From 2625d469baeef3aabdfe122572e00c517e2d9451 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 14:23:30 -0700 Subject: rcu: Abstract dynticks extended quiescent state enter/exit operations This commit is the third step towards full abstraction of all accesses to the ->dynticks counter, implementing the previously open-coded atomic add of 1 and entry checks in a new rcu_dynticks_eqs_enter() function, and the same but with exit checks in a new rcu_dynticks_eqs_exit() function. This abstraction will ease changes to the ->dynticks counter operation. Note that this commit gets rid of the smp_mb__before_atomic() and the smp_mb__after_atomic() calls that were previously present. The reason that this is OK from a memory-ordering perspective is that the atomic operation is now atomic_add_return(), which, as a value-returning atomic, guarantees full ordering. Signed-off-by: Paul E. McKenney [ paulmck: Fixed RCU_TRACE() statements added by this commit. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 88 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 805d55ee0b2a..3169d5a21b55 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -281,6 +281,61 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * Record entry into an extended quiescent state. This is only to be + * called when not already in an extended quiescent state. + */ +static void rcu_dynticks_eqs_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + + /* + * CPUs seeing atomic_inc_return() must see prior RCU read-side + * critical sections, and we also must force ordering with the + * next idle sojourn. + */ + special = atomic_inc_return(&rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); +} + +/* + * Record exit from an extended quiescent state. This is only to be + * called from an extended quiescent state. + */ +static void rcu_dynticks_eqs_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + + /* + * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * and we also must force ordering with the next RCU read-side + * critical section. + */ + special = atomic_inc_return(&rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); +} + +/* + * Reset the current CPU's ->dynticks counter to indicate that the + * newly onlined CPU is no longer in an extended quiescent state. + * This will either leave the counter unchanged, or increment it + * to the next non-quiescent value. + * + * The non-atomic test/increment sequence works because the upper bits + * of the ->dynticks counter are manipulated only by the corresponding CPU, + * or when the corresponding CPU is offline. + */ +static void rcu_dynticks_eqs_online(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + if (atomic_read(&rdtp->dynticks) & 0x1) + return; + atomic_add(0x1, &rdtp->dynticks); +} + /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. @@ -693,7 +748,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -712,12 +767,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); /* @@ -846,15 +896,10 @@ void rcu_irq_exit_irqson(void) */ static void rcu_eqs_exit_common(long long oldval, int user) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) rcu_dynticks_task_exit(); - smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -1001,11 +1046,7 @@ void rcu_nmi_enter(void) * period (observation due to Andy Lutomirski). */ if (!(atomic_read(&rdtp->dynticks) & 0x1)) { - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* atomic_inc() before later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_dynticks_eqs_exit(); incby = 1; } rdtp->dynticks_nmi_nesting += incby; @@ -1043,11 +1084,7 @@ void rcu_nmi_exit(void) /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ rdtp->dynticks_nmi_nesting = 0; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); } /** @@ -3800,8 +3837,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* -- cgit v1.2.3-59-g8ed1b From 02a5c550b2738f2bfea8e1e00aa75944d71c9e18 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 17:25:06 -0700 Subject: rcu: Abstract extended quiescent state determination This commit is the fourth step towards full abstraction of all accesses to the ->dynticks counter, implementing previously open-coded checks and comparisons in new rcu_dynticks_in_eqs() and rcu_dynticks_in_eqs_since() functions. This abstraction will ease changes to the ->dynticks counter operation. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 6 ++++++ kernel/rcu/tree.c | 52 +++++++++++++++++++++++++++++++++++------------- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_exp.h | 6 +++--- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 2 +- 6 files changed, 51 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ac81e4063b40..4f9b2fa2173d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,6 +27,12 @@ #include +struct rcu_dynticks; +static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + return 0; +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3169d5a21b55..8b970319c75b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -336,17 +336,48 @@ static void rcu_dynticks_eqs_online(void) atomic_add(0x1, &rdtp->dynticks); } +/* + * Is the current CPU in an extended quiescent state? + * + * No ordering, as we are sampling CPU-local information. + */ +bool rcu_dynticks_curr_cpu_in_eqs(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return !(atomic_read(&rdtp->dynticks) & 0x1); +} + /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -static int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); return snap; } +/* + * Return true if the snapshot returned from rcu_dynticks_snap() + * indicates that RCU is in an extended quiescent state. + */ +static bool rcu_dynticks_in_eqs(int snap) +{ + return !(snap & 0x1); +} + +/* + * Return true if the CPU corresponding to the specified rcu_dynticks + * structure has spent some time in an extended quiescent state since + * rcu_dynticks_snap() returned the specified snapshot. + */ +static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +{ + return snap != rcu_dynticks_snap(rdtp); +} + /* * Do a double-increment of the ->dynticks counter to emulate a * momentary idle-CPU quiescent state. @@ -1045,7 +1076,7 @@ void rcu_nmi_enter(void) * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + if (rcu_dynticks_curr_cpu_in_eqs()) { rcu_dynticks_eqs_exit(); incby = 1; } @@ -1071,7 +1102,7 @@ void rcu_nmi_exit(void) * to us!) */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so @@ -1097,9 +1128,7 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - return atomic_read(&rdtp->dynticks) & 0x1; + return !rcu_dynticks_curr_cpu_in_eqs(); } /** @@ -1184,7 +1213,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - if ((rdp->dynticks_snap & 0x1) == 0) { + if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) @@ -1203,12 +1232,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - unsigned int curr; int *rcrmp; - unsigned int snap; - - curr = (unsigned int)rcu_dynticks_snap(rdp->dynticks); - snap = (unsigned int)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1218,7 +1242,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; @@ -3807,7 +3831,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..3b953dcf6afc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -595,6 +595,8 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp); + #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 011f626b2fd8..e155a465cf84 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -360,7 +360,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rdp->exp_dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - !(rdp->exp_dynticks_snap & 0x1) || + rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -383,8 +383,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (rcu_dynticks_snap(rdp->dynticks) != - rdp->exp_dynticks_snap) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..652209589adf 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], ticks_value, ticks_title, - atomic_read(&rdtp->dynticks) & 0xfff, + rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..b833cd0a29e8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -124,7 +124,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), + rcu_dynticks_snap(rdp->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -- cgit v1.2.3-59-g8ed1b From 3a19b46a5c17b12ef0691df19c676ba3da330a57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2016 11:21:21 -0800 Subject: rcu: Check cond_resched_rcu_qs() state less often to reduce GP overhead Commit 4a81e8328d37 ("rcu: Reduce overhead of cond_resched() checks for RCU") moved quiescent-state generation out of cond_resched() and commit bde6c3aa9930 ("rcu: Provide cond_resched_rcu_qs() to force quiescent states in long loops") introduced cond_resched_rcu_qs(), and commit 5cd37193ce85 ("rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors") introduced the per-CPU rcu_qs_ctr variable, which is frequently polled by the RCU core state machine. This frequent polling can increase grace-period rate, which in turn increases grace-period overhead, which is visible in some benchmarks (for example, the "open1" benchmark in Anton Blanchard's "will it scale" suite). This commit therefore reduces the rate at which rcu_qs_ctr is polled by moving that polling into the force-quiescent-state (FQS) machinery, and by further polling it only after the grace period has been in effect for at least jiffies_till_sched_qs jiffies. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 10 +++++----- kernel/rcu/tree.c | 46 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9d4f9b3a2b7b..e3facb356838 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -385,11 +385,11 @@ TRACE_EVENT(rcu_quiescent_state_report, /* * Tracepoint for quiescent states detected by force_quiescent_state(). - * These trace events include the type of RCU, the grace-period number - * that was blocked by the CPU, the CPU itself, and the type of quiescent - * state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, - * or "kick" when kicking a CPU that has been in dyntick-idle mode for - * too long. + * These trace events include the type of RCU, the grace-period number that + * was blocked by the CPU, the CPU itself, and the type of quiescent state, + * which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, "kick" + * when kicking a CPU that has been in dyntick-idle mode for too long, or + * "rqc" if the CPU got a quiescent state via its rcu_qs_ctr. */ TRACE_EVENT(rcu_fqs, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b970319c75b..d8245cbd08f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1232,7 +1232,10 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { + unsigned long jtsq; int *rcrmp; + unsigned long rjtsc; + struct rcu_node *rnp; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1248,6 +1251,31 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, return 1; } + /* Compute and saturate jiffies_till_sched_qs. */ + jtsq = jiffies_till_sched_qs; + rjtsc = rcu_jiffies_till_stall_check(); + if (jtsq > rjtsc / 2) { + WRITE_ONCE(jiffies_till_sched_qs, rjtsc); + jtsq = rjtsc / 2; + } else if (jtsq < 1) { + WRITE_ONCE(jiffies_till_sched_qs, 1); + jtsq = 1; + } + + /* + * Has this CPU encountered a cond_resched_rcu_qs() since the + * beginning of the grace period? For this to be the case, + * the CPU has to have noticed the current grace period. This + * might not be the case for nohz_full CPUs looping in the kernel. + */ + rnp = rdp->mynode; + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + return 1; + } + /* * Check for the CPU being offline, but only if the grace period * is old enough. We don't need to worry about the CPU changing @@ -1290,9 +1318,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * warning delay. */ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched)) { if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { WRITE_ONCE(rdp->cond_resched_completed, READ_ONCE(rdp->mynode->completed)); @@ -2550,10 +2577,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || - rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || - rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2608,8 +2633,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) + if (rdp->cpu_no_qs.b.norm) return; /* @@ -3563,9 +3587,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs.b.norm || - rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { + } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; return 1; } -- cgit v1.2.3-59-g8ed1b From 38d30b336ccf8ee98e0e494a13738a0fade5a5e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Dec 2016 09:55:27 -0800 Subject: rcu: Adjust FQS offline checks for exact online-CPU detection Commit 7ec99de36f40 ("rcu: Provide exact CPU-online tracking for RCU"), as its title suggests, got rid of RCU's remaining CPU-hotplug timing guesswork. This commit therefore removes the one-jiffy kludge that was used to paper over this guesswork. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d8245cbd08f9..6e4b1dcebeb2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1276,21 +1276,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, return 1; } - /* - * Check for the CPU being offline, but only if the grace period - * is old enough. We don't need to worry about the CPU changing - * state: If we see it offline even once, it has been through a - * quiescent state. - * - * The reason for insisting that the grace period be at least - * one jiffy old is that CPUs that are not quite online and that - * have just gone offline can still execute RCU read-side critical - * sections. - */ - if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) - return 0; /* Grace period is not old enough. */ - barrier(); - if (cpu_is_offline(rdp->cpu)) { + /* Check for the CPU being offline. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; -- cgit v1.2.3-59-g8ed1b From f2c4689640e9a34bc45c013032185ed4ce47e7ff Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Mon, 23 Jan 2017 13:35:18 -0800 Subject: srcu: Implement more-efficient reader counts SRCU uses two per-cpu counters: a nesting counter to count the number of active critical sections, and a sequence counter to ensure that the nesting counters don't change while they are being added together in srcu_readers_active_idx_check(). This patch instead uses per-cpu lock and unlock counters. Because both counters only increase and srcu_readers_active_idx_check() reads the unlock counter before the lock counter, this achieves the same end without having to increment two different counters in srcu_read_lock(). This also saves a smp_mb() in srcu_readers_active_idx_check(). Possible bug: There is no guarantee that the lock counter won't overflow during srcu_readers_active_idx_check(), as there are no memory barriers around srcu_flip() (see comment in srcu_readers_active_idx_check() for details). However, this problem was already present before this patch. Suggested-by: Mathieu Desnoyers Signed-off-by: Lance Roy Cc: Paul E. McKenney Cc: Lai Jiangshan Cc: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 10 ++-- kernel/rcu/rcutorture.c | 19 +++++++- kernel/rcu/srcu.c | 122 +++++++++++++++++------------------------------- 3 files changed, 66 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index dc8eb63c6568..a598cf3ac70c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -33,9 +33,9 @@ #include #include -struct srcu_struct_array { - unsigned long c[2]; - unsigned long seq[2]; +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; }; struct rcu_batch { @@ -46,7 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; - struct srcu_struct_array __percpu *per_cpu_ref; + struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; /* callbacks just queued */ @@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 87c51225ceec..d81345be730e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,10 +564,25 @@ static void srcu_torture_stats(void) pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; long c0, c1; + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; + u0 = counts->unlock_count[!idx]; + u1 = counts->unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->lock_count[!idx]; + l1 = counts->lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9b9cdd549caa..c9a0015e1c2e 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) rcu_batch_init(&sp->batch_check1); rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->seq[] values for the + * Returns approximate total of the readers' ->lock_count[] values for the * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); } return sum; } /* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); } return sum; } /* * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. + * be zero. */ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { - unsigned long seq; + unsigned long unlocks; - seq = srcu_readers_seq_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(sp, idx); /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. + * Make sure that a lock is always counted if the corresponding unlock + * is counted. Needs to be a smp_mb() as the read side may contain a + * read from a variable that is written to before the synchronize_srcu() + * in the write side. In this case smp_mb()s A and B act like the store + * buffering pattern. * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the + * synchronize_srcu() from being executed before the grace period ends. */ smp_mb(); /* A */ /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does not + * mean that there are no more readers, as one could have read the + * current index but not have incremented the lock counter yet. * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. + * Possible bug: There is no guarantee that there haven't been ULONG_MAX + * increments of ->lock_count[] since the unlocks were counted, meaning + * that this could return true even if there are still active readers. + * Since there are no memory barriers around srcu_flip(), the CPU is not + * required to increment ->completed before running + * srcu_readers_unlock_idx(), which means that there could be an + * arbitrarily large number of critical sections that execute after + * srcu_readers_unlock_idx() but use the old value of ->completed. */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. - * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. - */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; + return srcu_readers_lock_idx(sp, idx) == unlocks; } /** @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); } return sum; } @@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->c[idx]); + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - __this_cpu_inc(sp->per_cpu_ref->seq[idx]); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_dec(sp->per_cpu_ref->c[idx]); + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) /* * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows + * use the other rank of the ->(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) -- cgit v1.2.3-59-g8ed1b From d85b62f18d543c663cbdd6061054efeb9e66cee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2016 12:08:49 -0800 Subject: srcu: Force full grace-period ordering If a process invokes synchronize_srcu(), is delayed just the right amount of time, and thus does not sleep when waiting for the grace period to complete, there is no ordering between the end of the grace period and the code following the synchronize_srcu(). Similarly, there can be a lack of ordering between the end of the SRCU grace period and callback invocation. This commit adds the necessary ordering. Reported-by: Lance Roy Signed-off-by: Paul E. McKenney [ paulmck: Further smp_mb() adjustment per email with Lance Roy. ] --- include/linux/rcupdate.h | 12 ++++++++++++ kernel/rcu/srcu.c | 10 ++++++++-- kernel/rcu/tree.h | 12 ------------ 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 01f71e1d2e94..6ade6a52d9d4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1161,5 +1161,17 @@ do { \ ftrace_dump(oops_dump_mode); \ } while (0) +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c9a0015e1c2e..665bc9951523 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -358,6 +358,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->next = NULL; head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; @@ -391,6 +392,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->next = NULL; head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (!sp->running) { /* steal the processing owner */ sp->running = true; @@ -410,8 +412,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) spin_unlock_irq(&sp->queue_lock); } - if (!done) + if (!done) { wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ + } + } /** @@ -579,7 +584,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) /* * Invoke a limited number of SRCU callbacks that have passed through * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..abcc25bdcb29 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -687,18 +687,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) } #endif /* #ifdef CONFIG_RCU_TRACE */ -/* - * Place this after a lock-acquisition primitive to guarantee that - * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies - * if the UNLOCK and LOCK are executed by the same CPU or if the - * UNLOCK and LOCK operate on the same lock variable. - */ -#ifdef CONFIG_PPC -#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ -#define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ - /* * Wrappers for the rcu_node::lock acquire and release. * -- cgit v1.2.3-59-g8ed1b From 7f554a3d05bea9f6b7bf8e0b041d09447f82d74a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Jan 2017 08:51:34 -0800 Subject: srcu: Reduce probability of SRCU ->unlock_count[] counter overflow Because there are no memory barriers between the srcu_flip() ->completed increment and the summation of the read-side ->unlock_count[] counters, both the compiler and the CPU can reorder the summation with the ->completed increment. If the updater is preempted long enough during this process, the read-side counters could overflow, resulting in a too-short grace period. This commit therefore adds a memory barrier just after the ->completed increment, ensuring that if the summation misses an increment of ->unlock_count[] from __srcu_read_unlock(), the next __srcu_read_lock() will see the new value of ->completed, thus bounding the number of ->unlock_count[] increments that can be missed to NR_CPUS. The actual overflow computation is more complex due to the possibility of nesting of __srcu_read_lock(). Reported-by: Lance Roy Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 665bc9951523..e773129c8b08 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -320,7 +320,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) */ static void srcu_flip(struct srcu_struct *sp) { - sp->completed++; + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ } /* -- cgit v1.2.3-59-g8ed1b