From 37e377d2823e03528cb64f435d7c0e30b1c668eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Feb 2012 22:12:18 -0800 Subject: rcu: Fixes to rcutorture error handling and cleanup The rcutorture initialization code ignored the error returns from rcu_torture_onoff_init() and rcu_torture_stall_init(). The rcutorture cleanup code failed to NULL out a number of pointers. These bugs will normally have no effect, but this commit fixes them nevertheless. Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel/rcutorture.c') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..1463a0636443 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1337,6 +1337,7 @@ static void rcutorture_booster_cleanup(int cpu) /* This must be outside of the mutex, otherwise deadlock! */ kthread_stop(t); + boost_tasks[cpu] = NULL; } static int rcutorture_booster_init(int cpu) @@ -1484,13 +1485,15 @@ static void rcu_torture_onoff_cleanup(void) return; VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); kthread_stop(onoff_task); + onoff_task = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void +static int rcu_torture_onoff_init(void) { + return 0; } static void rcu_torture_onoff_cleanup(void) @@ -1554,6 +1557,7 @@ static void rcu_torture_stall_cleanup(void) return; VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); kthread_stop(stall_task); + stall_task = NULL; } static int rcutorture_cpu_notify(struct notifier_block *self, @@ -1665,6 +1669,7 @@ rcu_torture_cleanup(void) VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); kthread_stop(shutdown_task); } + shutdown_task = NULL; rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1897,9 +1902,17 @@ rcu_torture_init(void) goto unwind; } } - rcu_torture_onoff_init(); + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } register_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_stall_init(); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; -- cgit v1.2.3-59-g8ed1b From fae4b54f28f034d228fa3bfc98858c698b64e89c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Feb 2012 17:51:45 -0800 Subject: rcu: Introduce rcutorture testing for rcu_barrier() Although rcutorture does invoke rcu_barrier() and friends, it cannot really be called a torture test given that it invokes them only once at the end of the test. This commit therefore introduces heavy-duty rcutorture testing for rcu_barrier(), which may be carried out concurrently with normal rcutorture testing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 15 +++- kernel/rcutorture.c | 194 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 200 insertions(+), 9 deletions(-) (limited to 'kernel/rcutorture.c') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 375d3fb71437..4ddf3913fd8c 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -47,6 +47,16 @@ irqreader Says to invoke RCU readers from irq level. This is currently permit this. (Or, more accurately, variants of RCU that do -not- permit this know to ignore this variable.) +n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted, + in which case n_barrier_cbs specifies the number of + RCU callbacks (and corresponding kthreads) to use for + this testing. The value cannot be negative. If you + specify this to be non-zero when torture_type indicates a + synchronous RCU implementation (one for which a member of + the synchronize_rcu() rather than the call_rcu() family is + used -- see the documentation for torture_type below), an + error will be reported and no testing will be carried out. + nfakewriters This is the number of RCU fake writer threads to run. Fake writer threads repeatedly use the synchronous "wait for current readers" function of the interface selected by @@ -188,7 +198,7 @@ OUTPUT The statistics output is as follows: rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 - rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 + rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0 rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0 rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0 @@ -230,6 +240,9 @@ o "rtmbe": A non-zero value indicates that rcutorture believes that rcu_assign_pointer() and rcu_dereference() are not working correctly. This value should be zero. +o "rtbe": A non-zero value indicates that one of the rcu_barrier() + family of functions is not working correctly. + o "rtbke": rcutorture was unable to create the real-time kthreads used to force RCU priority inversion. This value should be zero. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 1463a0636443..8cd262b41499 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ @@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); module_param(onoff_interval, int, 0444); MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); module_param(onoff_holdoff, int, 0444); @@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; static struct task_struct *onoff_task; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; @@ -173,6 +179,8 @@ static long n_offline_attempts; static long n_offline_successes; static long n_online_attempts; static long n_online_successes; +static long n_barrier_attempts; +static long n_barrier_successes; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ @@ -327,6 +339,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); int (*stats)(char *page); @@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { .completed = rcu_torture_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, + .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .completed = rcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, + .call = call_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_bh, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_bh_expedited, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -637,6 +656,7 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu" @@ -661,6 +681,7 @@ static struct rcu_torture_ops srcu_raw_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu_raw" @@ -680,6 +701,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize_expedited, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu_expedited" @@ -1129,7 +1151,8 @@ rcu_torture_printk(char *page) "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld", + "onoff: %ld/%ld:%ld/%ld " + "barrier: %ld/%ld:%ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -1145,14 +1168,17 @@ rcu_torture_printk(char *page) n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts); + n_offline_attempts, + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0) - cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (i > 1) { + n_rcu_torture_boost_failure != 0 || + i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); @@ -1560,6 +1586,151 @@ static void rcu_torture_stall_cleanup(void) stall_task = NULL; } +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ + atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ + long myid = (long)arg; + struct rcu_head rcu; + + init_rcu_head_on_stack(&rcu); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, 19); + do { + wait_event(barrier_cbs_wq[myid], + atomic_read(&barrier_cbs_count) == n_barrier_cbs || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) + wake_up(&barrier_wq); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + destroy_rcu_head_on_stack(&rcu); + return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ + int i; + + VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + do { + atomic_set(&barrier_cbs_invoked, 0); + atomic_set(&barrier_cbs_count, n_barrier_cbs); + /* wake_up() path contains the required barriers. */ + for (i = 0; i < n_barrier_cbs; i++) + wake_up(&barrier_cbs_wq[i]); + wait_event(barrier_wq, + atomic_read(&barrier_cbs_count) == 0 || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + n_barrier_attempts++; + cur_ops->cb_barrier(); + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { + n_rcu_torture_barrier_error++; + WARN_ON_ONCE(1); + } + n_barrier_successes++; + schedule_timeout_interruptible(HZ / 10); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + return 0; +} + +/* Initialize RCU barrier testing. */ +static int rcu_torture_barrier_init(void) +{ + int i; + int ret; + + if (n_barrier_cbs == 0) + return 0; + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { + printk(KERN_ALERT "%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + printk(KERN_ALERT "%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); + return 0; + } + atomic_set(&barrier_cbs_count, 0); + atomic_set(&barrier_cbs_invoked, 0); + barrier_cbs_tasks = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + GFP_KERNEL); + barrier_cbs_wq = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), + GFP_KERNEL); + if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + return -ENOMEM; + for (i = 0; i < n_barrier_cbs; i++) { + init_waitqueue_head(&barrier_cbs_wq[i]); + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, + (void *)i, + "rcu_torture_barrier_cbs"); + if (IS_ERR(barrier_cbs_tasks[i])) { + ret = PTR_ERR(barrier_cbs_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + barrier_cbs_tasks[i] = NULL; + return ret; + } + } + barrier_task = kthread_run(rcu_torture_barrier, NULL, + "rcu_torture_barrier"); + if (IS_ERR(barrier_task)) { + ret = PTR_ERR(barrier_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + barrier_task = NULL; + } + return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } +} + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1602,6 +1773,7 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); @@ -1681,7 +1853,7 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup) cur_ops->cleanup(); - if (atomic_read(&n_rcu_torture_error)) + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (n_online_successes != n_online_attempts || n_offline_successes != n_offline_attempts) @@ -1697,6 +1869,7 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; + int retval; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, @@ -1754,6 +1927,7 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; @@ -1877,7 +2051,6 @@ rcu_torture_init(void) test_boost_duration = 2; if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { - int retval; boost_starttime = jiffies + test_boost_interval * HZ; register_cpu_notifier(&rcutorture_cpu_nb); @@ -1913,6 +2086,11 @@ rcu_torture_init(void) firsterr = i; goto unwind; } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; -- cgit v1.2.3-59-g8ed1b From cef50120b61c2af4ce34bc165e19cad66296f93d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 5 Feb 2012 07:42:44 -0800 Subject: rcu: Direct algorithmic SRCU implementation The current implementation of synchronize_srcu_expedited() can cause severe OS jitter due to its use of synchronize_sched(), which in turn invokes try_stop_cpus(), which causes each CPU to be sent an IPI. This can result in severe performance degradation for real-time workloads and especially for short-interation-length HPC workloads. Furthermore, because only one instance of try_stop_cpus() can be making forward progress at a given time, only one instance of synchronize_srcu_expedited() can make forward progress at a time, even if they are all operating on distinct srcu_struct structures. This commit, inspired by an earlier implementation by Peter Zijlstra (https://lkml.org/lkml/2012/1/31/211) and by further offline discussions, takes a strictly algorithmic bits-in-memory approach. This has the disadvantage of requiring one explicit memory-barrier instruction in each of srcu_read_lock() and srcu_read_unlock(), but on the other hand completely dispenses with OS jitter and furthermore allows SRCU to be used freely by CPUs that RCU believes to be idle or offline. The update-side implementation handles the single read-side memory barrier by rechecking the per-CPU counters after summing them and by running through the update-side state machine twice. This implementation has passed moderate rcutorture testing on both x86 and Power. Also updated to use this_cpu_ptr() instead of per_cpu_ptr(), as suggested by Peter Zijlstra. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra Reviewed-by: Lai Jiangshan --- include/linux/srcu.h | 10 +- kernel/rcutorture.c | 2 +- kernel/srcu.c | 284 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 198 insertions(+), 98 deletions(-) (limited to 'kernel/rcutorture.c') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d3d5fa54f25e..a478c8eb8479 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -31,13 +31,19 @@ #include struct srcu_struct_array { - int c[2]; + unsigned long c[2]; }; +/* Bit definitions for field ->c above and ->snap below. */ +#define SRCU_USAGE_BITS 2 +#define SRCU_REF_MASK (ULONG_MAX >> SRCU_USAGE_BITS) +#define SRCU_USAGE_COUNT (SRCU_REF_MASK + 1) + struct srcu_struct { - int completed; + unsigned completed; struct srcu_struct_array __percpu *per_cpu_ref; struct mutex mutex; + unsigned long snap[NR_CPUS]; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8cd262b41499..d10b179dea83 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -639,7 +639,7 @@ static int srcu_torture_stats(char *page) cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); } diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..84c9b97dc3d9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -73,19 +73,102 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * srcu_readers_active_idx -- returns approximate number of readers - * active on the specified rank of per-CPU counters. + * Returns approximate number of readers active on the specified rank + * of per-CPU counters. Also snapshots each counter's value in the + * corresponding element of sp->snap[] for later use validating + * the sum. */ +static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + unsigned long t; -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) + for_each_possible_cpu(cpu) { + t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); + sum += t; + sp->snap[cpu] = t; + } + return sum & SRCU_REF_MASK; +} + +/* + * To be called from the update side after an index flip. Returns true + * if the modulo sum of the counters is stably zero, false if there is + * some possibility of non-zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { int cpu; - int sum; - sum = 0; + /* + * Note that srcu_readers_active_idx() can incorrectly return + * zero even though there is a pre-existing reader throughout. + * To see this, suppose that task A is in a very long SRCU + * read-side critical section that started on CPU 0, and that + * no other reader exists, so that the modulo sum of the counters + * is equal to one. Then suppose that task B starts executing + * srcu_readers_active_idx(), summing up to CPU 1, and then that + * task C starts reading on CPU 0, so that its increment is not + * summed, but finishes reading on CPU 2, so that its decrement + * -is- summed. Then when task B completes its sum, it will + * incorrectly get zero, despite the fact that task A has been + * in its SRCU read-side critical section the whole time. + * + * We therefore do a validation step should srcu_readers_active_idx() + * return zero. + */ + if (srcu_readers_active_idx(sp, idx) != 0) + return false; + + /* + * Since the caller recently flipped ->completed, we can see at + * most one increment of each CPU's counter from this point + * forward. The reason for this is that the reader CPU must have + * fetched the index before srcu_readers_active_idx checked + * that CPU's counter, but not yet incremented its counter. + * Its eventual counter increment will follow the read in + * srcu_readers_active_idx(), and that increment is immediately + * followed by smp_mb() B. Because smp_mb() D is between + * the ->completed flip and srcu_readers_active_idx()'s read, + * that CPU's subsequent load of ->completed must see the new + * value, and therefore increment the counter in the other rank. + */ + smp_mb(); /* A */ + + /* + * Now, we check the ->snap array that srcu_readers_active_idx() + * filled in from the per-CPU counter values. Since both + * __srcu_read_lock() and __srcu_read_unlock() increment the + * upper bits of the per-CPU counter, an increment/decrement + * pair will change the value of the counter. Since there is + * only one possible increment, the only way to wrap the counter + * is to have a huge number of counter decrements, which requires + * a huge number of tasks and huge SRCU read-side critical-section + * nesting levels, even on 32-bit systems. + * + * All of the ways of confusing the readings require that the scan + * in srcu_readers_active_idx() see the read-side task's decrement, + * but not its increment. However, between that decrement and + * increment are smb_mb() B and C. Either or both of these pair + * with smp_mb() A above to ensure that the scan below will see + * the read-side tasks's increment, thus noting a difference in + * the counter values between the two passes. + * + * Therefore, if srcu_readers_active_idx() returned zero, and + * none of the counters changed, we know that the zero was the + * correct sum. + * + * Of course, it is possible that a task might be delayed + * for a very long time in __srcu_read_lock() after fetching + * the index but before incrementing its counter. This + * possibility will be dealt with in __synchronize_srcu(). + */ for_each_possible_cpu(cpu) - sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; - return sum; + if (sp->snap[cpu] != + ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx])) + return false; /* False zero reading! */ + return true; } /** @@ -131,10 +214,11 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; preempt_disable(); - idx = sp->completed & 0x1; - barrier(); /* ensure compiler looks -once- at sp->completed. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; - srcu_barrier(); /* ensure compiler won't misorder critical section. */ + idx = rcu_dereference_index_check(sp->completed, + rcu_read_lock_sched_held()) & 0x1; + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += + SRCU_USAGE_COUNT + 1; + smp_mb(); /* B */ /* Avoid leaking the critical section. */ preempt_enable(); return idx; } @@ -149,8 +233,9 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { preempt_disable(); - srcu_barrier(); /* ensure compiler won't misorder critical section. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += + SRCU_USAGE_COUNT - 1; preempt_enable(); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -163,12 +248,65 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); * we repeatedly block for 1-millisecond time periods. This approach * has done well in testing, so there is no need for a config parameter. */ -#define SYNCHRONIZE_SRCU_READER_DELAY 10 +#define SYNCHRONIZE_SRCU_READER_DELAY 5 + +/* + * Flip the readers' index by incrementing ->completed, then wait + * until there are no more readers using the counters referenced by + * the old index value. (Recall that the index is the bottom bit + * of ->completed.) + * + * Of course, it is possible that a reader might be delayed for the + * full duration of flip_idx_and_wait() between fetching the + * index and incrementing its counter. This possibility is handled + * by __synchronize_srcu() invoking flip_idx_and_wait() twice. + */ +static void flip_idx_and_wait(struct srcu_struct *sp, bool expedited) +{ + int idx; + int trycount = 0; + + idx = sp->completed++ & 0x1; + + /* + * If a reader fetches the index before the above increment, + * but increments its counter after srcu_readers_active_idx_check() + * sums it, then smp_mb() D will pair with __srcu_read_lock()'s + * smp_mb() B to ensure that the SRCU read-side critical section + * will see any updates that the current task performed before its + * call to synchronize_srcu(), or to synchronize_srcu_expedited(), + * as the case may be. + */ + smp_mb(); /* D */ + + /* + * SRCU read-side critical sections are normally short, so wait + * a small amount of time before possibly blocking. + */ + if (!srcu_readers_active_idx_check(sp, idx)) { + udelay(SYNCHRONIZE_SRCU_READER_DELAY); + while (!srcu_readers_active_idx_check(sp, idx)) { + if (expedited && ++ trycount < 10) + udelay(SYNCHRONIZE_SRCU_READER_DELAY); + else + schedule_timeout_interruptible(1); + } + } + + /* + * The following smp_mb() E pairs with srcu_read_unlock()'s + * smp_mb C to ensure that if srcu_readers_active_idx_check() + * sees srcu_read_unlock()'s counter decrement, then any + * of the current task's subsequent code will happen after + * that SRCU read-side critical section. + */ + smp_mb(); /* E */ +} /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) +static void __synchronize_srcu(struct srcu_struct *sp, bool expedited) { int idx; @@ -178,90 +316,53 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) !lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); - idx = sp->completed; + smp_mb(); /* Ensure prior action happens before grace period. */ + idx = ACCESS_ONCE(sp->completed); + smp_mb(); /* Access to ->completed before lock acquisition. */ mutex_lock(&sp->mutex); /* * Check to see if someone else did the work for us while we were - * waiting to acquire the lock. We need -two- advances of + * waiting to acquire the lock. We need -three- advances of * the counter, not just one. If there was but one, we might have * shown up -after- our helper's first synchronize_sched(), thus * having failed to prevent CPU-reordering races with concurrent - * srcu_read_unlock()s on other CPUs (see comment below). So we - * either (1) wait for two or (2) supply the second ourselves. + * srcu_read_unlock()s on other CPUs (see comment below). If there + * was only two, we are guaranteed to have waited through only one + * full index-flip phase. So we either (1) wait for three or + * (2) supply the additional ones we need. */ - if ((sp->completed - idx) >= 2) { + if (sp->completed == idx + 2) + idx = 1; + else if (sp->completed == idx + 3) { mutex_unlock(&sp->mutex); return; - } - - sync_func(); /* Force memory barrier on all CPUs. */ + } else + idx = 0; /* - * The preceding synchronize_sched() ensures that any CPU that - * sees the new value of sp->completed will also see any preceding - * changes to data structures made by this CPU. This prevents - * some other CPU from reordering the accesses in its SRCU - * read-side critical section to precede the corresponding - * srcu_read_lock() -- ensuring that such references will in - * fact be protected. + * If there were no helpers, then we need to do two flips of + * the index. The first flip is required if there are any + * outstanding SRCU readers even if there are no new readers + * running concurrently with the first counter flip. * - * So it is now safe to do the flip. - */ - - idx = sp->completed & 0x1; - sp->completed++; - - sync_func(); /* Force memory barrier on all CPUs. */ - - /* - * At this point, because of the preceding synchronize_sched(), - * all srcu_read_lock() calls using the old counters have completed. - * Their corresponding critical sections might well be still - * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. We initially give readers - * an arbitrarily chosen 10 microseconds to get out of their - * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. The 10-microsecond value has done - * very well in testing. - */ - - if (srcu_readers_active_idx(sp, idx)) - udelay(SYNCHRONIZE_SRCU_READER_DELAY); - while (srcu_readers_active_idx(sp, idx)) - schedule_timeout_interruptible(1); - - sync_func(); /* Force memory barrier on all CPUs. */ - - /* - * The preceding synchronize_sched() forces all srcu_read_unlock() - * primitives that were executing concurrently with the preceding - * for_each_possible_cpu() loop to have completed by this point. - * More importantly, it also forces the corresponding SRCU read-side - * critical sections to have also completed, and the corresponding - * references to SRCU-protected data items to be dropped. + * The second flip is required when a new reader picks up + * the old value of the index, but does not increment its + * counter until after its counters is summed/rechecked by + * srcu_readers_active_idx_check(). In this case, the current SRCU + * grace period would be OK because the SRCU read-side critical + * section started after this SRCU grace period started, so the + * grace period is not required to wait for the reader. * - * Note: - * - * Despite what you might think at first glance, the - * preceding synchronize_sched() -must- be within the - * critical section ended by the following mutex_unlock(). - * Otherwise, a task taking the early exit can race - * with a srcu_read_unlock(), which might have executed - * just before the preceding srcu_readers_active() check, - * and whose CPU might have reordered the srcu_read_unlock() - * with the preceding critical section. In this case, there - * is nothing preventing the synchronize_sched() task that is - * taking the early exit from freeing a data structure that - * is still being referenced (out of order) by the task - * doing the srcu_read_unlock(). - * - * Alternatively, the comparison with "2" on the early exit - * could be changed to "3", but this increases synchronize_srcu() - * latency for bulk loads. So the current code is preferred. + * However, the next SRCU grace period would be waiting for the + * other set of counters to go to zero, and therefore would not + * wait for the reader, which would be very bad. To avoid this + * bad scenario, we flip and wait twice, clearing out both sets + * of counters. */ - + for (; idx < 2; idx++) + flip_idx_and_wait(sp, expedited); mutex_unlock(&sp->mutex); } @@ -281,7 +382,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, synchronize_sched); + __synchronize_srcu(sp, 0); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -289,18 +390,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * synchronize_srcu_expedited - Brute-force SRCU grace period * @sp: srcu_struct with which to synchronize. * - * Wait for an SRCU grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_srcu_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_srcu() instead. + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. * * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. It is also illegal to call + * that is acquired by a CPU-hotplug notifier. It is also illegal to call * synchronize_srcu_expedited() from the corresponding SRCU read-side * critical section; doing so will result in deadlock. However, it is * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct @@ -309,7 +403,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); */ void synchronize_srcu_expedited(struct srcu_struct *sp) { - __synchronize_srcu(sp, synchronize_sched_expedited); + __synchronize_srcu(sp, 1); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); -- cgit v1.2.3-59-g8ed1b From 9059c94017f748d9e20c3b089188a7abb27f6233 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 19 Mar 2012 16:12:14 +0800 Subject: rcu: Add rcutorture test for call_srcu() Add srcu_torture_deferred_free() for srcu_ops so as to test the new call_srcu(). Rename the original srcu_ops to srcu_sync_ops. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) (limited to 'kernel/rcutorture.c') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index d10b179dea83..e66b34ab7555 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -625,6 +625,11 @@ static int srcu_torture_completed(void) return srcu_batches_completed(&srcu_ctl); } +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ + call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} + static void srcu_torture_synchronize(void) { synchronize_srcu(&srcu_ctl); @@ -654,7 +659,7 @@ static struct rcu_torture_ops srcu_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .call = NULL, .cb_barrier = NULL, @@ -662,6 +667,21 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static struct rcu_torture_ops srcu_sync_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .call = NULL, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_sync" +}; + static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) { return srcu_read_lock_raw(&srcu_ctl); @@ -679,7 +699,7 @@ static struct rcu_torture_ops srcu_raw_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .call = NULL, .cb_barrier = NULL, @@ -687,6 +707,21 @@ static struct rcu_torture_ops srcu_raw_ops = { .name = "srcu_raw" }; +static struct rcu_torture_ops srcu_raw_sync_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock_raw, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock_raw, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .call = NULL, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_raw_sync" +}; + static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -1685,7 +1720,7 @@ static int rcu_torture_barrier_init(void) for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)i, + (void *)(long)i, "rcu_torture_barrier_cbs"); if (IS_ERR(barrier_cbs_tasks[i])) { ret = PTR_ERR(barrier_cbs_tasks[i]); @@ -1873,7 +1908,8 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, + &srcu_raw_sync_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3-59-g8ed1b