rcu: Reduce expedited GP memory contention via per-CPU variables

Currently, the piggybacked-work checks carried out by sync_exp_work_done() atomically increment a small set of variables (the ->expedited_workdone0, ->expedited_workdone1, ->expedited_workdone2, ->expedited_workdone3 fields in the rcu_state structure), which will form a memory-contention bottleneck given a sufficiently large number of CPUs concurrently invoking either synchronize_rcu_expedited() or synchronize_sched_expedited(). This commit therefore moves these four fields to the per-CPU rcu_data structure, eliminating the memory contention. The show_rcuexp() function also changes to sum up each field in the rcu_data structures. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2015-10-01 10:26:24 -0700
committer: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2015-12-04 12:26:52 -0800
commit: df5bd5144a80a9f6c3807383b11f735dae9caf9d (patch)
tree: fce44d0970a70446c5fd6b50f2e764db0efb8e56 /kernel/rcu/tree_trace.c
parent: rcu: Invert sync_rcu_exp_select_cpus() "if" statement (diff)
download: linux-dev-df5bd5144a80a9f6c3807383b11f735dae9caf9d.tar.xz
linux-dev-df5bd5144a80a9f6c3807383b11f735dae9caf9d.zip
1 files changed, 12 insertions, 6 deletions
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 8efaba870d96..d43649450ea4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -183,14 +183,20 @@ static const struct file_operations rcudata_fops = {
 
 static int show_rcuexp(struct seq_file *m, void *v)
 {
+	int cpu;
 	struct rcu_state *rsp = (struct rcu_state *)m->private;
-
+	struct rcu_data *rdp;
+	unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+
+	for_each_possible_cpu(cpu) {
+		rdp = per_cpu_ptr(rsp->rda, cpu);
+		s0 += atomic_long_read(&rdp->expedited_workdone0);
+		s1 += atomic_long_read(&rdp->expedited_workdone1);
+		s2 += atomic_long_read(&rdp->expedited_workdone2);
+		s3 += atomic_long_read(&rdp->expedited_workdone3);
+	}
 	seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
-		   rsp->expedited_sequence,
-		   atomic_long_read(&rsp->expedited_workdone0),
-		   atomic_long_read(&rsp->expedited_workdone1),
-		   atomic_long_read(&rsp->expedited_workdone2),
-		   atomic_long_read(&rsp->expedited_workdone3),
+		   rsp->expedited_sequence, s0, s1, s2, s3,
 		   atomic_long_read(&rsp->expedited_normal),
 		   atomic_read(&rsp->expedited_need_qs),
 		   rsp->expedited_sequence / 2);
author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-10-01 10:26:24 -0700
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-12-04 12:26:52 -0800
commit	df5bd5144a80a9f6c3807383b11f735dae9caf9d (patch)
tree	fce44d0970a70446c5fd6b50f2e764db0efb8e56 /kernel/rcu/tree_trace.c
parent	rcu: Invert sync_rcu_exp_select_cpus() "if" statement (diff)
download	linux-dev-df5bd5144a80a9f6c3807383b11f735dae9caf9d.tar.xz linux-dev-df5bd5144a80a9f6c3807383b11f735dae9caf9d.zip