From 25f3d7effab632eb10d145f1a5aebf6515a04b98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Feb 2018 22:05:38 -0800 Subject: rcu: Parallelize expedited grace-period initialization The latency of RCU expedited grace periods grows with increasing numbers of CPUs, eventually failing to be all that expedited. Much of the growth in latency is in the initialization phase, so this commit uses workqueues to carry out this initialization concurrently on a rcu_node-by-rcu_node basis. This change makes use of a new rcu_par_gp_wq because flushing a work item from another work item running from the same workqueue can result in deadlock. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/rcu.h | 1 + kernel/rcu/tree.c | 3 + kernel/rcu/tree.h | 10 +++ kernel/rcu/tree_exp.h | 184 +++++++++++++++++++++++++++++--------------------- 4 files changed, 120 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7a693e31184a..976019d6fa06 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -486,6 +486,7 @@ void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..23781fc90830 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4168,6 +4168,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) } struct workqueue_struct *rcu_gp_wq; +struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { @@ -4199,6 +4200,8 @@ void __init rcu_init(void) /* Create workqueue for expedited GPs and for Tree SRCU. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..98d33902b65c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -58,6 +58,14 @@ struct rcu_dynticks { #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; +/* Communicate arguments to a workqueue handler. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + /* RCU's kthread states for tracing. */ #define RCU_KTHREAD_STOPPED 0 #define RCU_KTHREAD_RUNNING 1 @@ -157,6 +165,8 @@ struct rcu_node { spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; wait_queue_head_t exp_wq[4]; + struct rcu_exp_work rew; + bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f72eefab8543..73e1d3dca5b1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -362,93 +362,129 @@ static void sync_sched_exp_online_cleanup(int cpu) } /* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. + * Select the CPUs within the specified rcu_node that the upcoming + * expedited grace period needs to wait for. 
*/ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; + smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_node *rnp; + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); + struct rcu_state *rsp = rewp->rew_rsp; - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); - int snap; + func = rewp->rew_func; + raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (raw_smp_processor_id() == cpu || - !(rnp->qsmaskinitnext & mask)) { + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; + + if (raw_smp_processor_id() == cpu || + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; - } else { - snap = rcu_dynticks_snap(rdtp); - if (rcu_dynticks_in_eqs(snap)) - mask_ofl_test |= mask; - else - rdp->exp_dynticks_snap = snap; - } + else + rdp->exp_dynticks_snap = snap; } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - if (!(mask_ofl_ipi & mask)) - continue; + if (!(mask_ofl_ipi & mask)) + continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { - mask_ofl_test |= mask; - continue; - } - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with CPU hotplug operation. 
*/ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rnp->qsmaskinitnext & mask) && - (rnp->expmask & mask)) { - /* Online, so delay for a bit and try again. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); - schedule_timeout_uninterruptible(1); - goto retry_ipi; - } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { + mask_ofl_test |= mask; + continue; + } + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with CPU hotplug operation. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if ((rnp->qsmaskinitnext & mask) && + (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + schedule_timeout_uninterruptible(1); + goto retry_ipi; + } + /* CPU really is offline, so we can ignore it. */ + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_node *rnp; + + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); + sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + + /* Schedule work for each leaf rcu_node structure. */ + rcu_for_each_leaf_node(rsp, rnp) { + rnp->exp_need_flush = false; + if (!READ_ONCE(rnp->expmask)) + continue; /* Avoid early boot non-existent wq. */ + rnp->rew.rew_func = func; + rnp->rew.rew_rsp = rsp; + if (!READ_ONCE(rcu_par_gp_wq) || + rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + /* No workqueues yet. */ + sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); + continue; } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + rnp->exp_need_flush = true; } + + /* Wait for workqueue jobs (if any) to complete. */ + rcu_for_each_leaf_node(rsp, rnp) + if (rnp->exp_need_flush) + flush_work(&rnp->rew.rew_work); } static void synchronize_sched_expedited_wait(struct rcu_state *rsp) @@ -560,14 +596,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } -/* Let the workqueue handler know what it is supposed to do. */ -struct rcu_exp_work { - smp_call_func_t rew_func; - struct rcu_state *rew_rsp; - unsigned long rew_s; - struct work_struct rew_work; -}; - /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. 
-- cgit v1.2.3-59-g8ed1b From 7be8c56f8f8a58af92f8791c5a09d48e342d7101 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 7 Mar 2018 16:49:39 +0800 Subject: rcu: exp: Fix "must hold exp_mutex" comments for QS reporting functions Since commit d9a3da0699b2 ("rcu: Add expedited grace-period support for preemptible RCU"), there are comments for some funtions in rcu_report_exp_rnp()'s call-chain saying that exp_mutex or its predecessors needs to be held. However, exp_mutex and its predecessors were used only to synchronize between GPs, and it is clear that all variables visited by those functions are under the protection of rcu_node's ->lock. Moreover, those functions are currently called without held exp_mutex, and seems that doesn't introduce any trouble. So this patch fixes this problem by updating the comments to match the current code. Signed-off-by: Boqun Feng Fixes: d9a3da0699b2 ("rcu: Add expedited grace-period support for preemptible RCU") Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree_exp.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 73e1d3dca5b1..d7622cb85aa7 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -154,7 +154,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the rcu_state's exp_mutex. + * Caller must hold the specificed rcu_node structure's ->lock */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -170,8 +170,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. + * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -207,8 +206,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -221,8 +218,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. + * specified leaf rcu_node structure. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) -- cgit v1.2.3-59-g8ed1b From 55ebfce0605a4dce35467dfb1dc1b3ad26fe9f86 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 9 Mar 2018 09:14:51 +0800 Subject: rcu: exp: Protect all sync_rcu_preempt_exp_done() with rcu_node lock Currently some callsites of sync_rcu_preempt_exp_done() are not called with the corresponding rcu_node's ->lock held, which could introduces bugs as per Paul: o CPU 0 in sync_rcu_preempt_exp_done() reads ->exp_tasks and sees that it is NULL. o CPU 1 blocks within an RCU read-side critical section, so it enqueues the task and points ->exp_tasks at it and clears CPU 1's bit in ->expmask. 
o All other CPUs clear their bits in ->expmask. o CPU 0 reads ->expmask, sees that it is zero, so incorrectly concludes that all quiescent states have completed, despite the fact that ->exp_tasks is non-NULL. To fix this, sync_rcu_preempt_exp_unlocked() is introduced to replace lockless callsites of sync_rcu_preempt_exp_done(). Further, a lockdep annotation is added into sync_rcu_preempt_exp_done() to prevent mis-use in the future. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree_exp.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d7622cb85aa7..755d0f461f26 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,6 +20,8 @@ * Authors: Paul E. McKenney */ +#include + /* * Record the start of an expedited grace period. */ @@ -158,10 +160,30 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { + raw_lockdep_assert_held_rcu_node(rnp); + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } +/* + * Like sync_rcu_preempt_exp_done(), but this function assumes the caller + * doesn't hold the rcu_node's ->lock, and will acquire and release the lock + * itself + */ +static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ret = sync_rcu_preempt_exp_done(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + return ret; +} + + /* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU @@ -501,9 +523,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = swait_event_timeout( rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), + sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -536,7 +558,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) + if (sync_rcu_preempt_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, -- cgit v1.2.3-59-g8ed1b From be01b4cab118e7a2d9747c71b3bef8f28fdda193 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 26 Feb 2018 14:11:36 +0900 Subject: rcu: Inline rcu_preempt_do_callback() into its sole caller The rcu_preempt_do_callbacks() function was introduced in commit 09223371dea(rcu: Use softirq to address performance regression), where it was necessary to handle kernel builds both containing and not containing RCU-preempt. Since then, various changes (most notably f8b7fc6b51 ("rcu: use softirq instead of kthreads except when RCU_BOOST=y")) have resulted in this function being invoked only from rcu_kthread_do_work(), which is present only in kernels containing RCU-preempt, which in turn means that the rcu_preempt_do_callbacks() function is no longer needed. This commit therefore inlines rcu_preempt_do_callbacks() into its sole remaining caller and also removes the rcu_state_p and rcu_data_p indirection for added clarity. 
Signed-off-by: Byungchul Park Reviewed-by: Steven Rostedt (VMware) [ paulmck: Remove the rcu_state_p and rcu_data_p indirection. ] Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 11 +---------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..3a0dc30100e8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -438,7 +438,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST -static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 84fbee4686d3..6c5df18bbf2f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -685,15 +685,6 @@ static void rcu_preempt_check_callbacks(void) t->rcu_read_unlock_special.b.need_qs = true; } -#ifdef CONFIG_RCU_BOOST - -static void rcu_preempt_do_callbacks(void) -{ - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p)); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -1140,7 +1131,7 @@ static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_preempt_do_callbacks(); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } static void rcu_cpu_kthread_setup(unsigned int cpu) -- cgit v1.2.3-59-g8ed1b From ef12620626e4e6a062acd9701d1d3429b2356773 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Feb 2018 10:34:54 -0800 Subject: rcu: Don't allocate rcu_nocb_mask if no one needs it Commit 44c65ff2e3b0 ("rcu: Eliminate NOCBs CPU-state Kconfig options") made allocation of rcu_nocb_mask depend only on the rcu_nocbs=, nohz_full=, or isolcpus= kernel boot parameters. However, it failed to change the initial value of rcu_init_nohz()'s local variable need_rcu_nocb_mask to false, which can result in useless allocation of an all-zero rcu_nocb_mask. This commit therefore fixes this bug by changing the initial value of need_rcu_nocb_mask to false. While we are in the area, also correct the error message that is printed when someone specifies that can-never-exist CPUs should be NOCBs CPUs. Reported-by: Byungchul Park Signed-off-by: Paul E. 
McKenney Acked-by: Byungchul Park Tested-by: Nicholas Piggin --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6c5df18bbf2f..3566e4f6dfcc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2283,7 +2283,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) void __init rcu_init_nohz(void) { int cpu; - bool need_rcu_nocb_mask = true; + bool need_rcu_nocb_mask = false; struct rcu_state *rsp; #if defined(CONFIG_NO_HZ_FULL) @@ -2306,7 +2306,7 @@ void __init rcu_init_nohz(void) #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } -- cgit v1.2.3-59-g8ed1b From efcd2d543685f377efda09e1ee84dacbda545523 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 28 Feb 2018 14:49:02 +0900 Subject: rcu: Call wake_nocb_leader_defer() with 'FORCE' when nocb_q_count is high If an excessive number of callbacks have been queued, but the NOCB leader kthread's wakeup must be deferred, then we should wake up the leader unconditionally once it is safe to do so. This was handled correctly in commit fbce7497ee ("rcu: Parallelize and economize NOCB kthread wakeups"), but then commit 8be6e1b15c ("rcu: Use timer as backstop for NOCB deferred wakeups") passed RCU_NOCB_WAKE instead of the correct RCU_NOCB_WAKE_FORCE to wake_nocb_leader_defer(). As an interesting aside, RCU_NOCB_WAKE_FORCE is never passed to anything, which should have been taken as a hint. ;-) This commit therefore passes RCU_NOCB_WAKE_FORCE instead of RCU_NOCB_WAKE to wake_nocb_leader_defer() when a callback is queued onto a NOCB CPU that already has an excessive number of callbacks pending. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3566e4f6dfcc..12774c4fb546 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1957,7 +1957,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; -- cgit v1.2.3-59-g8ed1b From 6fba2b3767ea6e3e1204855031492415cc4dce4f Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 2 Mar 2018 16:39:12 +0900 Subject: rcu: Remove deprecated RCU debugfs tracing code Commit ae91aa0adb14 ("rcu: Remove debugfs tracing") removed the RCU debugfs tracing code, but did not remove the no-longer used ->exp_workdone{0,1,2,3} fields in the srcu_data structure. This commit therefore removes these fields along with the code that uselessly updates them. Signed-off-by: Byungchul Park Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.h | 4 ---- kernel/rcu/tree_exp.h | 13 +++++-------- 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3a0dc30100e8..5fd374c71404 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -224,10 +224,6 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - atomic_long_t exp_workdone0; /* # done by workqueue. */ - atomic_long_t exp_workdone1; /* # done by others #1. */ - atomic_long_t exp_workdone2; /* # done by others #2. */ - atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ /* 6) Callback offloading. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f72eefab8543..f512dd4e57a8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -248,14 +248,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); return true; } return false; @@ -289,7 +287,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * promoting locality and is not strictly needed for correctness. */ for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + if (sync_exp_work_done(rsp, s)) return true; /* Work not done, either wait here or go up. */ @@ -302,8 +300,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grplo, rnp->grphi, TPS("wait")); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); + sync_exp_work_done(rsp, s)); return true; } rnp->exp_seq_rq = s; /* Followers can wait on us. */ @@ -313,7 +310,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } mutex_lock(&rsp->exp_mutex); fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + if (sync_exp_work_done(rsp, s)) { mutex_unlock(&rsp->exp_mutex); return true; } @@ -633,7 +630,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + sync_exp_work_done(rsp, s)); smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ -- cgit v1.2.3-59-g8ed1b From cee4393989333795ae04dc9f3b83a578afe3fca6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Mar 2018 16:35:27 -0800 Subject: rcu: Rename cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs() Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched()") substituted cond_resched() for the earlier call to cond_resched_rcu_qs(). However, the new-age cond_resched() does not do anything to help RCU-tasks grace periods because (1) RCU-tasks is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a complete no-op when preemption is enabled. This situation results in hangs when running the trace benchmarks. 
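As a rough stand-alone illustration of the logic involved (the model_* names below are hypothetical stand-ins for cond_resched() and rcu_note_voluntary_context_switch_lite(), and this is ordinary user-space C rather than kernel code), the key point is that when cond_resched() degenerates to a no-op returning zero, cond_resched_tasks_rcu_qs() still falls through and reports an RCU-tasks quiescent state, which a bare cond_resched() never does:

    #include <stdbool.h>
    #include <stdio.h>

    /* Models cond_resched() on a CONFIG_PREEMPT=y kernel: a no-op returning 0. */
    static bool model_cond_resched(void)
    {
        return false;
    }

    /* Hypothetical stand-in for rcu_note_voluntary_context_switch_lite(current). */
    static void model_note_voluntary_qs(void)
    {
        printf("RCU-tasks quiescent state reported\n");
    }

    #define model_cond_resched_tasks_rcu_qs()       \
    do {                                             \
        if (!model_cond_resched())                   \
            model_note_voluntary_qs();               \
    } while (0)

    int main(void)
    {
        model_cond_resched_tasks_rcu_qs();  /* always reports the quiescent state here */
        return 0;
    }
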
A number of potential fixes were discussed on LKML (https://lkml.kernel.org/r/20180224151240.0d63a059@vmware.local.home), including making cond_resched() not be a no-op; making cond_resched() not be a no-op, but only when running tracing benchmarks; reverting the aforementioned commit (which works because cond_resched_rcu_qs() does provide an RCU-tasks quiescent state; and adding a call to the scheduler/RCU rcu_note_voluntary_context_switch() function. All were deemed unsatisfactory, either due to added cond_resched() overhead or due to magic functions inviting cargo culting. This commit renames cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs(), which provides a clear hint as to what this function is doing and why and where it should be used, and then replaces the call to cond_resched() with cond_resched_tasks_rcu_qs() in the trace benchmark's benchmark_event_kthread() function. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- include/linux/rcupdate.h | 4 ++-- kernel/rcu/rcuperf.c | 2 +- kernel/rcu/tree.c | 20 ++++++++++---------- kernel/rcu/tree_plugin.h | 4 ++-- kernel/rcu/update.c | 2 +- kernel/torture.c | 2 +- kernel/trace/trace_benchmark.c | 4 ++-- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 36360d07f25b..19d235fefdb9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -188,13 +188,13 @@ static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** - * cond_resched_rcu_qs - Report potential quiescent states to RCU + * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPT kernels. */ -#define cond_resched_rcu_qs() \ +#define cond_resched_tasks_rcu_qs() \ do { \ if (!cond_resched()) \ rcu_note_voluntary_context_switch_lite(current); \ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 777e7a6a0292..e232846516b3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void) */ static void rcu_perf_wait_shutdown(void) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) return; while (!torture_must_stop()) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..c4db0e20b035 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1234,10 +1234,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * Has this CPU encountered a cond_resched_rcu_qs() since the - * beginning of the grace period? For this to be the case, - * the CPU has to have noticed the current grace period. This - * might not be the case for nohz_full CPUs looping in the kernel. + * Has this CPU encountered a cond_resched() since the beginning + * of the grace period? For this to be the case, the CPU has to + * have noticed the current grace period. This might not be the + * case for nohz_full CPUs looping in the kernel. 
*/ jtsq = jiffies_till_sched_qs; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); @@ -2049,7 +2049,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2151,7 +2151,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } @@ -2202,7 +2202,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2247,7 +2247,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; @@ -2260,7 +2260,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } } else { /* Deal with stray signal. */ - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2782,7 +2782,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 12774c4fb546..dfc42c3d52d4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1598,7 +1598,7 @@ static int rcu_oom_notify(struct notifier_block *self, for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); } /* Unconditionally decrement: no need to wake ourselves up. */ @@ -2227,7 +2227,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 68fa19a5e7bd..e401960c7f51 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -624,7 +624,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); * grace period has elapsed, in other words after all currently * executing rcu-tasks read-side critical sections have elapsed. These * read-side critical sections are delimited by calls to schedule(), - * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). 
* * This is a very specialized primitive, intended only for a few uses in diff --git a/kernel/torture.c b/kernel/torture.c index 37b94012a3f8..3de1efbecd6a 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -574,7 +574,7 @@ void stutter_wait(const char *title) { int spt; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 22fee766081b..80e0b2aca703 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg) * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * - * Note the _rcu_qs() version of cond_resched() will + * Note the tasks_rcu_qs() version of cond_resched() will * notify synchronize_rcu_tasks() that this thread has * passed a quiescent state for rcu_tasks. Otherwise * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched(); + cond_resched_tasks_rcu_qs(); } return 0; -- cgit v1.2.3-59-g8ed1b From c3442697c2d73d3cdb9d4135cf630ad36ba8552f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Mar 2018 11:29:40 -0800 Subject: softirq: Eliminate unused cond_resched_softirq() macro The cond_resched_softirq() macro is not used anywhere in mainline, so this commit simplifies the kernel by eliminating it. Suggested-by: Eric Dumazet Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Reviewed-by: Eric Dumazet Tested-by: Nicholas Piggin --- include/linux/sched.h | 8 -------- kernel/sched/core.c | 14 -------------- kernel/softirq.c | 3 +-- 3 files changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b3d697f3b573..6fc99045658a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1613,7 +1613,6 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, - * cond_resched_softirq() will enable bhs before scheduling. */ #ifndef CONFIG_PREEMPT extern int _cond_resched(void); @@ -1633,13 +1632,6 @@ extern int __cond_resched_lock(spinlock_t *lock); __cond_resched_lock(lock); \ }) -extern int __cond_resched_softirq(void); - -#define cond_resched_softirq() ({ \ - ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ - __cond_resched_softirq(); \ -}) - static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e10aaeebfcc..6a09e6af64b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5012,20 +5012,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - /** * yield - yield the current processor to other threads. 
* diff --git a/kernel/softirq.c b/kernel/softirq.c index 177de3640c78..03981f1c39ea 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt) } /* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), + * Special-case - softirqs can safely be enabled by __do_softirq(), * without processing still-pending softirqs: */ void _local_bh_enable(void) -- cgit v1.2.3-59-g8ed1b From 0e5da22e3f809ab9e86a566b9537b02b9496408e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Mar 2018 08:05:04 -0700 Subject: rcu: Move __rcu_read_lock() and __rcu_read_unlock() to tree_plugin.h The __rcu_read_lock() and __rcu_read_unlock() functions were moved to kernel/rcu/update.c in order to implement tiny preemptible RCU. However, tiny preemptible RCU was removed from the kernel a long time ago, so this commit belatedly moves them back into the only remaining preemptible-RCU code. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree_plugin.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 48 ------------------------------------------------ 2 files changed, 44 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dfc42c3d52d4..ee24a5de0503 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -383,6 +383,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } +/* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = READ_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + /* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e401960c7f51..4c230a60ece4 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode); #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ -#ifdef CONFIG_PREEMPT_RCU - -/* - * Preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. 
- */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* critical section after entry code. */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* - * Preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; - } else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = -- cgit v1.2.3-59-g8ed1b From 265f5f28f042fee66531f9f647c9d78e997995b6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Mar 2018 11:53:22 -0700 Subject: rcu: Update rcu_bind_gp_kthread() header comment The header comment for rcu_bind_gp_kthread() refers to sysidle, which is no longer with us. However, it is still important to bind RCU's grace-period kthreads to the housekeeping CPU(s), so rather than remove rcu_bind_gp_kthread(), this commit updates the comment. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ee24a5de0503..d37b9bb3f481 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2622,8 +2622,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) } /* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. + * Bind the RCU grace-period kthreads to the housekeeping CPU. */ static void rcu_bind_gp_kthread(void) { -- cgit v1.2.3-59-g8ed1b From 17672480fb1e953f999623b598a98130f8aacfbc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 25 Mar 2018 20:50:03 +0300 Subject: rcu: Declare rcu_eqs_special_set() in public header Because rcu_eqs_special_set() is declared only in internal header kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h, it is inaccessible outside of the RCU implementation. This patch therefore moves the rcu_eqs_special_set() declaration to include/linux/rcutree.h, which allows it to be used in non-rcu kernel code. Signed-off-by: Yury Norov Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- include/linux/rcutree.h | 1 + kernel/rcu/tree.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index fd996cdf1833..448f20f27396 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void) void rcu_barrier(void); void rcu_barrier_bh(void); void rcu_barrier_sched(void); +bool rcu_eqs_special_set(int cpu); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); unsigned long get_state_synchronize_sched(void); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5fd374c71404..0b3a90ebe225 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,7 +404,6 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); -bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -- cgit v1.2.3-59-g8ed1b From f7194ac32ca241d28765a98e42a7fe13debc85a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Apr 2018 17:19:17 -0700 Subject: srcu: Add cleanup_srcu_struct_quiesced() The current cleanup_srcu_struct() flushes work, which prevents it from being invoked from some workqueue contexts, as well as from atomic (non-blocking) contexts. This patch therefore introduced a cleanup_srcu_struct_quiesced(), which can be invoked only after all activity on the specified srcu_struct has completed. This restriction allows cleanup_srcu_struct_quiesced() to be invoked from workqueue contexts as well as from atomic contexts. Suggested-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Tested-by: Nitzan Carmi Tested-by: Nicholas Piggin --- include/linux/srcu.h | 36 +++++++++++++++++++++++++++++++++++- kernel/rcu/rcutorture.c | 7 ++++++- kernel/rcu/srcutiny.c | 9 ++++++--- kernel/rcu/srcutree.c | 30 +++++++++++++++++------------- 4 files changed, 64 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 33c1c698df09..91494d7e8e41 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -69,11 +69,45 @@ struct srcu_struct { }; void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)); -void cleanup_srcu_struct(struct srcu_struct *sp); +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +static inline void cleanup_srcu_struct(struct srcu_struct *sp) +{ + _cleanup_srcu_struct(sp, false); +} + +/** + * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. Also, + * all grace-period processing must have completed. + * + * "Completed" means that the last synchronize_srcu() and + * synchronize_srcu_expedited() calls must have returned before the call + * to cleanup_srcu_struct_quiesced(). 
It also means that the callback + * from the last call_srcu() must have been invoked before the call to + * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help + * with this last. Violating these rules will get you a WARN_ON() splat + * (with high probability, anyway), and will also cause the srcu_struct + * to be leaked. + */ +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp) +{ + _cleanup_srcu_struct(sp, true); +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC /** diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 680c96d8c00f..f0e1d44459f8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -593,7 +593,12 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - cleanup_srcu_struct(&srcu_ctld); + static DEFINE_TORTURE_RANDOM(rand); + + if (torture_random(&rand) & 0x800) + cleanup_srcu_struct(&srcu_ctld); + else + cleanup_srcu_struct_quiesced(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 76ac5f50b2c7..622792abe41a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void cleanup_srcu_struct(struct srcu_struct *sp) +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); - flush_work(&sp->srcu_work); + if (quiesced) + WARN_ON(work_pending(&sp->srcu_work)); + else + flush_work(&sp->srcu_work); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); WARN_ON(sp->srcu_cb_head); WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fb560fca9ef4..b4123d7a2cec 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp) return SRCU_INTERVAL; } -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { int cpu; if (WARN_ON(!srcu_get_delay(sp))) - return; /* Leakage unless caller handles error. */ + return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); + return; /* Just leak it! */ + if (quiesced) { + if (WARN_ON(delayed_work_pending(&sp->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&sp->work); + } for_each_possible_cpu(cpu) - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (quiesced) { + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + return; /* Just leak it! 
*/ + } else { + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->sda); sp->sda = NULL; } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the -- cgit v1.2.3-59-g8ed1b From 5b4c11d54b1b8d0714702006e00441ada59889a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Apr 2018 17:11:44 -0700 Subject: rcu: Add leaf-node macros This commit adds rcu_first_leaf_node() that returns a pointer to the first leaf rcu_node structure in the specified RCU flavor and an rcu_is_leaf_node() that returns true iff the specified rcu_node structure is a leaf. This commit also uses these macros where appropriate. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/rcu.h | 11 ++++++++--- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7a693e31184a..5b5bb9ee2e20 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +/* Returns first leaf rcu_node of the specified RCU flavor. */ +#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) + +/* Is this rcu_node a leaf? */ +#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * rcu_node tree with but one rcu_node structure, this loop is a no-op. */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + for ((rnp) = rcu_first_leaf_node(rsp); \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c4db0e20b035..b22d2e1ca5c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2398,7 +2398,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! 
*/ - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, @@ -4056,7 +4056,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) init_swait_queue_head(&rsp->gp_wq); init_swait_queue_head(&rsp->expedited_wq); - rnp = rsp->level[rcu_num_lvls - 1]; + rnp = rcu_first_leaf_node(rsp); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d37b9bb3f481..b999032e9466 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); /* * Decide where to queue the newly blocked task. In theory, @@ -533,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit v1.2.3-59-g8ed1b From 9036c2ffd596261d2067fc2d693dc4f0d7a51214 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Apr 2018 21:17:56 -0700 Subject: rcu: Improve non-root rcu_cbs_completed() accuracy When rcu_cbs_completed() is invoked on a non-root rcu_node structure, it unconditionally assumes that two grace periods must complete before the callbacks at hand can be invoked. This is overly conservative because if that non-root rcu_node structure believes that no grace period is in progress, and if the corresponding rcu_state structure's ->gpnum field has not yet been incremented, then these callbacks may safely be invoked after only one grace period has completed. This change is required to permit grace-period start requests to use funnel locking, which is in turn permitted to reduce root rcu_node ->lock contention, which has been observed by Nick Piggin. Furthermore, such contention will likely be increased by the merging of RCU-bh, RCU-preempt, and RCU-sched, so it makes sense to take steps to decrease it. This commit therefore improves the accuracy of rcu_cbs_completed() when invoked on a non-root rcu_node structure as described above. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..f5ca72f2ed43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1641,6 +1641,21 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) return rnp->completed + 1; + /* + * If the current rcu_node structure believes that RCU is + * idle, and if the rcu_state structure does not yet reflect + * the start of a new grace period, then the next grace period + * will suffice. The memory barrier is needed to accurately + * sample the rsp->gpnum, and pairs with the second lock + * acquisition in rcu_gp_init(), which is augmented with + * smp_mb__after_unlock_lock() for this purpose. 
+ */ + if (rnp->gpnum == rnp->completed) { + smp_mb(); /* See above block comment. */ + if (READ_ONCE(rsp->gpnum) == rnp->completed) + return rnp->completed + 1; + } + /* * Otherwise, wait for a possible partial grace period and * then the subsequent full grace period. -- cgit v1.2.3-59-g8ed1b From 825a9911f6447299a69edacecc81fa2cdc5290a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 15:54:32 -0700 Subject: rcu: Make rcu_start_future_gp()'s grace-period check more precise The rcu_start_future_gp() function uses a sloppy check for a grace period being in progress, which works today because there are a number of code sequences that resolve the resulting races. However, some of these race-resolution code sequences must acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_start_future_gp() check more precise, eliminating the sloppy lockless check of the rcu_state structure's ->gpnum and ->completed fields. The effect is that rcu_start_future_gp() will sometimes unnecessarily attempt to start a new grace period, but this overhead will be reduced later using funnel locking. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f5ca72f2ed43..4bbba17422cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1705,20 +1705,12 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. We only do the lockless check - * of rnp_root's fields if the current rcu_node structure thinks - * there is no grace period in flight, and because we hold rnp->lock, - * the only possible change is when rnp_root's two fields are - * equal, in which case rnp_root->gpnum might be concurrently - * incremented. But that is OK, as it will just result in our - * doing some extra useless work. + * If this rcu_node structure believes that a grace period is in + * progress, then we must wait for the one following, which is in + * "c". Because our request will be noticed at the end of the + * current grace period, we don't need to explicitly start one. */ - if (rnp->gpnum != rnp->completed || - READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { + if (rnp->gpnum != rnp->completed) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; -- cgit v1.2.3-59-g8ed1b From c91a8675b9cc697c725b6d97fcc7f157f4a989d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2018 11:11:39 -0700 Subject: rcu: Add accessor macros for the ->need_future_gp[] array Accessors for the ->need_future_gp[] array are currently open-coded, which makes them difficult to change. To improve maintainability, this commit adds need_future_gp_mask() to compute the indexing mask from the array size, need_future_gp_element() to access the element corresponding to the specified grace-period number, and need_any_future_gp() to determine if any future grace period is needed. This commit also applies need_future_gp_element() to existing open-coded single-element accesses. 
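For readers unfamiliar with the idiom, the new accessors rely on the length of ->need_future_gp[] being a power of two, so that masking the grace-period number with (length - 1) always yields a valid index; need_future_gp_mask() derives that mask from ARRAY_SIZE(), so the mask automatically tracks any future change in the array's length. The following stand-alone user-space sketch (hypothetical demo_* names and an arbitrary four-element array, not kernel code) shows the same arithmetic:

    #include <stdio.h>

    struct demo_node {
        int need_future_gp[4];  /* length must be a power of two for the mask trick */
    };

    #define demo_gp_mask(rnp) \
        (sizeof((rnp)->need_future_gp) / sizeof((rnp)->need_future_gp[0]) - 1)
    #define demo_gp_element(rnp, c) \
        ((rnp)->need_future_gp[(c) & demo_gp_mask(rnp)])

    int main(void)
    {
        struct demo_node rnp = { { 0 } };
        unsigned long c;

        for (c = 5; c < 9; c++)
            demo_gp_element(&rnp, c)++;  /* GP numbers 5..8 wrap onto slots 1, 2, 3, 0 */
        for (c = 0; c < 4; c++)
            printf("slot %lu: %d\n", c, rnp.need_future_gp[c]);
        return 0;
    }
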
Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 16 +++++++--------- kernel/rcu/tree.h | 15 +++++++++++++++ kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4bbba17422cd..79fb99951a0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -718,11 +718,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_future_needs_gp(struct rcu_state *rsp) { struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; - int *fp = &rnp->need_future_gp[idx]; lockdep_assert_irqs_disabled(); - return READ_ONCE(*fp); + return READ_ONCE(need_future_gp_element(rnp, rnp->completed)); } /* @@ -1699,7 +1697,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, */ c = rcu_cbs_completed(rdp->rsp, rnp); trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { + if (need_future_gp_element(rnp, c)) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); goto out; } @@ -1711,7 +1709,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * current grace period, we don't need to explicitly start one. */ if (rnp->gpnum != rnp->completed) { - rnp->need_future_gp[c & 0x1]++; + need_future_gp_element(rnp, c)++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; } @@ -1737,13 +1735,13 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * If the needed for the required grace period is already * recorded, trace and leave. */ - if (rnp_root->need_future_gp[c & 0x1]) { + if (need_future_gp_element(rnp_root, c)) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; + need_future_gp_element(rnp_root, c)++; /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { @@ -1771,8 +1769,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; + need_future_gp_element(rnp, c) = 0; + needmore = need_future_gp_element(rnp, c + 1); trace_rcu_future_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..18b091474ffa 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -159,6 +159,21 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; +/* Accessors for ->need_future_gp[] array. */ +#define need_future_gp_mask() \ + (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) +#define need_future_gp_element(rnp, c) \ + ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) +#define need_any_future_gp(rnp) \ +({ \ + int __i; \ + bool __nonzero = false; \ + \ + for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ + __nonzero = __nonzero || (rnp)->need_future_gp[__i]; \ + __nonzero; \ +}) + /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 84fbee4686d3..640ea927d8a4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1790,7 +1790,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) */ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; + need_future_gp_element(rnp, rnp->completed + 1) += nrq; } static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -- cgit v1.2.3-59-g8ed1b From 5fe0a56298e674358ff2740a6288bf21509d895d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 16:41:20 -0700 Subject: rcu: Make rcu_gp_kthread() check for early-boot activity The rcu_gp_kthread() function immediately sleeps waiting to be notified of the need for a new grace period, which currently works because there are a number of code sequences that will provide the needed wakeup later. However, some of these code sequences need to acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_gp_kthread() check for early-boot activity when it starts up, omitting the initial sleep in that case. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79fb99951a0c..497f139056c7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2192,6 +2192,12 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); + /* Check for early-boot work. */ + raw_spin_lock_irq_rcu_node(rnp); + if (need_any_future_gp(rnp)) + WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + raw_spin_unlock_irq_rcu_node(rnp); + rcu_bind_gp_kthread(); for (;;) { -- cgit v1.2.3-59-g8ed1b From fb31340f8a43a6f2e871164822ef4979b36232ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 07:20:30 -0700 Subject: rcu: Make rcu_gp_cleanup() more accurately predict need for new GP Currently, rcu_gp_cleanup() scans the rcu_node tree in order to reset state to reflect the end of the grace period. It also checks to see whether a new grace period is needed, but in a number of cases, rather than directly cause the new grace period to be immediately started, it instead leaves the grace-period-needed state where various fail-safes can find it. This works fine, but results in higher contention on the root rcu_node structure's ->lock, which is undesirable, and contention on that lock has recently become noticeable. This commit therefore makes rcu_gp_cleanup() immediately start a new grace period if there is any need for one. It is quite possible that it will later be necessary to throttle the grace-period rate, but that can be dealt with when and if. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 16 ++++++++++------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 17 ----------------- 3 files changed, 10 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 497f139056c7..afc5e32f0da4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1763,14 +1763,14 @@ out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. 
*/ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { int c = rnp->completed; - int needmore; + bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); need_future_gp_element(rnp, c) = 0; - needmore = need_future_gp_element(rnp, c + 1); + needmore = need_any_future_gp(rnp); trace_rcu_future_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; @@ -2113,7 +2113,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; - int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2152,7 +2151,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - nocb += rcu_future_gp_cleanup(rsp, rnp); + needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -2162,13 +2161,18 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ - rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; + /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); + if (need_any_future_gp(rnp)) { + trace_rcu_future_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); + needgp = true; + } /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; if (needgp || cpu_needs_another_gp(rsp, rdp)) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 18b091474ffa..bd1103763551 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -469,7 +469,6 @@ static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 640ea927d8a4..313b77d9cf06 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1780,19 +1780,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) swake_up_all(sq); } -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. 
- */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - need_future_gp_element(rnp, rnp->completed + 1) += nrq; -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; @@ -2495,10 +2482,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; -- cgit v1.2.3-59-g8ed1b From 51af970d19f395fc57b82514022126de6c5420cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Apr 2018 10:40:57 -0700 Subject: rcu: Avoid losing ->need_future_gp[] values due to GP start/end races The rcu_cbs_completed() function provides the value of ->completed at which new callbacks can safely be invoked. This is recorded in two-element ->need_future_gp[] arrays in the rcu_node structure, and the elements of these arrays corresponding to the just-completed grace period are zeroed at the end of that grace period. However, the rcu_cbs_completed() function can return the current ->completed value plus either one or two, so it is possible for the corresponding ->need_future_gp[] entry to be cleared just after it was set, thus losing a request for a future grace period. This commit avoids this race by expanding ->need_future_gp[] to four elements. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bd1103763551..952cd0c223fe 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -150,8 +150,7 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[2]; - /* Counts of upcoming no-CB GP requests. */ + int need_future_gp[4]; /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; -- cgit v1.2.3-59-g8ed1b From 0ae94e00ce40e4447080ab7675220f725c690330 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2018 14:14:42 -0700 Subject: rcu: Make rcu_future_needs_gp() check all ->need_future_gps[] elements Currently, the rcu_future_needs_gp() function checks only the current element of the ->need_future_gps[] array, which might miss elements that were offset from the expected element, for example, due to races with the start or the end of a grace period. This commit therefore makes rcu_future_needs_gp() use the need_any_future_gp() macro to check all of the elements of this array. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index afc5e32f0da4..b05ab6379562 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -720,7 +720,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); lockdep_assert_irqs_disabled(); - return READ_ONCE(need_future_gp_element(rnp, rnp->completed)); + return need_any_future_gp(rnp); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 952cd0c223fe..123c30eac8b5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -169,7 +169,8 @@ struct rcu_node { bool __nonzero = false; \ \ for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ - __nonzero = __nonzero || (rnp)->need_future_gp[__i]; \ + __nonzero = __nonzero || \ + READ_ONCE((rnp)->need_future_gp[__i]); \ __nonzero; \ }) -- cgit v1.2.3-59-g8ed1b From 6f576e281690316270275bbef17c79ea304ad511 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2018 16:50:31 -0700 Subject: rcu: Convert ->need_future_gp[] array to boolean There is no longer any need for ->need_future_gp[] to count the number of requests for future grace periods, so this commit converts the additions to assignments to "true" and reduces the size of each element to one byte. While we are in the area, fix an obsolete comment. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b05ab6379562..6ef1f2b4a6d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1709,7 +1709,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * current grace period, we don't need to explicitly start one. */ if (rnp->gpnum != rnp->completed) { - need_future_gp_element(rnp, c)++; + need_future_gp_element(rnp, c) = true; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; } @@ -1741,7 +1741,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* Record the need for the future grace period. */ - need_future_gp_element(rnp_root, c)++; + need_future_gp_element(rnp_root, c) = true; /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { @@ -1769,7 +1769,7 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - need_future_gp_element(rnp, c) = 0; + need_future_gp_element(rnp, c) = false; needmore = need_any_future_gp(rnp); trace_rcu_future_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 123c30eac8b5..9f97fd7f648c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -150,7 +150,7 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[4]; /* Counts of upcoming no-CB GP requests. */ + u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; -- cgit v1.2.3-59-g8ed1b From ec4eaccef4af28376345554580606a43d7392ed8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 22 Apr 2018 08:49:24 -0700 Subject: rcu: Make rcu_migrate_callbacks wake GP kthread when needed The rcu_migrate_callbacks() function invokes rcu_advance_cbs() twice, ignoring the return value. This is OK at pressent because of failsafe code that does the wakeup when needed. However, this failsafe code acquires the root rcu_node structure's lock frequently, while rcu_migrate_callbacks() does so only once per CPU-offline operation. This commit therefore makes rcu_migrate_callbacks() wake up the RCU GP kthread when either call to rcu_advance_cbs() returns true, thus removing need for the failsafe code. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ef1f2b4a6d3..f75eb5174021 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3876,6 +3876,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3887,12 +3888,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + /* Leverage recent GPs and set GP for new callbacks. */ + needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || + rcu_advance_cbs(rsp, rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", -- cgit v1.2.3-59-g8ed1b From a6058d85a2b24fa40ce7f0d7683989ec47b603b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 14:33:18 -0700 Subject: rcu: Avoid __call_rcu_core() root rcu_node ->lock acquisition When __call_rcu_core() notices excessive numbers of callbacks pending on the current CPU, we know that at least one of them is not yet classified, namely the one that was just now queued. Therefore, it is not necessary to invoke rcu_start_gp() and thus not necessary to acquire the root rcu_node structure's ->lock. This commit therefore replaces the rcu_start_gp() with rcu_accelerate_cbs(), thus replacing an acquisition of the root rcu_node structure's ->lock with that of this CPU's leaf rcu_node structure. This decreases contention on the root rcu_node structure's ->lock. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f75eb5174021..6396a3d10be9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2988,11 +2988,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. 
*/ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp_root); - needwake = rcu_start_gp(rsp); - raw_spin_unlock_rcu_node(rnp_root); + raw_spin_lock_rcu_node(rnp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); if (needwake) rcu_gp_kthread_wake(rsp); } else { -- cgit v1.2.3-59-g8ed1b From bd7af8463b9fae02b4c7d7248a088ca685ef184c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 09:51:20 -0700 Subject: rcu: Switch __rcu_process_callbacks() to rcu_accelerate_cbs() The __rcu_process_callbacks() function currently checks to see if the current CPU needs a grace period and also if there is any other reason to kick off a new grace period. This is one of the fail-safe checks that has been rendered unnecessary by the changes that increase the accuracy of rcu_gp_cleanup()'s estimate as to whether another grace period is required. Because this particular fail-safe involved acquiring the root rcu_node structure's ->lock, which has seen excessive contention in real life, this fail-safe needs to go. However, one check must remain, namely the check for newly arrived RCU callbacks that have not yet been associated with a grace period. One might hope that the checks in __note_gp_changes(), which is invoked indirectly from rcu_check_quiescent_state(), would suffice, but this function won't be invoked at all if RCU is idle. It is therefore necessary to replace the fail-safe checks with a simpler check for newly arrived callbacks during an RCU idle period, which is exactly what this commit does. This change removes the final call to rcu_start_gp(), so this function is removed as well. Note that lockless use of cpu_needs_another_gp() is racy, but that these races are harmless in this case. If RCU really is idle, the values will not change, so the return value from cpu_needs_another_gp() will be correct. If RCU is not idle, the resulting redundant call to rcu_accelerate_cbs() will be harmless, and might even have the benefit of reducing grace-period latency a bit. This commit also moves interrupt disabling into the "if" statement to improve real-time response a bit. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 53 +++++++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6396a3d10be9..fbacc486ed4c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2334,34 +2334,6 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, return true; } -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - * - * Returns true if the grace-period kthread needs to be awakened. - */ -static bool rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - bool ret = false; - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. 
Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; - ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; - return ret; -} - /* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period @@ -2889,22 +2861,27 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_node *rnp; WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ - needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } else { - local_irq_restore(flags); + /* No grace period and unregistered callbacks? */ + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist)) { + local_irq_save(flags); + if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { + local_irq_restore(flags); + } else { + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); + } } /* If there are callbacks ready, invoke them. */ -- cgit v1.2.3-59-g8ed1b From a508aa597ec2f046c00b8809f887f90cf1aaa47f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 09:05:50 -0700 Subject: rcu: Cleanup, don't put ->completed into an int It is true that currently only the low-order two bits are used, so there should be no problem given modern machines and compilers, but good hygiene and maintainability dictates use of an unsigned long instead of an int. This commit therefore makes this change. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fbacc486ed4c..c7b1e6b2a3da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1765,7 +1765,7 @@ out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; + unsigned long c = rnp->completed; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); -- cgit v1.2.3-59-g8ed1b From a824a287f6eaec65f1cf7aedfd5f6b69d2d3858f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 09:40:32 -0700 Subject: rcu: Clear request other than RCU_GP_FLAG_INIT at GP end Once the grace period has ended, any RCU_GP_FLAG_FQS requests are irrelevant: The grace period has ended, so there is no longer any point in forcing quiescent states in order to try to make it end sooner. This commit therefore causes rcu_gp_cleanup() to clear any bits other than RCU_GP_FLAG_INIT from ->gp_flags at the end of the grace period. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7b1e6b2a3da..25dbbc753fef 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2181,6 +2181,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } -- cgit v1.2.3-59-g8ed1b From d5cd96851d520e5caff13ddf99e3b2b759ae3b1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 10:45:06 -0700 Subject: rcu: Inline rcu_start_gp_advanced() into rcu_start_future_gp() The rcu_start_gp_advanced() is invoked only from rcu_start_future_gp() and much of its code is redundant when invoked from that context. This commit therefore inlines rcu_start_gp_advanced() into rcu_start_future_gp(), then removes rcu_start_gp_advanced(). Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 56 ++++++++++++------------------------------------------- 1 file changed, 12 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 25dbbc753fef..4433f68a1c7b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644); static ulong jiffies_till_sched_qs = HZ / 10; module_param(jiffies_till_sched_qs, ulong, 0444); -static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -1679,7 +1677,8 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * rcu_node structure's ->need_future_gp field. Returns true if there * is reason to awaken the grace-period kthread. * - * The caller must hold the specified rcu_node structure's ->lock. + * The caller must hold the specified rcu_node structure's ->lock, which + * is why the caller is responsible for waking the grace-period kthread. */ static bool __maybe_unused rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, @@ -1687,7 +1686,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, { unsigned long c; bool ret = false; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + struct rcu_state *rsp = rdp->rsp; + struct rcu_node *rnp_root = rcu_get_root(rsp); raw_lockdep_assert_held_rcu_node(rnp); @@ -1695,7 +1695,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. */ - c = rcu_cbs_completed(rdp->rsp, rnp); + c = rcu_cbs_completed(rsp, rnp); trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (need_future_gp_element(rnp, c)) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); @@ -1727,7 +1727,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * period in progress, it will be smaller than the one we obtained * earlier. Adjust callbacks as needed. 
*/ - c = rcu_cbs_completed(rdp->rsp, rnp_root); + c = rcu_cbs_completed(rsp, rnp_root); if (!rcu_is_nocb_cpu(rdp->cpu)) (void)rcu_segcblist_accelerate(&rdp->cblist, c); @@ -1748,7 +1748,12 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + if (!rsp->gp_kthread) + goto unlock_out; /* No grace-period kthread yet! */ + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), + TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ } unlock_out: if (rnp != rnp_root) @@ -2298,43 +2303,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } } -/* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - * - * Returns true if the grace-period kthread must be awakened. - */ -static bool -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return false; - } - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to our caller. - */ - return true; -} - /* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period -- cgit v1.2.3-59-g8ed1b From 41e80595abfc608eb0fe5148bcaed1ed78d7a6b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 11:24:09 -0700 Subject: rcu: Make rcu_start_future_gp() caller select grace period The rcu_accelerate_cbs() function selects a grace-period target, which it uses to have rcu_segcblist_accelerate() assign numbers to recently queued callbacks. Then it invokes rcu_start_future_gp(), which selects a grace-period target again, which is a bit pointless. This commit therefore changes rcu_start_future_gp() to take the grace-period target as a parameter, thus avoiding double selection. This commit also changes the name of rcu_start_future_gp() to rcu_start_this_gp() to reflect this change in functionality, and also makes a similar change to the name of trace_rcu_future_gp(). Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 53 ++++++++++++++++++++---------------------------- kernel/rcu/tree_plugin.h | 9 ++++---- 2 files changed, 27 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4433f68a1c7b..94519c7d552f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1659,12 +1659,9 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 2; } -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) +/* Trace-event wrapper function for trace_rcu_future_grace_period. */ +static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1672,33 +1669,27 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * Start some future grace period, as needed to handle newly arrived + * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. Returns true if there + * rcu_node structure's ->need_future_gp[] field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. */ -static bool __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long *c_out) +static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c) { - unsigned long c; bool ret = false; struct rcu_state *rsp = rdp->rsp; struct rcu_node *rnp_root = rcu_get_root(rsp); raw_lockdep_assert_held_rcu_node(rnp); - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); + /* If the specified GP is already known needed, return to caller. */ + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); if (need_future_gp_element(rnp, c)) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartleaf")); goto out; } @@ -1710,7 +1701,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, */ if (rnp->gpnum != rnp->completed) { need_future_gp_element(rnp, c) = true; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; } @@ -1736,7 +1727,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * recorded, trace and leave. */ if (need_future_gp_element(rnp_root, c)) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } @@ -1745,9 +1736,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* If a grace period is not already in progress, start one. 
*/ if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedroot")); if (!rsp->gp_kthread) goto unlock_out; /* No grace-period kthread yet! */ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); @@ -1759,8 +1750,6 @@ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); out: - if (c_out != NULL) - *c_out = c; return ret; } @@ -1776,8 +1765,8 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) need_future_gp_element(rnp, c) = false; needmore = need_any_future_gp(rnp); - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); + trace_rcu_this_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1812,6 +1801,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + unsigned long c; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1830,8 +1820,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) - ret = rcu_start_future_gp(rnp, rdp, NULL); + c = rcu_cbs_completed(rsp, rnp); + if (rcu_segcblist_accelerate(&rdp->cblist, c)) + ret = rcu_start_this_gp(rnp, rdp, c); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) @@ -2174,8 +2165,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); if (need_any_future_gp(rnp)) { - trace_rcu_future_gp(rnp, rdp, rsp->completed - 1, - TPS("CleanupMore")); + trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); needgp = true; } /* Advance CBs to reduce false positives below. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 313b77d9cf06..322777492fff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2035,7 +2035,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - needwake = rcu_start_future_gp(rnp, rdp, &c); + c = rcu_cbs_completed(rdp->rsp, rnp); + needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2044,7 +2045,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2052,9 +2053,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; WARN_ON(signal_pending(current)); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } -- cgit v1.2.3-59-g8ed1b From 360e0da67eab610b0efd53cbab3e1535095e7aa4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 12 Apr 2018 11:50:41 -0700 Subject: rcu: Add funnel locking to rcu_start_this_gp() The rcu_start_this_gp() function had a simple form of funnel locking that used only the leaves and root of the rcu_node tree, which is fine for systems with only a few hundred CPUs, but sub-optimal for systems having thousands of CPUs. This commit therefore adds full-tree funnel locking. This variant of funnel locking is unusual in the following ways: 1. The leaf-level rcu_node structure's ->lock is held throughout. Other funnel-locking implementations drop the leaf-level lock before progressing to the next level of the tree. 2. Funnel locking can be started at the root, which is convenient for code that already holds the root rcu_node structure's ->lock. Other funnel-locking implementations start at the leaves. 3. If an rcu_node structure other than the initial one believes that a grace period is in progress, it is not necessary to go further up the tree. This is because grace-period cleanup scans the full tree, so that marking the need for a subsequent grace period anywhere in the tree suffices -- but only if a grace period is currently in progress. 4. It is possible that the RCU grace-period kthread has not yet started, and this case must be handled appropriately. However, the general approach of using a tree to control lock contention is still in place. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 92 +++++++++++++++++++++---------------------------------- 1 file changed, 35 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 94519c7d552f..d3c769502929 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1682,74 +1682,52 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, { bool ret = false; struct rcu_state *rsp = rdp->rsp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_lockdep_assert_held_rcu_node(rnp); - - /* If the specified GP is already known needed, return to caller. */ - trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); - if (need_future_gp_element(rnp, c)) { - trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartleaf")); - goto out; - } + struct rcu_node *rnp_root; /* - * If this rcu_node structure believes that a grace period is in - * progress, then we must wait for the one following, which is in - * "c". Because our request will be noticed at the end of the - * current grace period, we don't need to explicitly start one. + * Use funnel locking to either acquire the root rcu_node + * structure's lock or bail out if the need for this grace period + * has already been recorded -- or has already started. If there + * is already a grace period in progress in a non-leaf node, no + * recording is needed because the end of the grace period will + * scan the leaf rcu_node structures. Note that rnp->lock must + * not be released. 
*/ - if (rnp->gpnum != rnp->completed) { - need_future_gp_element(rnp, c) = true; - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); - goto out; + raw_lockdep_assert_held_rcu_node(rnp); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); + if (need_future_gp_element(rnp_root, c) || + ULONG_CMP_GE(rnp_root->gpnum, c) || + (rnp != rnp_root && + rnp_root->gpnum != rnp_root->completed)) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + goto unlock_out; + } + need_future_gp_element(rnp_root, c) = true; + if (rnp_root != rnp && rnp_root->parent != NULL) + raw_spin_unlock_rcu_node(rnp_root); + if (!rnp_root->parent) + break; /* At root, and perhaps also leaf. */ } - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock_rcu_node(rnp_root); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. - */ - c = rcu_cbs_completed(rsp, rnp_root); - if (!rcu_is_nocb_cpu(rdp->cpu)) - (void)rcu_segcblist_accelerate(&rdp->cblist, c); - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (need_future_gp_element(rnp_root, c)) { - trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartedroot")); + /* If GP already in progress, just leave, otherwise start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } - - /* Record the need for the future grace period. */ - need_future_gp_element(rnp_root, c) = true; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedroot")); - if (!rsp->gp_kthread) - goto unlock_out; /* No grace-period kthread yet! */ - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - ret = true; /* Caller must wake GP kthread. */ + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + if (!rsp->gp_kthread) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + goto unlock_out; } + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); -out: return ret; } -- cgit v1.2.3-59-g8ed1b From 665f08f1ce9cf608a9435e11d66f55be4e72540a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 12:23:34 -0700 Subject: rcu: Make rcu_start_this_gp() check for out-of-range requests If rcu_start_this_gp() is invoked with a requested grace period more than three in the future, then either the ->need_future_gp[] array needs to be bigger or the caller needs to be repaired. This commit therefore adds a WARN_ON_ONCE() checking for this condition. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3c769502929..07bccb1f0c87 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1698,6 +1698,8 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); + WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + + need_future_gp_mask(), c)); if (need_future_gp_element(rnp_root, c) || ULONG_CMP_GE(rnp_root->gpnum, c) || (rnp != rnp_root && -- cgit v1.2.3-59-g8ed1b From 384f77f4cb765707216ea43f9122580d8a07be7d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 16:16:45 -0700 Subject: rcu: The rcu_gp_cleanup() function does not need cpu_needs_another_gp() All of the cpu_needs_another_gp() function's checks (except for newly arrived callbacks) have been subsumed into the rcu_gp_cleanup() function's scan of the rcu_node tree. This commit therefore drops the call to cpu_needs_another_gp(). The check for newly arrived callbacks is supplied by rcu_accelerate_cbs(). Any needed advancing (as in the earlier rcu_advance_cbs() call) will be supplied when the corresponding CPU becomes aware of the end of the now-completed grace period. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 07bccb1f0c87..7776d709e060 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2150,11 +2150,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = true; } /* Advance CBs to reduce false positives below. */ - needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; - if (needgp || cpu_needs_another_gp(rsp, rdp)) { + if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); -- cgit v1.2.3-59-g8ed1b From c1935209df8c903fc3a33143223338826fa54bd1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 16:29:13 -0700 Subject: rcu: Simplify and inline cpu_needs_another_gp() Now that RCU no longer relies on failsafe checks, cpu_needs_another_gp() can be greatly simplified. This simplification eliminates the last call to rcu_future_needs_gp() and to rcu_segcblist_future_gp_needed(), both of which which can then be eliminated. And then, because cpu_needs_another_gp() is called only from __rcu_pending(), it can be inlined and eliminated. This commit carries out the simplification, inlining, and elimination called out above. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/rcu_segcblist.c | 18 ------------------ kernel/rcu/rcu_segcblist.h | 2 -- kernel/rcu/tree.c | 40 +++------------------------------------- 3 files changed, 3 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 88cba7c2956c..5aff271adf1e 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -403,24 +403,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) return true; } -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - /* * Merge the source rcu_segcblist structure into the destination * rcu_segcblist structure, then initialize the source. Any pending diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 581c12b63544..948470cef385 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq); void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7776d709e060..020a0fe2dbee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -708,42 +708,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -/* - * Is there any need for future grace periods? - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_future_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - lockdep_assert_irqs_disabled(); - return need_any_future_gp(rnp); -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static bool -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_gp_in_progress(rsp)) - return false; /* No, a grace period is already in progress. */ - if (rcu_future_needs_gp(rsp)) - return true; /* Yes, a no-CBs CPU needs one. */ - if (!rcu_segcblist_is_enabled(&rdp->cblist)) - return false; /* No, this is a no-CBs (or offline) CPU. */ - if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - return true; /* Yes, CPU has newly registered callbacks. */ - if (rcu_segcblist_future_gp_needed(&rdp->cblist, - READ_ONCE(rsp->completed))) - return true; /* Yes, CBs for future grace period. */ - return false; /* No grace period needed. */ -} - /* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. 
@@ -3298,7 +3262,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; /* Has another RCU grace period completed? */ -- cgit v1.2.3-59-g8ed1b From a458360af63a36424c9f607015f0858aacb84a19 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 15:59:55 -0700 Subject: rcu: Drop early GP request check from rcu_gp_kthread() Now that grace-period requests use funnel locking and now that they set ->gp_flags to RCU_GP_FLAG_INIT even when the RCU grace-period kthread has not yet started, rcu_gp_kthread() no longer needs to check need_any_future_gp() at startup time. This commit therefore removes this check. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 020a0fe2dbee..ed238886e6ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2135,12 +2135,6 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); - /* Check for early-boot work. */ - raw_spin_lock_irq_rcu_node(rnp); - if (need_any_future_gp(rnp)) - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - raw_spin_unlock_irq_rcu_node(rnp); - rcu_bind_gp_kthread(); for (;;) { -- cgit v1.2.3-59-g8ed1b From 034777d7f5c6bc5326184ffa63b7a840ef0e9759 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 08:43:11 -0700 Subject: rcutorture: Print end-of-test state This commit adds end-of-test state printout to help check whether RCU shut down nicely. Note that this printout only helps for flavors of RCU that are not used much by the kernel. In particular, for normal RCU having a grace period in progress is expected behavior. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/rcutorture.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 680c96d8c00f..fd86965b33a7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1609,6 +1609,9 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int flags = 0; + unsigned long gpnum = 0; + unsigned long completed = 0; int i; rcutorture_record_test_transition(); @@ -1639,6 +1642,11 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); + pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", + cur_ops->name, gpnum, completed, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) -- cgit v1.2.3-59-g8ed1b From f64c6013a2029316ea552f99823d116ee5f5f955 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2018 09:50:53 -0700 Subject: rcu/x86: Provide early rcu_cpu_starting() callback The x86/mtrr code does horrific things because hardware. It uses stop_machine_from_inactive_cpu(), which does a wakeup (of the stopper thread on another CPU), which uses RCU, all before the CPU is onlined. 
RCU complains about this, because wakeups use RCU and RCU does (rightfully) not consider offline CPUs for grace-periods. Fix this by initializing RCU way early in the MTRR case. Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Add !SMP support, per 0day Test Robot report. ] --- arch/x86/kernel/cpu/mtrr/main.c | 4 ++++ include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 9 +++++++++ 5 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7468de429087..3ea0047beb40 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -793,6 +794,9 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; + + rcu_cpu_starting(smp_processor_id()); + /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 19d235fefdb9..e679b175b411 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -108,7 +108,6 @@ void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); -void rcu_cpu_starting(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_RCU_STALL_COMMON diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ce9beec35e34..7b3c82e8a625 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -132,5 +132,6 @@ static inline void rcu_all_qs(void) { barrier(); } #define rcutree_offline_cpu NULL #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL +static inline void rcu_cpu_starting(unsigned int cpu) { } #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 448f20f27396..914655848ef6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -101,5 +101,6 @@ int rcutree_online_cpu(unsigned int cpu); int rcutree_offline_cpu(unsigned int cpu); int rcutree_dead_cpu(unsigned int cpu); int rcutree_dying_cpu(unsigned int cpu); +void rcu_cpu_starting(unsigned int cpu); #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4fccdfa25716..aa7cade1b9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3665,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +static DEFINE_PER_CPU(int, rcu_cpu_started); + /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3686,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; struct rcu_state *rsp; + if (per_cpu(rcu_cpu_started, cpu)) + return; + + per_cpu(rcu_cpu_started, cpu) = 1; + for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; @@ -3742,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu) preempt_enable(); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); + + per_cpu(rcu_cpu_started, cpu) = 0; } /* Migrate the dead CPU's callbacks to the current CPU. */ -- cgit v1.2.3-59-g8ed1b
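The early call from mtrr_ap_init() is safe because of the per-CPU rcu_cpu_started flag added above: rcu_cpu_starting() becomes idempotent, so the normal hotplug-time invocation later simply returns, and rcu_report_dead() clears the flag so the next online cycle initializes again. A minimal standalone sketch of that guard pattern follows (ordinary userspace C; all names below are hypothetical, not kernel code).

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* One guard flag per CPU, mirroring the per-CPU rcu_cpu_started variable. */
static bool cpu_started[NR_CPUS];

/* May be called early (e.g. from quirk code) and again from normal hotplug. */
static void toy_cpu_starting(int cpu)
{
        if (cpu_started[cpu])
                return;                 /* Later calls are no-ops. */
        cpu_started[cpu] = true;
        printf("cpu %d: one-time online-side setup\n", cpu);
}

/* Called when the CPU goes offline, so the next online runs setup again. */
static void toy_report_dead(int cpu)
{
        cpu_started[cpu] = false;
        printf("cpu %d: offline, guard cleared\n", cpu);
}

int main(void)
{
        toy_cpu_starting(1);    /* Early caller, before the CPU is "online". */
        toy_cpu_starting(1);    /* Regular hotplug path: now a no-op. */
        toy_report_dead(1);
        toy_cpu_starting(1);    /* Next online cycle: setup runs once more. */
        return 0;
}

The sketch is single-threaded and deliberately ignores the ordering questions that the real hotplug path has to care about; only the "run once per online period" guard is being illustrated.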