From 6acce3ef84520537f8a09a12c9ddbe814a584dd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Oct 2013 14:38:20 +0200 Subject: sched: Remove get_online_cpus() usage Remove get_online_cpus() usage from the scheduler; there's 4 sites that use it: - sched_init_smp(); where its completely superfluous since we're in 'early' boot and there simply cannot be any hotplugging. - sched_getaffinity(); we already take a raw spinlock to protect the task cpus_allowed mask, this disables preemption and therefore also stabilizes cpu_online_mask as that's modified using stop_machine. However switch to active mask for symmetry with sched_setaffinity()/set_cpus_allowed_ptr(). We guarantee active mask stability by inserting sync_rcu/sched() into _cpu_down. - sched_setaffinity(); we don't appear to need get_online_cpus() either, there's two sites where hotplug appears relevant: * cpuset_cpus_allowed(); for the !cpuset case we use possible_mask, for the cpuset case we hold task_lock, which is a spinlock and thus for mainline disables preemption (might cause pain on RT). * set_cpus_allowed_ptr(); Holds all scheduler locks and thus has preemption properly disabled; also it already deals with hotplug races explicitly where it releases them. - migrate_swap(); we can make stop_two_cpus() do the heavy lifting for us with a little trickery. By adding a sync_sched/rcu() after the CPU_DOWN_PREPARE notifier we can provide preempt/rcu guarantees for cpu_active_mask. Use these to validate that both our cpus are active when queueing the stop work before we queue the stop_machine works for take_cpu_down(). Signed-off-by: Peter Zijlstra Cc: "Srivatsa S. Bhat" Cc: Paul McKenney Cc: Mel Gorman Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Linus Torvalds Cc: Andrew Morton Cc: Steven Rostedt Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20131011123820.GV3081@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a6..63aa50d7ce1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } smpboot_park_threads(cpu); + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so explicitly call both. + */ +#ifdef CONFIG_PREEMPT + synchronize_sched(); +#endif + synchronize_rcu(); + + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ -- cgit v1.3-14-g43fede From 01b0f19707c51ef247404e6af1d4a97a11ba34f7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Nov 2013 15:07:25 -0800 Subject: cpu/mem hotplug: add try_online_node() for cpu_up() cpu_up() has #ifdef CONFIG_MEMORY_HOTPLUG code blocks, which call mem_online_node() to put its node online if offlined and then call build_all_zonelists() to initialize the zone list. These steps are specific to memory hotplug, and should be managed in mm/memory_hotplug.c. lock_memory_hotplug() should also be held for the whole steps. For this reason, this patch replaces mem_online_node() with try_online_node(), which performs the whole steps with lock_memory_hotplug() held. try_online_node() is named after try_offline_node() as they have similar purpose. There is no functional change in this patch. Signed-off-by: Toshi Kani Reviewed-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 8 +++++++- kernel/cpu.c | 29 +++-------------------------- mm/memory_hotplug.c | 16 ++++++++++++++-- 3 files changed, 24 insertions(+), 29 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index dd38e62b84d2..22203c293f07 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -94,6 +94,8 @@ extern void __online_page_set_limits(struct page *page); extern void __online_page_increment_counters(struct page *page); extern void __online_page_free(struct page *page); +extern int try_online_node(int nid); + #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size); @@ -225,6 +227,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline int try_online_node(int nid) +{ + return 0; +} + static inline void lock_memory_hotplug(void) {} static inline void unlock_memory_hotplug(void) {} @@ -256,7 +263,6 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); -extern int mem_online_node(int nid); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/cpu.c b/kernel/cpu.c index 63aa50d7ce1e..973d034acf84 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -437,11 +437,6 @@ int cpu_up(unsigned int cpu) { int err = 0; -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; - pg_data_t *pgdat; -#endif - if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -452,27 +447,9 @@ int cpu_up(unsigned int cpu) return -EINVAL; } -#ifdef CONFIG_MEMORY_HOTPLUG - nid = cpu_to_node(cpu); - if (!node_online(nid)) { - err = mem_online_node(nid); - if (err) - return err; - } - - pgdat = NODE_DATA(nid); - if (!pgdat) { - printk(KERN_ERR - "Can't online cpu %d due to NULL pgdat\n", cpu); - return -ENOMEM; - } - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } -#endif + err = try_online_node(cpu_to_node(cpu)); + if (err) + return err; cpu_maps_update_begin(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5118028468eb..8285346be663 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1043,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -/* +/** + * try_online_node - online a node if offlined + * * called by cpu_up() to online a node without onlined memory. */ -int mem_online_node(int nid) +int try_online_node(int nid) { pg_data_t *pgdat; int ret; + if (node_online(nid)) + return 0; + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { + pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } @@ -1061,6 +1067,12 @@ int mem_online_node(int nid) ret = register_one_node(nid); BUG_ON(ret); + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } + out: unlock_memory_hotplug(); return ret; -- cgit v1.3-14-g43fede From 106dd5afde3cd10db7e1370b6ddc77f0b2496a75 Mon Sep 17 00:00:00 2001 From: Michael wang Date: Wed, 13 Nov 2013 11:10:56 +0800 Subject: sched: Fix endless sync_sched/rcu() loop inside _cpu_down() Commit 6acce3ef8: sched: Remove get_online_cpus() usage tries to do sync_sched/rcu() inside _cpu_down() but triggers: INFO: task swapper/0:1 blocked for more than 120 seconds. ... [] synchronize_rcu+0x2c/0x30 [] _cpu_down+0x2b2/0x340 ... It was caused by that in the rcu boost case we rely on smpboot thread to finish the rcu callback, which has already been parked before sync in here and leads to the endless sync_sched/rcu(). This patch exchanges the sequence of smpboot_park_threads() and sync_sched/rcu() to fix the bug. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5282EDC0.6060003@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index 63aa50d7ce1e..2227b58734a7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } - smpboot_park_threads(cpu); /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might * not imply sync_sched(), so explicitly call both. + * + * Do sync before park smpboot threads to take care the rcu boost case. */ #ifdef CONFIG_PREEMPT synchronize_sched(); #endif synchronize_rcu(); + smpboot_park_threads(cpu); + /* * So now all preempt/rcu users must observe !cpu_active(). */ -- cgit v1.3-14-g43fede