From f139caf2e89713687514d9db847a4fa2e29c87a2 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 12 Sep 2014 17:40:54 +0400
Subject: sched, cleanup, treewide: Remove set_current_state(TASK_RUNNING) after schedule()

schedule(), io_schedule() and schedule_timeout() always return with the
task in TASK_RUNNING state, so setting that state again right after the
call is unnecessary.

(All of the places changed by this patch are plainly visible in the
diff; the one exception is kiblnd_scheduler() from
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c, whose
schedule() call sits one line above the standard three context lines of
the unified diff.)

None of the places converted here uses set_current_state() for its
implied memory barrier (mb()).

Signed-off-by: Kirill Tkhai
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/1410529254.3569.23.camel@tkhai
Cc: Alasdair Kergon
Cc: Anil Belur
Cc: Arnd Bergmann
Cc: Dave Kleikamp
Cc: David Airlie
Cc: David Howells
Cc: Dmitry Eremin
Cc: Frank Blaschka
Cc: Greg Kroah-Hartman
Cc: Heiko Carstens
Cc: Helge Deller
Cc: Isaac Huang
Cc: James E.J. Bottomley
Cc: J. Bruce Fields
Cc: Jeff Dike
Cc: Jesper Nilsson
Cc: Jiri Slaby
Cc: Laura Abbott
Cc: Liang Zhen
Cc: Linus Torvalds
Cc: Martin Schwidefsky
Cc: Masaru Nomura
Cc: Michael Opdenacker
Cc: Mikael Starvik
Cc: Mike Snitzer
Cc: Neil Brown
Cc: Oleg Drokin
Cc: Peng Tao
Cc: Richard Weinberger
Cc: Robert Love
Cc: Steven Rostedt
Cc: Trond Myklebust
Cc: Ursula Braun
Cc: Zi Shen Lim
Cc: devel@driverdev.osuosl.org
Cc: dm-devel@redhat.com
Cc: dri-devel@lists.freedesktop.org
Cc: fcoe-devel@open-fcoe.org
Cc: jfs-discussion@lists.sourceforge.net
Cc: linux390@de.ibm.com
Cc: linux-afs@lists.infradead.org
Cc: linux-cris-kernel@axis.com
Cc: linux-kernel@vger.kernel.org
Cc: linux-nfs@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linux-raid@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: qla2xxx-upstream@qlogic.com
Cc: user-mode-linux-devel@lists.sourceforge.net
Cc: user-mode-linux-user@lists.sourceforge.net
Signed-off-by: Ingo Molnar
---
 arch/cris/arch-v10/drivers/sync_serial.c | 1 -
 arch/cris/arch-v32/drivers/sync_serial.c | 1 -
 arch/um/drivers/random.c                 | 1 -
 3 files changed, 3 deletions(-)

diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index 29eb02ab3f25..0f3983241e60 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -1086,7 +1086,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf,
         }
         local_irq_restore(flags);
         schedule();
-        set_current_state(TASK_RUNNING);
         remove_wait_queue(&port->out_wait_q, &wait);
         if (signal_pending(current))
                 return -EINTR;
diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c
index bbb806b68838..5a149134cfb5 100644
--- a/arch/cris/arch-v32/drivers/sync_serial.c
+++ b/arch/cris/arch-v32/drivers/sync_serial.c
@@ -1089,7 +1089,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf,
         }
 
         schedule();
-        set_current_state(TASK_RUNNING);
         remove_wait_queue(&port->out_wait_q, &wait);
 
         if (signal_pending(current))
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 9e3a72205827..dd16c902ff70 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -79,7 +79,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
                 set_task_state(current, TASK_INTERRUPTIBLE);
 
                 schedule();
-                set_task_state(current, TASK_RUNNING);
                 remove_wait_queue(&host_read_wait, &wait);
 
                 if (atomic_dec_and_test(&host_sleep_count)) {
--
cgit v1.2.3-59-g8ed1b
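For context, the pattern this cleanup targets is the classic open-coded
wait loop. A minimal sketch, assuming a hypothetical wait queue
my_wait_q and wakeup condition "condition" (illustrative only, not
taken from the patch):

  DECLARE_WAITQUEUE(wait, current);

  add_wait_queue(&my_wait_q, &wait);
  set_current_state(TASK_INTERRUPTIBLE);  /* arm the sleep */
  if (!condition)
          schedule();     /* only returns once woken, in TASK_RUNNING */
  remove_wait_queue(&my_wait_q, &wait);
  /* no set_current_state(TASK_RUNNING) needed: schedule() already did it */

Since schedule() guarantees the task is back in TASK_RUNNING on return,
the extra state write removed by this patch was pure noise.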
From d4311ff1a8da48d609db9500f121c15580dfeeb7 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin
Date: Fri, 12 Sep 2014 14:16:17 +0100
Subject: init/main.c: Give init_task a canary

Tasks get their end of stack set to STACK_END_MAGIC with the aim of
catching stack overruns. Currently this feature does not apply to
init_task. This patch removes that restriction.

Note that a similar patch was posted by Prarit Bhargava some time ago
but was never merged:

  http://marc.info/?l=linux-kernel&m=127144305403241&w=2

Signed-off-by: Aaron Tomlin
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Oleg Nesterov
Acked-by: Michael Ellerman
Cc: aneesh.kumar@linux.vnet.ibm.com
Cc: dzickus@redhat.com
Cc: bmr@redhat.com
Cc: jcastillo@redhat.com
Cc: jgh@redhat.com
Cc: minchan@kernel.org
Cc: tglx@linutronix.de
Cc: hannes@cmpxchg.org
Cc: Alex Thorlton
Cc: Andrew Morton
Cc: Benjamin Herrenschmidt
Cc: Daeseok Youn
Cc: David Rientjes
Cc: Fabian Frederick
Cc: Geert Uytterhoeven
Cc: Jiri Olsa
Cc: Kees Cook
Cc: Kirill A. Shutemov
Cc: Linus Torvalds
Cc: Masami Hiramatsu
Cc: Michael Opdenacker
Cc: Paul Mackerras
Cc: Prarit Bhargava
Cc: Rik van Riel
Cc: Rusty Russell
Cc: Seiji Aguchi
Cc: Steven Rostedt
Cc: Vladimir Davydov
Cc: Yasuaki Ishimatsu
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/1410527779-8133-2-git-send-email-atomlin@redhat.com
Signed-off-by: Ingo Molnar
---
 arch/powerpc/mm/fault.c    |  3 +--
 arch/x86/mm/fault.c        |  3 +--
 include/linux/sched.h      |  2 ++
 init/main.c                |  1 +
 kernel/fork.c              | 12 +++++++++---
 kernel/trace/trace_stack.c |  4 +---
 6 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51ab9e7e6c39..35d0760c3fa4 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -30,7 +30,6 @@
 #include
 #include
 #include
-#include <linux/magic.h>
 #include
 #include
 
@@ -538,7 +537,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
                 regs->nip);
 
         stackend = end_of_stack(current);
-        if (current != &init_task && *stackend != STACK_END_MAGIC)
+        if (*stackend != STACK_END_MAGIC)
                 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 
         die("Kernel access of bad area", regs, sig);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a24194681513..bc23a7043c65 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,7 +3,6 @@
  * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
  * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
  */
-#include <linux/magic.h>        /* STACK_END_MAGIC */
 #include <linux/sched.h>        /* test_thread_flag(), ... */
 #include <linux/kdebug.h>       /* oops_begin/end, ... */
 #include <linux/module.h>       /* search_exception_table */
@@ -710,7 +709,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
                 show_fault_oops(regs, error_code, address);
 
         stackend = end_of_stack(tsk);
-        if (tsk != &init_task && *stackend != STACK_END_MAGIC)
+        if (*stackend != STACK_END_MAGIC)
                 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
         tsk->thread.cr2 = address;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 82ff3d6efb19..118dca7d5a28 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -57,6 +57,7 @@ struct sched_param {
 #include
 #include
 #include
+#include <linux/magic.h>
 
 #include
 
@@ -2638,6 +2639,7 @@ static inline unsigned long stack_not_used(struct task_struct *p)
         return (unsigned long)n - (unsigned long)end_of_stack(p);
 }
 #endif
+extern void set_task_stack_end_magic(struct task_struct *tsk);
 
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
diff --git a/init/main.c b/init/main.c
index bb1aed928f21..5fc3fc7bd475 100644
--- a/init/main.c
+++ b/init/main.c
@@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void)
          * lockdep hash:
          */
         lockdep_init();
+        set_task_stack_end_magic(&init_task);
         smp_setup_processor_id();
         debug_objects_early_init();
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 9387ae8ab048..ad64248c4b18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
         return 0;
 }
 
+void set_task_stack_end_magic(struct task_struct *tsk)
+{
+        unsigned long *stackend;
+
+        stackend = end_of_stack(tsk);
+        *stackend = STACK_END_MAGIC;    /* for overflow detection */
+}
+
 static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
         struct task_struct *tsk;
         struct thread_info *ti;
-        unsigned long *stackend;
         int node = tsk_fork_get_node(orig);
         int err;
 
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
         setup_thread_stack(tsk, orig);
         clear_user_return_notifier(tsk);
         clear_tsk_need_resched(tsk);
-        stackend = end_of_stack(tsk);
-        *stackend = STACK_END_MAGIC;    /* for overflow detection */
+        set_task_stack_end_magic(tsk);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
         tsk->stack_canary = get_random_int();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..1636e41828c2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include <linux/magic.h>
 #include
 
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
                 i++;
         }
 
-        if ((current != &init_task &&
-                *(end_of_stack(current)) != STACK_END_MAGIC)) {
+        if (*end_of_stack(current) != STACK_END_MAGIC) {
                 print_max_stack();
                 BUG();
         }
--
cgit v1.2.3-59-g8ed1b
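As an aside, the canary mechanism itself is easy to demonstrate outside
the kernel. A minimal, self-contained userspace sketch of the same idea
(the buffer and helper names are invented for illustration; only the
0x57AC6E9D value is the kernel's real STACK_END_MAGIC constant):

  #include <stdio.h>
  #include <string.h>

  #define STACK_END_MAGIC 0x57AC6E9DUL   /* same value the kernel uses */

  static unsigned long fake_stack[1024]; /* stands in for a task's stack */

  /* Analogue of set_task_stack_end_magic(): plant the canary at the
   * end of the stack area, where an overrun would hit first. */
  static void set_stack_end_magic(void)
  {
          fake_stack[0] = STACK_END_MAGIC;
  }

  static int stack_end_corrupted(void)
  {
          return fake_stack[0] != STACK_END_MAGIC;
  }

  int main(void)
  {
          set_stack_end_magic();
          printf("corrupted: %d\n", stack_end_corrupted());     /* 0 */

          memset(fake_stack, 0xff, 64);   /* simulate a stack overrun */
          printf("corrupted: %d\n", stack_end_corrupted());     /* 1 */
          return 0;
  }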
From a70857e46dd13e87ae06bf0e64cb6a2d4f436265 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin
Date: Fri, 12 Sep 2014 14:16:18 +0100
Subject: sched: Add helper for task stack page overrun checking

This facility is used in a few places, so let's introduce a helper
function to improve code readability.

Signed-off-by: Aaron Tomlin
Signed-off-by: Peter Zijlstra (Intel)
Cc: aneesh.kumar@linux.vnet.ibm.com
Cc: dzickus@redhat.com
Cc: bmr@redhat.com
Cc: jcastillo@redhat.com
Cc: oleg@redhat.com
Cc: riel@redhat.com
Cc: prarit@redhat.com
Cc: jgh@redhat.com
Cc: minchan@kernel.org
Cc: mpe@ellerman.id.au
Cc: tglx@linutronix.de
Cc: hannes@cmpxchg.org
Cc: Andrew Morton
Cc: Benjamin Herrenschmidt
Cc: Jiri Olsa
Cc: Linus Torvalds
Cc: Masami Hiramatsu
Cc: Michael Ellerman
Cc: Paul Mackerras
Cc: Seiji Aguchi
Cc: Steven Rostedt
Cc: Yasuaki Ishimatsu
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/1410527779-8133-3-git-send-email-atomlin@redhat.com
Signed-off-by: Ingo Molnar
---
 arch/powerpc/mm/fault.c    | 4 +---
 arch/x86/mm/fault.c        | 4 +---
 include/linux/sched.h      | 2 ++
 kernel/trace/trace_stack.c | 2 +-
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 35d0760c3fa4..99b2f2775658 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -507,7 +507,6 @@ bail:
 void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
         const struct exception_table_entry *entry;
-        unsigned long *stackend;
 
         /* Are we prepared to handle this fault? */
         if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -536,8 +535,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
         printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
                 regs->nip);
 
-        stackend = end_of_stack(current);
-        if (*stackend != STACK_END_MAGIC)
+        if (task_stack_end_corrupted(current))
                 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 
         die("Kernel access of bad area", regs, sig);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index bc23a7043c65..6240bc7ae741 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -648,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
            unsigned long address, int signal, int si_code)
 {
         struct task_struct *tsk = current;
-        unsigned long *stackend;
         unsigned long flags;
         int sig;
 
@@ -708,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
                 show_fault_oops(regs, error_code, address);
 
-        stackend = end_of_stack(tsk);
-        if (*stackend != STACK_END_MAGIC)
+        if (task_stack_end_corrupted(tsk))
                 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
         tsk->thread.cr2 = address;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 118dca7d5a28..18f52624eaa6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2617,6 +2617,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 }
 #endif
 
+#define task_stack_end_corrupted(task) \
+        (*(end_of_stack(task)) != STACK_END_MAGIC)
 
 static inline int object_is_on_stack(void *obj)
 {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1636e41828c2..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -170,7 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
                 i++;
         }
 
-        if (*end_of_stack(current) != STACK_END_MAGIC) {
+        if (task_stack_end_corrupted(current)) {
                 print_max_stack();
                 BUG();
         }
--
cgit v1.2.3-59-g8ed1b
From d3bfca1a7b028a57d648dbc0985492c6a4466ccf Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Tue, 26 Aug 2014 13:06:48 +0200
Subject: ARM: topology: Use the new cpu_capacity interface

Use the new arch_scale_cpu_capacity() scheduler facility in order to
reflect the original capacity of a CPU, instead of
arch_scale_freq_capacity(), which is about scaling the capacity with
the CPU's current frequency.

Signed-off-by: Vincent Guittot
Acked-by: Nicolas Pitre
Signed-off-by: Peter Zijlstra (Intel)
Cc: preeti@linux.vnet.ibm.com
Cc: riel@redhat.com
Cc: Morten.Rasmussen@arm.com
Cc: efault@gmx.de
Cc: daniel.lezcano@linaro.org
Cc: dietmar.eggemann@arm.com
Cc: Grant Likely
Cc: Guenter Roeck
Cc: Linus Torvalds
Cc: Mark Brown
Cc: Nicolas Pitre
Cc: Rob Herring
Cc: Russell King
Cc: Vincent Guittot
Cc: devicetree@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1409051215-16788-6-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar
---
 arch/arm/kernel/topology.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index e35d880f9773..89cfdd6e50cb 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,7 +42,7 @@
  */
 static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
         return per_cpu(cpu_scale, cpu);
 }
@@ -166,7 +166,7 @@ static void update_cpu_capacity(unsigned int cpu)
         set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
 
         printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
-                cpu, arch_scale_freq_capacity(NULL, cpu));
+                cpu, arch_scale_cpu_capacity(NULL, cpu));
 }
 
 #else
--
cgit v1.2.3-59-g8ed1b
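The distinction between the two hooks is worth spelling out. A rough,
runnable userspace sketch of the idea (all CPU numbers and capacity
values here are invented for illustration; only the 1024-based
SCHED_CAPACITY_SCALE unit mirrors the scheduler's convention):

  #include <stdio.h>

  #define SCHED_CAPACITY_SCALE 1024UL

  /* arch_scale_cpu_capacity() analogue: the *invariant* performance
   * of a CPU relative to the strongest one, e.g. big vs. LITTLE. */
  static unsigned long cpu_capacity(int cpu)
  {
          return cpu < 2 ? SCHED_CAPACITY_SCALE : 430;  /* 2 big, 2 LITTLE */
  }

  /* arch_scale_freq_capacity() analogue: how much of that capacity is
   * available at the CPU's *current* frequency (here: 80% of fmax). */
  static unsigned long freq_capacity(int cpu)
  {
          (void)cpu;
          return SCHED_CAPACITY_SCALE * 80 / 100;
  }

  int main(void)
  {
          for (int cpu = 0; cpu < 4; cpu++)
                  printf("CPU%d: invariant=%lu, at current freq=%lu\n",
                         cpu, cpu_capacity(cpu),
                         cpu_capacity(cpu) * freq_capacity(cpu) /
                         SCHED_CAPACITY_SCALE);
          return 0;
  }

The patch's point is that the ARM per-cpu cpu_scale value describes the
first, invariant notion, so it belongs behind arch_scale_cpu_capacity().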
From c55f5158f5606f8a62e694b7e009f59b92ac6258 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 23 Sep 2014 17:06:41 +0200
Subject: sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW

Kirill found that there's a subtle race in the
__ARCH_WANT_UNLOCKED_CTXSW code; instead of fixing it, remove the
entire exception, because neither arch that uses it appears to actually
still require it.

Boot tested on mips64el (qemu) only.

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kirill Tkhai
Cc: Andrew Morton
Cc: Davidlohr Bueso
Cc: Fenghua Yu
Cc: James Hogan
Cc: Kees Cook
Cc: Linus Torvalds
Cc: Paul Burton
Cc: Qais Yousef
Cc: Ralf Baechle
Cc: Tony Luck
Cc: oleg@redhat.com
Cc: linux@roeck-us.net
Cc: linux-ia64@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/20140923150641.GH3312@worktop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 arch/ia64/include/asm/processor.h |  1 -
 arch/mips/include/asm/processor.h |  6 ------
 kernel/sched/core.c               |  6 ------
 kernel/sched/sched.h              | 30 ------------------------------
 4 files changed, 43 deletions(-)

diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index c7367130ab14..ce53c50d0ba4 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -19,7 +19,6 @@
 #include
 #include
 
-#define __ARCH_WANT_UNLOCKED_CTXSW
 #define ARCH_HAS_PREFETCH_SWITCH_STACK
 #define IA64_NUM_PHYS_STACK_REG 96
 
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index 05f08438a7c4..f1df4cb4a286 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -397,12 +397,6 @@ unsigned long get_wchan(struct task_struct *p);
 #define ARCH_HAS_PREFETCHW
 #define prefetchw(x) __builtin_prefetch((x), 1, 1)
 
-/*
- * See Documentation/scheduler/sched-arch.txt; prevents deadlock on SMP
- * systems.
- */
-#define __ARCH_WANT_UNLOCKED_CTXSW
-
 #endif
 
 #endif /* _ASM_PROCESSOR_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d65566d07fcf..5b0eac9f4e78 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2331,10 +2331,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
          */
         post_schedule(rq);
 
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
-        /* In this case, finish_task_switch does not reenable preemption */
-        preempt_enable();
-#endif
         if (current->set_child_tid)
                 put_user(task_pid_vnr(current), current->set_child_tid);
 }
@@ -2377,9 +2373,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * of the scheduler it's an obvious special-case), so we
          * do an early lockdep release here:
          */
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-#endif
         context_tracking_task_switch(prev, next);
 
         /* Here we just switch the register state and the stack. */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 16e1ca9cb7e8..6130251de280 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -975,7 +975,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 # define finish_arch_post_lock_switch() do { } while (0)
 #endif
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -1013,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
         raw_spin_unlock_irq(&rq->lock);
 }
 
-#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
-        /*
-         * We can optimise this out completely for !SMP, because the
-         * SMP rebalancing from interrupt is the only thing that cares
-         * here.
-         */
-        next->on_cpu = 1;
-#endif
-        raw_spin_unlock(&rq->lock);
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
-        /*
-         * After ->on_cpu is cleared, the task can be moved to a different CPU.
-         * We must ensure this doesn't happen until the switch is completely
-         * finished.
-         */
-        smp_wmb();
-        prev->on_cpu = 0;
-#endif
-        local_irq_enable();
-}
-#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
 /*
  * wake flags
  */
--
cgit v1.2.3-59-g8ed1b
From cebf15eb09a2fd2fa73ee4faa9c4d2f813cf0f09 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Thu, 18 Sep 2014 12:33:34 -0700
Subject: x86, sched: Add new topology for multi-NUMA-node CPUs

I'm getting the spew below when booting with Haswell (Xeon E5-2699 v3)
CPUs and the "Cluster-on-Die" (CoD) feature enabled in the BIOS. It
seems similar to the issue that some folks from AMD ran into on their
systems and addressed in this commit:

  161270fc1f9d ("x86/smp: Fix topology checks on AMD MCM CPUs")

Both these Intel and AMD systems break an assumption which is being
enforced by topology_sane(): a socket may not contain more than one
NUMA node.

AMD special-cased their system by looking for a cpuid flag. The Intel
mode depends on BIOS options, and I do not know of a way in which it is
enumerated other than the tables being parsed during the CPU bringup
process. In other words, we have to trust the ACPI tables.

This patch detects the situation where a NUMA node occurs in the middle
of the "CPU" sched domains. It replaces the default topology with one
that relies on the NUMA information from the firmware (SRAT table) for
all levels of sched domains above the hyperthreads.

This also fixes a sysfs bug.
We used to freak out when we saw the "mc" group cross a node boundary,
so we stopped building the MC group. MC gets exported as the
'core_siblings_list' in /sys/devices/system/cpu/cpu*/topology/, and
this caused CPUs with the same 'physical_package_id' to not be listed
together in 'core_siblings_list'. This violates a statement from
Documentation/ABI/testing/sysfs-devices-system-cpu:

        core_siblings: internal kernel map of cpu#'s hardware threads
        within the same physical_package_id.

        core_siblings_list: human-readable list of the logical CPU
        numbers within the same physical_package_id as cpu#.

The sysfs effects here cause an issue with the hwloc tool, where it
gets confused and thinks there are more sockets than are physically
present.

Before this patch, there are two packages:

  # cd /sys/devices/system/cpu/
  # cat cpu*/topology/physical_package_id | sort | uniq -c
       18 0
       18 1

But 4 _sets_ of core siblings:

  # cat cpu*/topology/core_siblings_list | sort | uniq -c
        9 0-8
        9 18-26
        9 27-35
        9 9-17

After this set, there are only 2 sets of core siblings, which is what
we expect for a 2-socket system:

  # cat cpu*/topology/physical_package_id | sort | uniq -c
       18 0
       18 1
  # cat cpu*/topology/core_siblings_list | sort | uniq -c
       18 0-17
       18 18-35

Example spew:

  ...
  NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
   #2  #3  #4  #5  #6  #7  #8
  .... node  #1, CPUs:    #9
  ------------[ cut here ]------------
  WARNING: CPU: 9 PID: 0 at /home/ak/hle/linux-hle-2.6/arch/x86/kernel/smpboot.c:306 topology_sane.isra.2+0x74/0x90()
  sched: CPU #9's mc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
  Modules linked in:
  CPU: 9 PID: 0 Comm: swapper/9 Not tainted 3.17.0-rc1-00293-g8e01c4d-dirty #631
  Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS GRNDSDP1.86B.0036.R05.1407140519 07/14/2014
   0000000000000009 ffff88046ddabe00 ffffffff8172e485 ffff88046ddabe48
   ffff88046ddabe38 ffffffff8109691d 000000000000b001 0000000000000009
   ffff88086fc12580 000000000000b020 0000000000000009 ffff88046ddabe98
  Call Trace:
   [] dump_stack+0x45/0x56
   [] warn_slowpath_common+0x7d/0xa0
   [] warn_slowpath_fmt+0x4c/0x50
   [] topology_sane.isra.2+0x74/0x90
   [] set_cpu_sibling_map+0x31e/0x4f0
   [] start_secondary+0x1ad/0x240
  ---[ end trace 3fe5f587a9fcde61 ]---
   #10 #11 #12 #13 #14 #15 #16 #17
  .... node  #2, CPUs:   #18 #19 #20 #21 #22 #23 #24 #25 #26
  .... node  #3, CPUs:   #27 #28 #29 #30 #31 #32 #33 #34 #35

Signed-off-by: Dave Hansen
[ Added LLC domain and s/match_mc/match_die/ ]
Signed-off-by: Peter Zijlstra (Intel)
Cc: Borislav Petkov
Cc: David Rientjes
Cc: Igor Mammedov
Cc: Linus Torvalds
Cc: Prarit Bhargava
Cc: Toshi Kani
Cc: brice.goglin@gmail.com
Peter Anvin" Link: http://lkml.kernel.org/r/20140918193334.C065EBCE@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 55 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2d872e08fab9..8de8eb756d1f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -295,12 +295,20 @@ void smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static bool +topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); +} + static bool topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + return !WARN_ONCE(!topology_same_node(c, o), "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " "[node: %d != %d]. Ignoring dependency.\n", cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); @@ -341,17 +349,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +/* + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) { - if (cpu_has(c, X86_FEATURE_AMD_DCM)) - return true; - - return topology_sane(c, o, "mc"); - } + if (c->phys_proc_id == o->phys_proc_id) + return true; return false; } +static struct sched_domain_topology_level numa_inside_package_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { NULL, }, +}; +/* + * set_sched_topology() sets the topology internal to a CPU. The + * NUMA topologies are layered on top of it to build the full + * system topology. + * + * If NUMA nodes are observed to occur within a CPU package, this + * function should be called. It forces the sched domain code to + * only use the SMT level for the CPU portion of the topology. + * This essentially falls back to relying on NUMA information + * from the SRAT table to describe the entire system topology + * (except for hyperthreads). 
From 728e5653e6fdb2a0892e94a600aef8c9a036c7eb Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 30 Sep 2014 14:45:46 -0700
Subject: sched/x86: Fix up typo in topology detection

Commit:

  cebf15eb09a2 ("x86, sched: Add new topology for multi-NUMA-node CPUs")

added some code to try to detect the situation where we have a NUMA
node inside of the "DIE" sched domain.

It detected this by looking for CPUs which match_die() but do not match
NUMA nodes via topology_same_node().

I wrote it up as:

        if (match_die(c, o) == !topology_same_node(c, o))

which actually seemed to work some of the time, albeit accidentally. It
should have been doing an &&, not an ==.

This code essentially chopped off the "DIE" domain on one of Andrew
Morton's systems. He reported that this patch fixed his issue.

Signed-off-by: Dave Hansen
Reported-by: Andrew Morton
Signed-off-by: Peter Zijlstra (Intel)
Cc: Dave Hansen
Cc: Andrew Morton
Cc: David Rientjes
Cc: Igor Mammedov
Cc: Jan Kiszka
Cc: Lan Tianyu
Cc: Linus Torvalds
Cc: Prarit Bhargava
Cc: Toshi Kani
Link: http://lkml.kernel.org/r/20140930214546.FD481CFF@viggo.jf.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/smpboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8de8eb756d1f..bae9e09291ec 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -445,7 +445,7 @@ void set_cpu_sibling_map(int cpu)
                 } else if (i != cpu && !c->booted_cores)
                         c->booted_cores = cpu_data(i).booted_cores;
         }
 
-        if (match_die(c, o) == !topology_same_node(c, o))
+        if (match_die(c, o) && !topology_same_node(c, o))
                 primarily_use_numa_for_topology();
 }
--
cgit v1.2.3-59-g8ed1b
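The difference between the broken and the fixed condition is easiest to
see by enumerating it. A small standalone program, with variable names
chosen to mirror the two predicates (illustrative only):

  #include <stdio.h>

  int main(void)
  {
          /* die = match_die(c, o), node = topology_same_node(c, o) */
          for (int die = 0; die <= 1; die++)
                  for (int node = 0; node <= 1; node++)
                          printf("die=%d node=%d  buggy(==):%d  fixed(&&):%d\n",
                                 die, node, die == !node, die && !node);
          return 0;
  }

Both forms fire for die=1, node=0 (the intended NUMA-inside-package
case), but the buggy == form also fires for die=0, node=0 — CPU pairs
that share neither a die nor a node — which is why it only "worked some
of the time": it spuriously dropped the DIE domain on ordinary
multi-socket machines.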
From 347abad981c1ef815ea5ba861adba6a8c6aa1580 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Tue, 30 Sep 2014 15:59:47 -0400
Subject: sched, time: Fix build error with 64 bit cputime_t on 32 bit systems

On 32 bit systems cmpxchg cannot handle 64 bit values, so some
additional magic is required to allow a 32 bit system with
CONFIG_VIRT_CPU_ACCOUNTING_GEN=y enabled to build.

Make sure the correct cmpxchg function is used when doing an atomic
swap of a cputime_t.

Reported-by: Arnd Bergmann
Signed-off-by: Rik van Riel
Acked-by: Arnd Bergmann
Signed-off-by: Peter Zijlstra (Intel)
Cc: umgwanakikbuti@gmail.com
Cc: fweisbec@gmail.com
Cc: srao@redhat.com
Cc: lwoodman@redhat.com
Cc: atheurer@redhat.com
Cc: oleg@redhat.com
Cc: Andrew Morton
Cc: Benjamin Herrenschmidt
Cc: Heiko Carstens
Cc: Linus Torvalds
Cc: Martin Schwidefsky
Cc: Michael Ellerman
Cc: Paul Mackerras
Cc: linux390@de.ibm.com
Cc: linux-arch@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Link: http://lkml.kernel.org/r/20140930155947.070cdb1f@annuminas.surriel.com
Signed-off-by: Ingo Molnar
---
 arch/powerpc/include/asm/cputime.h    |  2 ++
 arch/s390/include/asm/cputime.h       |  2 ++
 include/asm-generic/cputime_jiffies.h |  2 ++
 include/asm-generic/cputime_nsecs.h   |  2 ++
 kernel/sched/cputime.c                | 29 +++++++++++++++++++----------
 5 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 607559ab271f..6c840ceab820 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { }
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
+
 #ifdef __KERNEL__
 
 /*
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index f65bd3634519..3001887f94b7 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -18,6 +18,8 @@
 typedef unsigned long long __nocast cputime_t;
 typedef unsigned long long __nocast cputime64_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
+
 static inline unsigned long __div(unsigned long long n, unsigned long base)
 {
 #ifndef CONFIG_64BIT
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
index d5cb78f53986..fe386fc6e85e 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -3,6 +3,8 @@
 
 typedef unsigned long __nocast cputime_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
+
 #define cputime_one_jiffy               jiffies_to_cputime(1)
 #define cputime_to_jiffies(__ct)        (__force unsigned long)(__ct)
 #define cputime_to_scaled(__ct)         (__ct)
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index 4e817606c549..0419485891f2 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -21,6 +21,8 @@
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
+
 #define cputime_one_jiffy               jiffies_to_cputime(1)
 
 #define cputime_div(__ct, divisor)  div_u64((__force u64)__ct, divisor)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 64492dff8a81..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -554,6 +554,23 @@ drop_precision:
         return (__force cputime_t) scaled;
 }
 
+/*
+ * Atomically advance counter to the new value. Interrupts, vcpu
+ * scheduling, and scaling inaccuracies can cause cputime_advance
+ * to be occasionally called with a new value smaller than counter.
+ * Let's enforce atomicity.
+ *
+ * Normally a caller will only go through this loop once, or not
+ * at all in case a previous caller updated counter the same jiffy.
+ */
+static void cputime_advance(cputime_t *counter, cputime_t new)
+{
+        cputime_t old;
+
+        while (new > (old = ACCESS_ONCE(*counter)))
+                cmpxchg_cputime(counter, old, new);
+}
+
 /*
  * Adjust tick based cputime random precision against scheduler
  * runtime accounting.
@@ -599,16 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
                 utime = rtime - stime;
         }
 
-        /*
-         * If the tick based count grows faster than the scheduler one,
-         * the result of the scaling may go backward.
-         * Let's enforce monotonicity.
-         * Atomic exchange protects against concurrent cputime_adjust().
-         */
-        while (stime > (rtime = ACCESS_ONCE(prev->stime)))
-                cmpxchg(&prev->stime, rtime, stime);
-        while (utime > (rtime = ACCESS_ONCE(prev->utime)))
-                cmpxchg(&prev->utime, rtime, utime);
+        cputime_advance(&prev->stime, stime);
+        cputime_advance(&prev->utime, utime);
 
 out:
         *ut = prev->utime;
--
cgit v1.2.3-59-g8ed1b
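The cputime_advance() loop is a standard lock-free "monotonic advance"
idiom. A self-contained userspace analogue using C11 atomics — the
kernel version instead picks cmpxchg() or cmpxchg64() per architecture
via the new cmpxchg_cputime() macro, and the names below are invented
for illustration:

  #include <stdatomic.h>
  #include <stdio.h>

  /* Advance *counter to new, but never move it backwards; safe against
   * concurrent callers because a failed CAS refreshes 'old'. */
  static void counter_advance(_Atomic unsigned long long *counter,
                              unsigned long long new)
  {
          unsigned long long old = atomic_load(counter);

          while (new > old &&
                 !atomic_compare_exchange_weak(counter, &old, new))
                  ;       /* retry: 'old' now holds the latest value */
  }

  int main(void)
  {
          _Atomic unsigned long long c = 100;

          counter_advance(&c, 150);       /* moves forward */
          counter_advance(&c, 120);       /* stale value: ignored */
          printf("%llu\n", atomic_load(&c));      /* prints 150 */
          return 0;
  }

Pulling the two open-coded while/cmpxchg loops into one helper also
keeps the 32-bit vs. 64-bit cmpxchg choice in a single place.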