From 2fff78c784ed97a8e5aa225ef5228f0a6d862d82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 18:10:10 +0100 Subject: futex: fix reference leak Catalin noticed that (38d47c1b7075: futex: rely on get_user_pages() for shared futexes) caused an mm_struct leak. Some tracing with the function graph tracer quickly pointed out that futex_wait() has exit paths with unbalanced reference counts. This regression was discovered by kmemleak. Reported-by: Catalin Marinas Signed-off-by: Peter Zijlstra Tested-by: "Pallipadi, Venkatesh" Tested-by: Catalin Marinas Signed-off-by: Ingo Molnar --- kernel/futex.c | 53 ++++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f89d373a9c6d..438701adce23 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1165,6 +1165,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct task_struct *curr = current; + struct restart_block *restart; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; @@ -1216,11 +1217,13 @@ retry: if (!ret) goto retry; - return ret; + goto out; } ret = -EWOULDBLOCK; - if (uval != val) - goto out_unlock_put_key; + if (unlikely(uval != val)) { + queue_unlock(&q, hb); + goto out_put_key; + } /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); @@ -1284,38 +1287,38 @@ retry: */ /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; if (!unqueue_me(&q)) - return 0; + goto out_put_key; + ret = -ETIMEDOUT; if (rem) - return -ETIMEDOUT; + goto out_put_key; /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ + ret = -ERESTARTSYS; if (!abs_time) - return -ERESTARTSYS; - else { - struct restart_block *restart; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; - restart->futex.val = val; - restart->futex.time = abs_time->tv64; - restart->futex.bitset = bitset; - restart->futex.flags = 0; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; - return -ERESTART_RESTARTBLOCK; - } + goto out_put_key; -out_unlock_put_key: - queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->futex.uaddr = (u32 *)uaddr; + restart->futex.val = val; + restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; + restart->futex.flags = 0; + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + ret = -ERESTART_RESTARTBLOCK; + +out_put_key: + put_futex_key(fshared, &q.key); out: return ret; } -- cgit v1.2.3-59-g8ed1b From a0490fa35dc0022ef95f64802e8edf18c411c790 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 11:35:40 +0100 Subject: sched: cpu hotplug fix rq_attach_root() does a kfree() with the runqueue lock held. That's not a very wise move, fix it. Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1d0ed360088..410eec404133 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6944,20 +6944,26 @@ static void free_rootdomain(struct root_domain *rd) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { + struct root_domain *old_rd = NULL; unsigned long flags; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - struct root_domain *old_rd = rq->rd; + old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); cpumask_clear_cpu(rq->cpu, old_rd->span); - if (atomic_dec_and_test(&old_rd->refcount)) - free_rootdomain(old_rd); + /* + * If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; } atomic_inc(&rd->refcount); @@ -6968,6 +6974,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + free_rootdomain(old_rd); } static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) -- cgit v1.2.3-59-g8ed1b From 3997ad317fdf9ecdb5702e2b4fd1f8229814ff8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Feb 2009 15:00:52 +0100 Subject: timers: more consistently use clock vs timer While reviewing the manpages, I noticed I'd missed some clock vs timer sites. Make sure that all timer functions call cpu_timer_sample_group() and not cpu_clock_sample_group(). This ensures that we enable the process wide timer in time, and therefore pay the O(n) thread group cost from the syscall. Not doing it here, will result in the first jiffy tick after setting the timer doing this, resulting in a very expensive tick (but only once) and a delay in actually starting the timer. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 60 +++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2313a4cc14ea..e976e505648d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -680,6 +680,33 @@ static void cpu_timer_fire(struct k_itimer *timer) } } +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -741,7 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &val); } else { - cpu_clock_sample_group(timer->it_clock, p, &val); + cpu_timer_sample_group(timer->it_clock, p, &val); } if (old) { @@ -889,7 +916,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) read_unlock(&tasklist_lock); goto dead; } else { - cpu_clock_sample_group(timer->it_clock, p, &now); + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead = (unlikely(p->exit_state) && thread_group_empty(p)); } @@ -1244,7 +1271,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) clear_dead_task(timer, now); goto out_unlock; } - cpu_clock_sample_group(timer->it_clock, p, &now); + cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); /* Leave the tasklist_lock locked for the call below. */ } @@ -1408,33 +1435,6 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } -/* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - struct task_cputime cputime; - - thread_group_cputimer(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - cpu->cpu = cputime_add(cputime.utime, cputime.stime); - break; - case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; - break; - case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); - break; - } - return 0; -} - /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. -- cgit v1.2.3-59-g8ed1b From fb5ae64fdde29236e1a15e0366946df7060f41f2 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Fri, 13 Feb 2009 14:04:21 +0000 Subject: User namespaces: Only put the userns when we unhash the uid uids in namespaces other than init don't get a sysfs entry. For those in the init namespace, while we're waiting to remove the sysfs entry for the uid the uid is still hashed, and alloc_uid() may re-grab that uid without getting a new reference to the user_ns, which we've already put in free_user before scheduling remove_user_sysfs_dir(). Reported-and-tested-by: KOSAKI Motohiro Signed-off-by: Serge E. Hallyn Acked-by: David Howells Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 477b6660f447..3551ac742395 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -72,6 +72,7 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); + put_user_ns(up->user_ns); } static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) @@ -334,7 +335,6 @@ static void free_user(struct user_struct *up, unsigned long flags) atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); - put_user_ns(up->user_ns); INIT_WORK(&up->work, remove_user_sysfs_dir); schedule_work(&up->work); } @@ -357,7 +357,6 @@ static void free_user(struct user_struct *up, unsigned long flags) sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); - put_user_ns(up->user_ns); kmem_cache_free(uid_cachep, up); } -- cgit v1.2.3-59-g8ed1b From 327ec5699c29454322d0136375f717f509c145b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Feb 2009 11:21:37 +0100 Subject: irq: clean up manage.c - make printk message git-greppable - fix a few style details Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1c5055069170..8f4bc61f0df9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -397,7 +397,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * allocate special interrupts that are part of the architecture. */ static int -__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) +__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **p; const char *old_name = NULL; @@ -687,11 +687,12 @@ int request_irq(unsigned int irq, irq_handler_t handler, * the behavior is classified as "will not fix" so we need to * start nudging drivers away from using that idiom. */ - if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) - == (IRQF_SHARED|IRQF_DISABLED)) - pr_warning("IRQ %d/%s: IRQF_DISABLED is not " - "guaranteed on shared IRQs\n", - irq, devname); + if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == + (IRQF_SHARED|IRQF_DISABLED)) { + pr_warning( + "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", + irq, devname); + } #ifdef CONFIG_LOCKDEP /* -- cgit v1.2.3-59-g8ed1b From ae88a23b32fa7e0dc9fa7ce735966e68eb41b0bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Feb 2009 11:29:50 +0100 Subject: irq: refactor and clean up the free_irq() code flow Impact: cleanup - separate out the loop from the actual freeing logic, this wins us two indentation levels allowing a number of followup prettifications - turn the WARN_ON() into a more informative WARN(). - clean up the comments and the code flow some more Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 101 ++++++++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8f4bc61f0df9..7a954b860c07 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -575,72 +575,79 @@ int setup_irq(unsigned int irq, struct irqaction *act) void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction **p; + struct irqaction *action, **p, **pp; unsigned long flags; - WARN_ON(in_interrupt()); + WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return; spin_lock_irqsave(&desc->lock, flags); + + /* + * There can be multiple actions per IRQ descriptor, find the right + * one based on the dev_id: + */ p = &desc->action; for (;;) { - struct irqaction *action = *p; + action = *p; + pp = p; + + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + spin_unlock_irqrestore(&desc->lock, flags); + + return; + } - if (action) { - struct irqaction **pp = p; + p = &action->next; + if (action->dev_id != dev_id) + continue; - p = &action->next; - if (action->dev_id != dev_id) - continue; + break; + } - /* Found it - now remove it from the list of entries */ - *pp = action->next; + /* Found it - now remove it from the list of entries: */ + *pp = action->next; - /* Currently used only by UML, might disappear one day.*/ + /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); + if (desc->chip->release) + desc->chip->release(irq, dev_id); #endif - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); - else - desc->chip->disable(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); - unregister_handler_proc(irq, action); + /* If this was the last handler, shut down the IRQ line: */ + if (!desc->action) { + desc->status |= IRQ_DISABLED; + if (desc->chip->shutdown) + desc->chip->shutdown(irq); + else + desc->chip->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); + + unregister_handler_proc(irq, action); + + /* Make sure it's not being used on another CPU: */ + synchronize_irq(irq); - /* Make sure it's not being used on another CPU */ - synchronize_irq(irq); -#ifdef CONFIG_DEBUG_SHIRQ - /* - * It's a shared IRQ -- the driver ought to be - * prepared for it to happen even now it's - * being freed, so let's make sure.... We do - * this after actually deregistering it, to - * make sure that a 'real' IRQ doesn't run in - * parallel with our fake - */ - if (action->flags & IRQF_SHARED) { - local_irq_save(flags); - action->handler(irq, dev_id); - local_irq_restore(flags); - } -#endif - kfree(action); - return; - } - printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); #ifdef CONFIG_DEBUG_SHIRQ - dump_stack(); -#endif - spin_unlock_irqrestore(&desc->lock, flags); - return; + /* + * It's a shared IRQ -- the driver ought to be prepared for an IRQ + * event to happen even now it's being freed, so let's make sure that + * is so by doing an extra call to the handler .... + * + * ( We do this after actually deregistering it, to make sure that a + * 'real' IRQ doesn't run in * parallel with our fake. ) + */ + if (action->flags & IRQF_SHARED) { + local_irq_save(flags); + action->handler(irq, dev_id); + local_irq_restore(flags); } +#endif + kfree(action); } EXPORT_SYMBOL(free_irq); -- cgit v1.2.3-59-g8ed1b From 391b170f10e669dd429aa47bce998c2fa839404c Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 6 Jan 2009 13:57:11 +0200 Subject: mmiotrace: count events lost due to not recording Impact: enhances lost events counting in mmiotrace The tracing framework, or the ring buffer facility it uses, has a switch to stop recording data. When recording is off, the trace events will be lost. The framework does not count these, so mmiotrace has to count them itself. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fffcb069f1dc..80e503ef6136 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" @@ -19,6 +20,7 @@ struct header_iter { static struct trace_array *mmio_trace_array; static bool overrun_detected; static unsigned long prev_overruns; +static atomic_t dropped_count; static void mmio_reset_data(struct trace_array *tr) { @@ -121,11 +123,11 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { - unsigned long cnt = 0; + unsigned long cnt = atomic_xchg(&dropped_count, 0); unsigned long over = ring_buffer_overruns(iter->tr->buffer); if (over > prev_overruns) - cnt = over - prev_overruns; + cnt += over - prev_overruns; prev_overruns = over; return cnt; } @@ -310,8 +312,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; @@ -338,8 +342,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; -- cgit v1.2.3-59-g8ed1b From 6bc5c366b1a45ca18fba6851f62db5743b3f6db5 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:23:51 +0200 Subject: trace: mmiotrace to the tracer menu in Kconfig Impact: cosmetic change in Kconfig menu layout This patch was originally suggested by Peter Zijlstra, but seems it was forgotten. CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable directly under the Kernel hacking / debugging menu in the kernel configuration system. They were present only for x86 and x86_64. Other tracers that use the ftrace tracing framework are in their own sub-menu. This patch moves the mmiotrace configuration options there. Since the Kconfig file, where the tracer menu is, is not architecture specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by x86/x86_64. CONFIG_MMIOTRACE now depends on it. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 24 ++---------------------- kernel/trace/Kconfig | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 10d6cc3fd052..e1983fa025d2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,28 +174,8 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE - bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PCI - select TRACING - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/tracers/mmiotrace.txt. - If you are not helping to develop drivers, say N. - -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. +config HAVE_MMIOTRACE_SUPPORT + def_bool y # # IO delay types: diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a6..58a93fbd68aa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -302,4 +302,27 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + endmenu -- cgit v1.2.3-59-g8ed1b From 5b058bcde961bf28678a70e44c079107313543b6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Feb 2009 18:35:34 +0100 Subject: tracing/function-graph-tracer: trace the idle tasks When the function graph tracer is activated, it iterates over the task_list to allocate a stack to store the return addresses. But the per cpu idle tasks are not iterated by using do_each_thread / while_each_thread. So we have to iterate on them manually. This fixes somes weirdness in the traces and many losses of traces. Examples on two cpus: 0) Xorg-4287 | 2.906 us | } 0) Xorg-4287 | 3.965 us | } 0) Xorg-4287 | 5.302 us | } ------------------------------------------ 0) Xorg-4287 => -0 ------------------------------------------ 0) -0 | 2.861 us | } 0) -0 | 0.526 us | set_normalized_timespec(); 0) -0 | 7.201 us | } 0) -0 | 8.214 us | } 0) -0 | | clockevents_program_event() { 0) -0 | | lapic_next_event() { 0) -0 | 0.510 us | native_apic_mem_write(); 0) -0 | 1.546 us | } 0) -0 | 2.583 us | } 0) -0 | + 12.435 us | } 0) -0 | + 13.470 us | } 0) -0 | 0.608 us | _spin_unlock_irqrestore(); 0) -0 | + 23.270 us | } 0) -0 | + 24.336 us | } 0) -0 | + 25.417 us | } 0) -0 | 0.593 us | _spin_unlock(); 0) -0 | + 41.869 us | } 0) -0 | + 42.906 us | } 0) -0 | + 95.035 us | } 0) -0 | 0.540 us | menu_reflect(); 0) -0 | ! 100.404 us | } 0) -0 | 0.564 us | mce_idle_callback(); 0) -0 | | enter_idle() { 0) -0 | 0.526 us | mce_idle_callback(); 0) -0 | 1.757 us | } 0) -0 | | cpuidle_idle_call() { 0) -0 | | menu_select() { 0) -0 | 0.525 us | pm_qos_requirement(); 0) -0 | 0.518 us | tick_nohz_get_sleep_length(); 0) -0 | 2.621 us | } [...] 1) -0 | 0.518 us | touch_softlockup_watchdog(); 1) -0 | + 14.355 us | } 1) -0 | + 22.840 us | } 1) -0 | + 25.949 us | } 1) -0 | | handle_irq() { 1) -0 | 0.511 us | irq_to_desc(); 1) -0 | | handle_edge_irq() { 1) -0 | 0.638 us | _spin_lock(); 1) -0 | | ack_apic_edge() { 1) -0 | 0.510 us | irq_to_desc(); 1) -0 | | move_native_irq() { 1) -0 | 0.510 us | irq_to_desc(); 1) -0 | 1.532 us | } 1) -0 | 0.511 us | native_apic_mem_write(); ------------------------------------------ 1) -0 => cat-5073 ------------------------------------------ 1) cat-5073 | 3.731 us | } 1) cat-5073 | | run_local_timers() { 1) cat-5073 | 0.533 us | hrtimer_run_queues(); 1) cat-5073 | | raise_softirq() { 1) cat-5073 | | __raise_softirq_irqoff() { 1) cat-5073 | | /* nr: 1 */ 1) cat-5073 | 2.718 us | } 1) cat-5073 | 3.814 us | } Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9a236ffe2aa4..fdf913dfc7e8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2033,7 +2033,7 @@ free: static int start_graph_tracing(void) { struct ftrace_ret_stack **ret_stack_list; - int ret; + int ret, cpu; ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * sizeof(struct ftrace_ret_stack *), @@ -2042,6 +2042,10 @@ static int start_graph_tracing(void) if (!ret_stack_list) return -ENOMEM; + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) + ftrace_graph_init_task(idle_task(cpu)); + do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); -- cgit v1.2.3-59-g8ed1b From 8316e38100c70cd1443ac90074eccdd033aa218d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 20:28:29 +0100 Subject: irq: further clean up the free_irq() code flow Linus noticed that the 'pp' variable can be eliminated altogether, and the loop can be cleaned up further. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7a954b860c07..de5a765e88ab 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -575,7 +575,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action, **p, **pp; + struct irqaction *action, **p; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -592,7 +592,6 @@ void free_irq(unsigned int irq, void *dev_id) p = &desc->action; for (;;) { action = *p; - pp = p; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); @@ -601,15 +600,13 @@ void free_irq(unsigned int irq, void *dev_id) return; } + if (action->dev_id == dev_id) + break; p = &action->next; - if (action->dev_id != dev_id) - continue; - - break; } /* Found it - now remove it from the list of entries: */ - *pp = action->next; + *p = action->next; /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD -- cgit v1.2.3-59-g8ed1b From f17c75453b2d195eba0a90d9f16a3ba88c85b3b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 20:43:37 +0100 Subject: irq: name 'p' variables a bit better 'p' stands for pointer - make it clear in setup_irq() and free_irq() what kind of pointer it is. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index de5a765e88ab..c589305210d7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -399,7 +399,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { - struct irqaction *old, **p; + struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; int shared = 0; @@ -431,8 +431,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * The following block of code has to be executed atomically */ spin_lock_irqsave(&desc->lock, flags); - p = &desc->action; - old = *p; + old_ptr = &desc->action; + old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are @@ -455,8 +455,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { - p = &old->next; - old = *p; + old_ptr = &old->next; + old = *old_ptr; } while (old); shared = 1; } @@ -507,7 +507,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) (int)(new->flags & IRQF_TRIGGER_MASK)); } - *p = new; + *old_ptr = new; /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; @@ -575,7 +575,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action, **p; + struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -589,9 +589,9 @@ void free_irq(unsigned int irq, void *dev_id) * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ - p = &desc->action; + action_ptr = &desc->action; for (;;) { - action = *p; + action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); @@ -602,11 +602,11 @@ void free_irq(unsigned int irq, void *dev_id) if (action->dev_id == dev_id) break; - p = &action->next; + action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ - *p = action->next; + *action_ptr = action->next; /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD -- cgit v1.2.3-59-g8ed1b From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Feb 2009 10:25:40 +0100 Subject: block: fix bad definition of BIO_RW_SYNC We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before 213d9417fec62ef4c3675621b9364a667954d4dd. Signed-off-by: Jens Axboe --- block/blktrace.c | 2 +- drivers/md/dm-io.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/md.c | 4 ++-- include/linux/bio.h | 2 -- include/linux/blktrace_api.h | 1 + include/linux/fs.h | 6 +++--- kernel/power/swap.c | 5 +++-- mm/page_io.c | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 39cc3bfe56e4..7cf9d1ff45a0 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -142,7 +142,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= ddir_act[rw & WRITE]; what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, SYNCIO); what |= MASK_TC_BIT(rw, AHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index a34338567a2a..f14813be4eff 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions, struct dpages old_pages = *dp; if (sync) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); /* * For multiple regions we need to be careful to rewind diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3073618269ea..0a225da21272 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNC), + .bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG), .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, diff --git a/drivers/md/md.c b/drivers/md/md.c index 4495104f6c9f..03b4cd0a6344 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<bi_bdev = rdev->bdev; bio->bi_sector = sector; @@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct completion event; int ret; - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); bio->bi_bdev = bdev; bio->bi_sector = sector; diff --git a/include/linux/bio.h b/include/linux/bio.h index 2aa283ab062b..1b16108a5417 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -171,8 +171,6 @@ struct bio { #define BIO_RW_FAILFAST_TRANSPORT 8 #define BIO_RW_FAILFAST_DRIVER 9 -#define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) - #define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 25379cba2370..6e915878e88c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -15,6 +15,7 @@ enum blktrace_cat { BLK_TC_WRITE = 1 << 1, /* writes */ BLK_TC_BARRIER = 1 << 2, /* barrier */ BLK_TC_SYNC = 1 << 3, /* sync IO */ + BLK_TC_SYNCIO = BLK_TC_SYNC, BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ BLK_TC_REQUEUE = 1 << 5, /* requeueing */ BLK_TC_ISSUE = 1 << 6, /* issue */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 6022f44043f2..67857dc16365 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -87,10 +87,10 @@ struct inodes_stat_t { #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ -#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) +#define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) -#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) -#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) +#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 6da14358537c..505f319e489c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -60,6 +60,7 @@ static struct block_device *resume_bdev; static int submit(int rw, pgoff_t page_off, struct page *page, struct bio **bio_chain) { + const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); @@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, bio_get(bio); if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + submit_bio(bio_rw, bio); wait_on_page_locked(page); if (rw == READ) bio_set_pages_dirty(bio); @@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, get_page(page); /* These pages are freed later */ bio->bi_private = *bio_chain; *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + submit_bio(bio_rw, bio); } return 0; } diff --git a/mm/page_io.c b/mm/page_io.c index dc6ce0afbded..3023c475e041 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3-59-g8ed1b From 67e055d144c5b2acdc1c63811fde031263bf92c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 18 Feb 2009 14:48:20 -0800 Subject: cgroups: fix possible use after free In cgroup_kill_sb(), root is freed before sb is detached from the list, so another sget() may find this sb and call cgroup_test_super(), which will access the root that has been freed. Reported-by: Al Viro Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e14db9c089b9..9edb5c4b79b4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1122,8 +1122,8 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); - kfree(root); kill_litter_super(sb); + kfree(root); } static struct file_system_type cgroup_fs_type = { -- cgit v1.2.3-59-g8ed1b From 42f5e039c3f6512271636928ddc4e7f7a0371672 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 18 Feb 2009 14:48:21 -0800 Subject: pm: fix build for CONFIG_PM unset Compilation of kprobes.c with CONFIG_PM unset is broken due to some broken config dependncies. Fix that. Signed-off-by: Rafael J. Wysocki Reported-by: Ingo Molnar Tested-by: Masami Hiramatsu Cc: Len Brown Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/power/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 170a9213c1b6..e4791b3ba55d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ +obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o diff --git a/kernel/power/Makefile b/kernel/power/Makefile index d7a10167a25b..720ea4f781bd 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,7 +3,7 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -obj-y := main.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o -- cgit v1.2.3-59-g8ed1b From 0c5119c1e655e0719a69601b1049acdd5ec1c125 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Feb 2009 18:33:57 -0500 Subject: tracing: disable tracing while testing ring buffer Impact: fix to prevent hard lockup on self tests If one of the tracers are broken and is constantly filling the ring buffer while the test of the ring buffer is running, it will hang the box. The reason is that the test is a consumer that will not stop till the ring buffer is empty. But if the tracer is broken and is constantly producing input to the buffer, this test will never end. The result is a lockup of the box. This happened when KALLSYMS was not defined and the dynamic ftrace test constantly filled the ring buffer, because the filter failed and all functions were being traced. Something was being called that constantly filled the buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 88c8eb70f54a..a7e0ef662f9f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -57,11 +57,20 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) cnt = ring_buffer_entries(tr->buffer); + /* + * The trace_test_buffer_cpu runs a while loop to consume all data. + * If the calling tracer is broken, and is constantly filling + * the buffer, this will run forever, and hard lock the box. + * We disable the ring buffer while we do this test to prevent + * a hard lock up. + */ + tracing_off(); for_each_possible_cpu(cpu) { ret = trace_test_buffer_cpu(tr, cpu); if (ret) break; } + tracing_on(); __raw_spin_unlock(&ftrace_max_lock); local_irq_restore(flags); -- cgit v1.2.3-59-g8ed1b From 4d7a077c0c7bfdba04cf0aa0b79053cf4ebaacf8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Feb 2009 22:06:18 -0500 Subject: tracing: have function trace select kallsyms Impact: fix output of function tracer to be useful The function tracer is pretty useless if KALLSYMS is not configured. Unless you are good at reading hex values, the function tracer should select the KALLSYMS configuration. Also, the dynamic function tracer will fail its self test if KALLSYMS is not selected. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 58a93fbd68aa..34e707e5ab87 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -52,6 +52,7 @@ config FUNCTION_TRACER depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL select FRAME_POINTER + select KALLSYMS select TRACING select CONTEXT_SWITCH_TRACER help @@ -238,6 +239,7 @@ config STACK_TRACER depends on DEBUG_KERNEL select FUNCTION_TRACER select STACKTRACE + select KALLSYMS help This special tracer records the maximum stack footprint of the kernel and displays it in debugfs/tracing/stack_trace. -- cgit v1.2.3-59-g8ed1b From 4b3e3d228429c75d398f1aa24532e468d3220c49 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Feb 2009 22:50:01 -0500 Subject: tracing: limit the number of loops the ring buffer self test can make Impact: prevent deadlock if ring buffer gets corrupted This patch adds a paranoid check to make sure the ring buffer consumer does not go into an infinite loop. Since the ring buffer has been set to read only, the consumer should not loop for more than the ring buffer size. A check is added to make sure the consumer does not loop more than the ring buffer size. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7e0ef662f9f..bc8e80a86bca 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -23,10 +23,20 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; + unsigned int loops = 0; while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { entry = ring_buffer_event_data(event); + /* + * The ring buffer is a size of trace_buf_size, if + * we loop more than the size, there's something wrong + * with the ring buffer. + */ + if (loops++ > trace_buf_size) { + printk(KERN_CONT ".. bad ring buffer "); + goto failed; + } if (!trace_valid_entry(entry)) { printk(KERN_CONT ".. invalid entry %d ", entry->type); -- cgit v1.2.3-59-g8ed1b From eed3ee08292d821282169708e5e8e89a0d0a0c63 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 14 Feb 2009 02:00:19 +0100 Subject: PM/resume: wait for device probing to finish the resume code does not currently wait for device probing to finish. Even without async function calls this is dicey and not correct, but with async function calls during the boot sequence this is going to get hit more... This patch adds the synchronization using the newly introduced helper. Signed-off-by: Arjan van de Ven Signed-off-by: Rafael J. Wysocki Cc: Len Brown Acked-by: Greg KH Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 432ee575c9ee..7b40e94b1d42 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -594,6 +594,12 @@ static int software_resume(void) int error; unsigned int flags; + /* + * If the user said "noresume".. bail out early. + */ + if (noresume) + return 0; + /* * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate @@ -610,6 +616,11 @@ static int software_resume(void) mutex_unlock(&pm_mutex); return -ENOENT; } + /* + * Some device discovery might still be in progress; we need + * to wait for this to finish. + */ + wait_for_device_probe(); swsusp_resume_device = name_to_dev_t(resume_file); pr_debug("PM: Resume from partition %s\n", resume_file); } else { -- cgit v1.2.3-59-g8ed1b From 09664fda48c5dd63277f1f42888ca9d5dca6037a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 14 Feb 2009 02:02:16 +0100 Subject: PM: fix build for CONFIG_PM unset Compilation of kprobes.c with CONFIG_PM unset is broken due to some broken config dependncies. Fix that. Signed-off-by: Rafael J. Wysocki Cc: Len Brown Cc: Greg KH Reported-by: Ingo Molnar Tested-by: Masami Hiramatsu Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/power/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 170a9213c1b6..e4791b3ba55d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ +obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o diff --git a/kernel/power/Makefile b/kernel/power/Makefile index d7a10167a25b..720ea4f781bd 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,7 +3,7 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -obj-y := main.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o -- cgit v1.2.3-59-g8ed1b From ebae2604f2c3693717d9dc687c84578f0526480c Mon Sep 17 00:00:00 2001 From: Andrey Borzenkov Date: Sat, 14 Feb 2009 02:05:14 +0100 Subject: PM: Fix pm_notifiers during user mode hibernation Snapshot device is opened with O_RDONLY during suspend and O_WRONLY durig resume. Make sure we also call notifiers with correct parameter telling them what we are really doing. Signed-off-by: Andrey Borzenkov Signed-off-by: Rafael J. Wysocki Cc: Len Brown Cc: Greg KH Acked-by: Pavel Machek Signed-off-by: Linus Torvalds --- kernel/power/user.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 005b93d839ba..6c85359364f2 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -95,15 +95,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + pm_notifier_call_chain(PM_POST_HIBERNATION); } else { data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + pm_notifier_call_chain(PM_POST_RESTORE); } if (error) atomic_inc(&snapshot_device_available); -- cgit v1.2.3-59-g8ed1b From b090f9fa53d51c8a33370071de9e391919ee1fa7 Mon Sep 17 00:00:00 2001 From: Arve Hjønnevåg Date: Sat, 14 Feb 2009 02:06:17 +0100 Subject: PM: Wait for console in resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoids later waking up to a blinking cursor if the device woke up and returned to sleep before the console switch happened. Signed-off-by: Brian Swetland Signed-off-by: Arve Hjønnevåg Signed-off-by: Rafael J. Wysocki Cc: Len Brown Cc: Greg KH Signed-off-by: Linus Torvalds --- kernel/power/console.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index b8628be2a465..a3961b205de7 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -78,6 +78,12 @@ void pm_restore_console(void) } set_console(orig_fgconsole); release_console_sem(); + + if (vt_waitactive(orig_fgconsole)) { + pr_debug("Resume: Can't switch VCs."); + return; + } + kmsg_redirect = orig_kmsg; } #endif -- cgit v1.2.3-59-g8ed1b From 403f307576396f3362fbb65af190885b6036c72c Mon Sep 17 00:00:00 2001 From: Arve Hjønnevåg Date: Sat, 14 Feb 2009 02:07:24 +0100 Subject: PM: Fix suspend_console and resume_console to use only one semaphore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a race where a thread acquires the console while the console is suspended, and the console is resumed before this thread releases it. In this case, the secondary console semaphore would be left locked, and the primary semaphore would be released twice. This in turn would cause the console switch on suspend or resume to hang forever. Note that suspend_console does not actually lock the console for clients that use acquire_console_sem, it only locks it for clients that use try_acquire_console_sem. If we change suspend_console to fully lock the console, then the kernel may deadlock on suspend. One client of try_acquire_console_sem is acquire_console_semaphore_for_printk, which uses it to prevent printk from using the console while it is suspended. Signed-off-by: Arve Hjønnevåg Signed-off-by: Rafael J. Wysocki Cc: Len Brown Cc: Greg KH Signed-off-by: Linus Torvalds --- kernel/printk.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 69188f226a93..e3602d0755b0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -73,7 +73,6 @@ EXPORT_SYMBOL(oops_in_progress); * driver system. */ static DECLARE_MUTEX(console_sem); -static DECLARE_MUTEX(secondary_console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -891,12 +890,14 @@ void suspend_console(void) printk("Suspending console(s) (use no_console_suspend to debug)\n"); acquire_console_sem(); console_suspended = 1; + up(&console_sem); } void resume_console(void) { if (!console_suspend_enabled) return; + down(&console_sem); console_suspended = 0; release_console_sem(); } @@ -912,11 +913,9 @@ void resume_console(void) void acquire_console_sem(void) { BUG_ON(in_interrupt()); - if (console_suspended) { - down(&secondary_console_sem); - return; - } down(&console_sem); + if (console_suspended) + return; console_locked = 1; console_may_schedule = 1; } @@ -926,6 +925,10 @@ int try_acquire_console_sem(void) { if (down_trylock(&console_sem)) return -1; + if (console_suspended) { + up(&console_sem); + return -1; + } console_locked = 1; console_may_schedule = 0; return 0; @@ -979,7 +982,7 @@ void release_console_sem(void) unsigned wake_klogd = 0; if (console_suspended) { - up(&secondary_console_sem); + up(&console_sem); return; } -- cgit v1.2.3-59-g8ed1b From 770824bdc421ff58a64db608294323571c949f4c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 22 Feb 2009 18:38:50 +0100 Subject: PM: Split up sysdev_[suspend|resume] from device_power_[down|up] Move the sysdev_suspend/resume from the callee to the callers, with no real change in semantics, so that we can rework the disabling of interrupts during suspend/hibernation. This is based on an earlier patch from Linus. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- arch/x86/kernel/apm_32.c | 4 ++++ drivers/base/base.h | 2 -- drivers/base/power/main.c | 3 --- drivers/xen/manage.c | 8 ++++++++ include/linux/pm.h | 2 ++ kernel/kexec.c | 7 +++++++ kernel/power/disk.c | 11 +++++++++++ kernel/power/main.c | 8 ++++++-- 8 files changed, 38 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 98807bb095ad..266ec6c18b6c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1192,6 +1192,7 @@ static int suspend(int vetoable) device_suspend(PMSG_SUSPEND); local_irq_disable(); device_power_down(PMSG_SUSPEND); + sysdev_suspend(PMSG_SUSPEND); local_irq_enable(); @@ -1208,6 +1209,7 @@ static int suspend(int vetoable) if (err != APM_SUCCESS) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; + sysdev_resume(); device_power_up(PMSG_RESUME); local_irq_enable(); device_resume(PMSG_RESUME); @@ -1228,6 +1230,7 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); + sysdev_suspend(PMSG_SUSPEND); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1235,6 +1238,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); + sysdev_resume(); device_power_up(PMSG_RESUME); local_irq_enable(); } diff --git a/drivers/base/base.h b/drivers/base/base.h index 0a5f055dffba..9f50f1b545dc 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -88,8 +88,6 @@ extern void driver_detach(struct device_driver *drv); extern int driver_probe_device(struct device_driver *drv, struct device *dev); extern void sysdev_shutdown(void); -extern int sysdev_suspend(pm_message_t state); -extern int sysdev_resume(void); extern char *make_class_name(const char *name, struct kobject *kobj); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 670c9d6c1407..2d14f4ae6c01 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -333,7 +333,6 @@ static void dpm_power_up(pm_message_t state) */ void device_power_up(pm_message_t state) { - sysdev_resume(); dpm_power_up(state); } EXPORT_SYMBOL_GPL(device_power_up); @@ -577,8 +576,6 @@ int device_power_down(pm_message_t state) } dev->power.status = DPM_OFF_IRQ; } - if (!error) - error = sysdev_suspend(state); if (error) dpm_power_up(resume_event(state)); return error; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 9b91617b9582..56892a142ee2 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -45,6 +45,13 @@ static int xen_suspend(void *data) err); return err; } + err = sysdev_suspend(PMSG_SUSPEND); + if (err) { + printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", + err); + device_power_up(PMSG_RESUME); + return err; + } xen_mm_pin_all(); gnttab_suspend(); @@ -61,6 +68,7 @@ static int xen_suspend(void *data) gnttab_resume(); xen_mm_unpin_all(); + sysdev_resume(); device_power_up(PMSG_RESUME); if (!*cancelled) { diff --git a/include/linux/pm.h b/include/linux/pm.h index de2e0a8f6728..24ba5f67b3a3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -381,10 +381,12 @@ struct dev_pm_info { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); +extern int sysdev_resume(void); extern void device_power_up(pm_message_t state); extern void device_resume(pm_message_t state); extern void device_pm_unlock(void); +extern int sysdev_suspend(pm_message_t state); extern int device_power_down(pm_message_t state); extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/kernel/kexec.c b/kernel/kexec.c index 8a6d7b08864e..483899578259 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1465,6 +1465,11 @@ int kernel_kexec(void) error = device_power_down(PMSG_FREEZE); if (error) goto Enable_irqs; + + /* Suspend system devices */ + error = sysdev_suspend(PMSG_FREEZE); + if (error) + goto Power_up_devices; } else #endif { @@ -1477,6 +1482,8 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + sysdev_resume(); + Power_up_devices: device_power_up(PMSG_RESTORE); Enable_irqs: local_irq_enable(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 7b40e94b1d42..4a4a206b1979 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -227,6 +227,12 @@ static int create_image(int platform_mode) "aborting hibernation\n"); goto Enable_irqs; } + sysdev_suspend(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting hibernation\n"); + goto Power_up_devices; + } if (hibernation_test(TEST_CORE)) goto Power_up; @@ -242,9 +248,11 @@ static int create_image(int platform_mode) if (!in_suspend) platform_leave(platform_mode); Power_up: + sysdev_resume(); /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ + Power_up_devices: device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: @@ -335,6 +343,7 @@ static int resume_target_kernel(void) "aborting resume\n"); goto Enable_irqs; } + sysdev_suspend(PMSG_QUIESCE); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = restore_highmem(); @@ -357,6 +366,7 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); + sysdev_resume(); device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); @@ -440,6 +450,7 @@ int hibernation_platform_enter(void) local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { + sysdev_suspend(PMSG_HIBERNATE); hibernation_ops->enter(); /* We should never get here */ while (1); diff --git a/kernel/power/main.c b/kernel/power/main.c index b4d219016b6c..c9632f841f64 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -298,8 +298,12 @@ static int suspend_enter(suspend_state_t state) goto Done; } - if (!suspend_test(TEST_CORE)) - error = suspend_ops->enter(state); + error = sysdev_suspend(PMSG_SUSPEND); + if (!error) { + if (!suspend_test(TEST_CORE)) + error = suspend_ops->enter(state); + sysdev_resume(); + } device_power_up(PMSG_RESUME); Done: -- cgit v1.2.3-59-g8ed1b From a682604838763981613e42015cd0e39f2989d6bb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Feb 2009 18:03:42 -0800 Subject: rcu: Teach RCU that idle task is not quiscent state at boot This patch fixes a bug located by Vegard Nossum with the aid of kmemcheck, updated based on review comments from Nick Piggin, Ingo Molnar, and Andrew Morton. And cleans up the variable-name and function-name language. ;-) The boot CPU runs in the context of its idle thread during boot-up. During this time, idle_cpu(0) will always return nonzero, which will fool Classic and Hierarchical RCU into deciding that a large chunk of the boot-up sequence is a big long quiescent state. This in turn causes RCU to prematurely end grace periods during this time. This patch changes the rcutree.c and rcuclassic.c rcu_check_callbacks() function to ignore the idle task as a quiescent state until the system has started up the scheduler in rest_init(), introducing a new non-API function rcu_idle_now_means_idle() to inform RCU of this transition. RCU maintains an internal rcu_idle_cpu_truthful variable to track this state, which is then used by rcu_check_callback() to determine if it should believe idle_cpu(). Because this patch has the effect of disallowing RCU grace periods during long stretches of the boot-up sequence, this patch also introduces Josh Triplett's UP-only optimization that makes synchronize_rcu() be a no-op if num_online_cpus() returns 1. This allows boot-time code that calls synchronize_rcu() to proceed normally. Note, however, that RCU callbacks registered by call_rcu() will likely queue up until later in the boot sequence. Although rcuclassic and rcutree can also use this same optimization after boot completes, rcupreempt must restrict its use of this optimization to the portion of the boot sequence before the scheduler starts up, given that an rcupreempt RCU read-side critical section may be preeempted. In addition, this patch takes Nick Piggin's suggestion to make the system_state global variable be __read_mostly. Changes since v4: o Changes the name of the introduced function and variable to be less emotional. ;-) Changes since v3: o WARN_ON(nr_context_switches() > 0) to verify that RCU switches out of boot-time mode before the first context switch, as suggested by Nick Piggin. Changes since v2: o Created rcu_blocking_is_gp() internal-to-RCU API that determines whether a call to synchronize_rcu() is itself a grace period. o The definition of rcu_blocking_is_gp() for rcuclassic and rcutree checks to see if but a single CPU is online. o The definition of rcu_blocking_is_gp() for rcupreempt checks to see both if but a single CPU is online and if the system is still in early boot. This allows rcupreempt to again work correctly if running on a single CPU after booting is complete. o Added check to rcupreempt's synchronize_sched() for there being but one online CPU. Tested all three variants both SMP and !SMP, booted fine, passed a short rcutorture test on both x86 and Power. Located-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 6 ++++++ include/linux/rcupdate.h | 4 ++++ include/linux/rcupreempt.h | 15 +++++++++++++++ include/linux/rcutree.h | 6 ++++++ init/main.c | 3 ++- kernel/rcuclassic.c | 4 ++-- kernel/rcupdate.c | 12 ++++++++++++ kernel/rcupreempt.c | 3 +++ kernel/rcutree.c | 4 ++-- 9 files changed, 52 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index f3f697df1d71..80044a4f3ab9 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void); #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) +/* A context switch is a grace period for rcuclassic. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 921340a7b71c..528343e6da51 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,6 +52,9 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +/* Internal to kernel, but needed by rcupreempt.h. */ +extern int rcu_scheduler_active; + #if defined(CONFIG_CLASSIC_RCU) #include #elif defined(CONFIG_TREE_RCU) @@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 3e05c09b54a2..74304b4538d8 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ +/* + * A context switch is a grace period for rcupreempt synchronize_rcu() + * only during early boot, before the scheduler has been initialized. + * So, how the heck do we get a context switch? Well, if the caller + * invokes synchronize_rcu(), they are willing to accept a context + * switch, so we simply pretend that one happened. + * + * After boot, there might be a blocked or preempted task in an RCU + * read-side critical section, so we cannot then take the fastpath. + */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1 && !rcu_scheduler_active; +} + #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d4368b7975c3..a722fb67bb2d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void) } #endif /* CONFIG_NO_HZ */ +/* A context switch is a grace period for rcutree. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUTREE_H */ diff --git a/init/main.c b/init/main.c index 844209453c02..83697e160b3a 100644 --- a/init/main.c +++ b/init/main.c @@ -97,7 +97,7 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif -enum system_states system_state; +enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); /* @@ -463,6 +463,7 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); + rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index bd5a9003497c..654c640a6b9c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d92a76a881aa..cae8a059cf47 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,6 +44,7 @@ #include #include #include +#include enum rcu_barrier { RCU_BARRIER_STD, @@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; +int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -175,3 +181,9 @@ void __init rcu_init(void) __rcu_init(); } +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 33cfc50781f9..5d59e850fb71 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1181,6 +1181,9 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; + if (num_online_cpus() == 1) + return; /* blocking is gp if only one CPU! */ + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fd602a6f6f..97ce31579ec0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user -- cgit v1.2.3-59-g8ed1b From cac64d00c256e65776d575e82aaf540632b66178 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 25 Feb 2009 09:59:26 -0800 Subject: sched_rt: don't start timer when rt bandwidth disabled Impact: fix incorrect condition check No need to start rt bandwidth timer when rt bandwidth is disabled. If this timer starts, it may stop at sched_rt_period_timer() on the first time. Signed-off-by: Hiroshi Shimamoto Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..c3baa9653d1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) -- cgit v1.2.3-59-g8ed1b From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 27 Feb 2009 15:13:54 +0530 Subject: sched: don't allow setuid to succeed if the user does not have rt bandwidth Impact: fix hung task with certain (non-default) rt-limit settings Corey Hickey reported that on using setuid to change the uid of a rt process, the process would be unkillable and not be running. This is because there was no rt runtime for that user group. Add in a check to see if a user can attach an rt task to its task group. On failure, return EINVAL, which is also returned in CONFIG_CGROUP_SCHED. Reported-by: Corey Hickey Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 13 +++++++++++-- kernel/sys.c | 31 ++++++++++++++++++++----------- kernel/user.c | 18 ++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..8c216e057c94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif +extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/kernel/sched.c b/kernel/sched.c index c3baa9653d1d..8e2558c2ba67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void) return ret; } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ diff --git a/kernel/sys.c b/kernel/sys.c index f145c415bc16..37f458e6882a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -559,7 +559,7 @@ error: abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,6 +571,11 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - retval = -EAGAIN; - if (new->uid != old->uid && set_user(new) < 0) - goto error; - + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid && set_user(new) < 0) { - retval = -EAGAIN; - goto error; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } - retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid && set_user(new) < 0) - goto error; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } } if (euid != (uid_t) -1) new->euid = euid; diff --git a/kernel/user.c b/kernel/user.c index 3551ac742395..6a9b696128c8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + + return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + return 1; +} +#endif + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). -- cgit v1.2.3-59-g8ed1b From 5170836679185357dc1b7660bad13287b39e1e33 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 Feb 2009 14:03:03 -0800 Subject: Fix recursive lock in free_uid()/free_user_ns() free_uid() and free_user_ns() are corecursive when CONFIG_USER_SCHED=n, but free_user_ns() is called from free_uid() by way of uid_hash_remove(), which requires uidhash_lock to be held. free_user_ns() then calls free_uid() to complete the destruction. Fix this by deferring the destruction of the user_namespace. Signed-off-by: David Howells Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/user_namespace.h | 1 + kernel/user_namespace.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 315bcd375224..cc4f45361dbb 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -13,6 +13,7 @@ struct user_namespace { struct kref kref; struct hlist_head uidhash_table[UIDHASH_SZ]; struct user_struct *creator; + struct work_struct destroyer; }; extern struct user_namespace init_user_ns; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 79084311ee57..076c7c8215b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -60,12 +60,25 @@ int create_user_ns(struct cred *new) return 0; } -void free_user_ns(struct kref *kref) +/* + * Deferred destructor for a user namespace. This is required because + * free_user_ns() may be called with uidhash_lock held, but we need to call + * back to free_uid() which will want to take the lock again. + */ +static void free_user_ns_work(struct work_struct *work) { - struct user_namespace *ns; - - ns = container_of(kref, struct user_namespace, kref); + struct user_namespace *ns = + container_of(work, struct user_namespace, destroyer); free_uid(ns->creator); kfree(ns); } + +void free_user_ns(struct kref *kref) +{ + struct user_namespace *ns = + container_of(kref, struct user_namespace, kref); + + INIT_WORK(&ns->destroyer, free_user_ns_work); + schedule_work(&ns->destroyer); +} EXPORT_SYMBOL(free_user_ns); -- cgit v1.2.3-59-g8ed1b From 044d408409cc4e1bc75c886e27ca85c270db104c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Mar 2009 16:13:32 +0100 Subject: genirq: assert that irq handlers are indeed running in hardirq context Make sure the genirq layer handlers are indeed running handlers in hardirq context. That is the genirq expectation and doing anything else is broken. Signed-off-by: Peter Zijlstra Cc: Andrew Morton LKML-Reference: <1236006812.5330.632.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3aba8d12f328..a2ee682bca2e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -328,6 +328,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; + WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); + if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); -- cgit v1.2.3-59-g8ed1b From 5b1017404aea6d2e552e991b3fd814d839e9cd67 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 27 Feb 2009 23:25:54 -0800 Subject: x86-64: seccomp: fix 32/64 syscall hole On x86-64, a 32-bit process (TIF_IA32) can switch to 64-bit mode with ljmp, and then use the "syscall" instruction to make a 64-bit system call. A 64-bit process make a 32-bit system call with int $0x80. In both these cases under CONFIG_SECCOMP=y, secure_computing() will use the wrong system call number table. The fix is simple: test TS_COMPAT instead of TIF_IA32. Here is an example exploit: /* test case for seccomp circumvention on x86-64 There are two failure modes: compile with -m64 or compile with -m32. The -m64 case is the worst one, because it does "chmod 777 ." (could be any chmod call). The -m32 case demonstrates it was able to do stat(), which can glean information but not harm anything directly. A buggy kernel will let the test do something, print, and exit 1; a fixed kernel will make it exit with SIGKILL before it does anything. */ #define _GNU_SOURCE #include #include #include #include #include #include #include int main (int argc, char **argv) { char buf[100]; static const char dot[] = "."; long ret; unsigned st[24]; if (prctl (PR_SET_SECCOMP, 1, 0, 0, 0) != 0) perror ("prctl(PR_SET_SECCOMP) -- not compiled into kernel?"); #ifdef __x86_64__ assert ((uintptr_t) dot < (1UL << 32)); asm ("int $0x80 # %0 <- %1(%2 %3)" : "=a" (ret) : "0" (15), "b" (dot), "c" (0777)); ret = snprintf (buf, sizeof buf, "result %ld (check mode on .!)\n", ret); #elif defined __i386__ asm (".code32\n" "pushl %%cs\n" "pushl $2f\n" "ljmpl $0x33, $1f\n" ".code64\n" "1: syscall # %0 <- %1(%2 %3)\n" "lretl\n" ".code32\n" "2:" : "=a" (ret) : "0" (4), "D" (dot), "S" (&st)); if (ret == 0) ret = snprintf (buf, sizeof buf, "stat . -> st_uid=%u\n", st[7]); else ret = snprintf (buf, sizeof buf, "result %ld\n", ret); #else # error "not this one" #endif write (1, buf, ret); syscall (__NR_exit, 1); return 2; } Signed-off-by: Roland McGrath [ I don't know if anybody actually uses seccomp, but it's enabled in at least both Fedora and SuSE kernels, so maybe somebody is. - Linus ] Signed-off-by: Linus Torvalds --- arch/mips/include/asm/seccomp.h | 1 - arch/powerpc/include/asm/compat.h | 5 +++++ arch/powerpc/include/asm/seccomp.h | 4 ---- arch/sparc/include/asm/compat.h | 5 +++++ arch/sparc/include/asm/seccomp.h | 6 ------ arch/x86/include/asm/seccomp_32.h | 6 ------ arch/x86/include/asm/seccomp_64.h | 8 -------- kernel/seccomp.c | 7 ++++--- 8 files changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h index 36ed44070256..a6772e9507f5 100644 --- a/arch/mips/include/asm/seccomp.h +++ b/arch/mips/include/asm/seccomp.h @@ -1,6 +1,5 @@ #ifndef __ASM_SECCOMP_H -#include #include #define __NR_seccomp_read __NR_read diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index d811a8cd7b58..4774c2f92232 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -210,5 +210,10 @@ struct compat_shmid64_ds { compat_ulong_t __unused6; }; +static inline int is_compat_task(void) +{ + return test_thread_flag(TIF_32BIT); +} + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_COMPAT_H */ diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h index 853765eb1f65..00c1d9133cfe 100644 --- a/arch/powerpc/include/asm/seccomp.h +++ b/arch/powerpc/include/asm/seccomp.h @@ -1,10 +1,6 @@ #ifndef _ASM_POWERPC_SECCOMP_H #define _ASM_POWERPC_SECCOMP_H -#ifdef __KERNEL__ -#include -#endif - #include #define __NR_seccomp_read __NR_read diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index f260b58f5ce9..0e706257918f 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -240,4 +240,9 @@ struct compat_shmid64_ds { unsigned int __unused2; }; +static inline int is_compat_task(void) +{ + return test_thread_flag(TIF_32BIT); +} + #endif /* _ASM_SPARC64_COMPAT_H */ diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h index 7fcd9968192b..adca1bce41d4 100644 --- a/arch/sparc/include/asm/seccomp.h +++ b/arch/sparc/include/asm/seccomp.h @@ -1,11 +1,5 @@ #ifndef _ASM_SECCOMP_H -#include /* already defines TIF_32BIT */ - -#ifndef TIF_32BIT -#error "unexpected TIF_32BIT on sparc64" -#endif - #include #define __NR_seccomp_read __NR_read diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h index a6ad87b352c4..b811d6f5780c 100644 --- a/arch/x86/include/asm/seccomp_32.h +++ b/arch/x86/include/asm/seccomp_32.h @@ -1,12 +1,6 @@ #ifndef _ASM_X86_SECCOMP_32_H #define _ASM_X86_SECCOMP_32_H -#include - -#ifdef TIF_32BIT -#error "unexpected TIF_32BIT on i386" -#endif - #include #define __NR_seccomp_read __NR_read diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h index 4171bb794e9e..84ec1bd161a5 100644 --- a/arch/x86/include/asm/seccomp_64.h +++ b/arch/x86/include/asm/seccomp_64.h @@ -1,14 +1,6 @@ #ifndef _ASM_X86_SECCOMP_64_H #define _ASM_X86_SECCOMP_64_H -#include - -#ifdef TIF_32BIT -#error "unexpected TIF_32BIT on x86_64" -#else -#define TIF_32BIT TIF_IA32 -#endif - #include #include diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ad64fcb731f2..57d4b13b631d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -8,6 +8,7 @@ #include #include +#include /* #define SECCOMP_DEBUG 1 */ #define NR_SECCOMP_MODES 1 @@ -22,7 +23,7 @@ static int mode1_syscalls[] = { 0, /* null terminated */ }; -#ifdef TIF_32BIT +#ifdef CONFIG_COMPAT static int mode1_syscalls_32[] = { __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, 0, /* null terminated */ @@ -37,8 +38,8 @@ void __secure_computing(int this_syscall) switch (mode) { case 1: syscall = mode1_syscalls; -#ifdef TIF_32BIT - if (test_thread_flag(TIF_32BIT)) +#ifdef CONFIG_COMPAT + if (is_compat_task()) syscall = mode1_syscalls_32; #endif do { -- cgit v1.2.3-59-g8ed1b From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2009 12:11:56 -0800 Subject: rcu: increment quiescent state counter in ksoftirqd() If a machine is flooded by network frames, a cpu can loop 100% of its time inside ksoftirqd() without calling schedule(). This can delay RCU grace period to insane values. Adding rcu_qsctr_inc() call in ksoftirqd() solves this problem. Paul: "This regression was a result of the recent change from "schedule()" to "cond_resched()", which got rid of that quiescent state in the common case where a reschedule is not needed". Signed-off-by: Eric Dumazet Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..9041ea7948fe 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); + rcu_qsctr_inc((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3-59-g8ed1b From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 9 Mar 2009 13:31:59 +0100 Subject: Fix fixpoint divide exception in acct_update_integrals Frans Pop reported the crash below when running an s390 kernel under Hercules: Kernel BUG at 000738b4 verbose debug info unavailable! fixpoint divide exception: 0009 #1! SMP Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot dm_mod dasd_eckd_mod dasd_mod CPU: 0 Not tainted 2.6.27.19 #13 Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18) Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 Krnl GPRS: 00000000 000007d0 7fffffff fffff830 00000000 ffffffff 00000002 0f9ed9b8 00000000 00008ca0 00000000 0f9ed9b8 0f9edda4 8007386e 0f4f7ec8 0f4f7e98 Krnl Code: 800738aa: a71807d0 lhi %r1,2000 800738ae: 8c200001 srdl %r2,1 800738b2: 1d21 dr %r2,%r1 >800738b4: 5810d10e l %r1,270(%r13) 800738b8: 1823 lr %r2,%r3 800738ba: 4130f060 la %r3,96(%r15) 800738be: 0de1 basr %r14,%r1 800738c0: 5800f060 l %r0,96(%r15) Call Trace: ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c) <0000000000038502>! do_exit+0x106/0x7c0 <0000000000038c36>! do_group_exit+0x7a/0xb4 <0000000000038c8e>! SyS_exit_group+0x1e/0x30 <0000000000021c28>! sysc_do_restart+0x12/0x16 <0000000077e7e924>! 0x77e7e924 Reason for this is that cpu time accounting usually only happens from interrupt context, but acct_update_integrals gets also called from process context with interrupts enabled. So in acct_update_integrals we may end up with the following scenario: Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt happens which updates accouting values. This causes acct_timexpd to be greater than the former stime + utime. The subsequent calculation of dtime = cputime_sub(time, tsk->acct_timexpd); will be negative and the division performed by cputime_to_jiffies(dtime) will generate an exception since the result won't fit into a 32 bit register. In order to fix this just always disable interrupts while accessing any of the accounting values. Reported by: Frans Pop Tested by: Frans Pop Cc: stable@kernel.org Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 43f891b05a4b..00d59d048edf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk) if (likely(tsk->mm)) { cputime_t time, dtime; struct timeval value; + unsigned long flags; u64 delta; + local_irq_save(flags); time = tsk->stime + tsk->utime; dtime = cputime_sub(time, tsk->acct_timexpd); jiffies_to_timeval(cputime_to_jiffies(dtime), &value); @@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk) delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) - return; + goto out; tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + out: + local_irq_restore(flags); } } -- cgit v1.2.3-59-g8ed1b From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Mar 2009 22:58:45 +0100 Subject: copy_process: fix CLONE_PARENT && parent_exec_id interaction CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we re-use the old parent, we must also re-use ->parent_exec_id to make sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed task exits. Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place two different cases together. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: David Howells Cc: Serge E. Hallyn Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a66fbde20715..4854c2c4a82e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif clear_all_latency_tracing(p); - /* Our parent execution domain becomes current domain - These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; - /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; @@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; - else + p->parent_exec_id = current->parent_exec_id; + } else { p->real_parent = current; + p->parent_exec_id = current->self_exec_id; + } spin_lock(¤t->sighand->siglock); -- cgit v1.2.3-59-g8ed1b From be50b8342dead8cacf57d4839240106b225d31f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 10 Mar 2009 12:55:56 -0700 Subject: kernel/user.c: fix a memory leak when freeing up non-init usernamespaces users We were returning early in the sysfs directory cleanup function if the user belonged to a non init usernamespace. Due to this a lot of the cleanup was not done and we were left with a leak. Fix the leak. Reported-by: Serge Hallyn Signed-off-by: Dhaval Giani Acked-by: Serge Hallyn Tested-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6a9b696128c8..fbb300e6191f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -286,14 +286,12 @@ int __init uids_sysfs_init(void) /* work function to remove sysfs directory for a user and free up * corresponding structures. */ -static void remove_user_sysfs_dir(struct work_struct *w) +static void cleanup_user_struct(struct work_struct *w) { struct user_struct *up = container_of(w, struct user_struct, work); unsigned long flags; int remove_user = 0; - if (up->user_ns != &init_user_ns) - return; /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() * atomic. */ @@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w) if (!remove_user) goto done; - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); + if (up->user_ns == &init_user_ns) { + kobject_uevent(&up->kobj, KOBJ_REMOVE); + kobject_del(&up->kobj); + kobject_put(&up->kobj); + } sched_destroy_user(up); key_put(up->uid_keyring); @@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags) atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); - INIT_WORK(&up->work, remove_user_sysfs_dir); + INIT_WORK(&up->work, cleanup_user_struct); schedule_work(&up->work); } -- cgit v1.2.3-59-g8ed1b From f21cfb258df6dd3ea0b3e56d75c7e994edb81b35 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:42 +0900 Subject: irq: add remove_irq() for freeing of setup_irq() irqs Impact: add new API This patch adds a remove_irq() function for releasing interrupts requested with setup_irq(). Without this patch we have no way of releasing such interrupts since free_irq() today tries to kfree() the irqaction passed with setup_irq(). Signed-off-by: Magnus Damm LKML-Reference: <20090312120542.2926.56609.sendpatchset@rx1.opensource.se> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 1 + kernel/irq/manage.c | 39 ++++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index f899b502f186..56f9988362ec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -236,6 +236,7 @@ typedef struct irq_desc irq_desc_t; #include extern int setup_irq(unsigned int irq, struct irqaction *new); +extern struct irqaction *remove_irq(unsigned int irq, void *dev_id); #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 52ee17135092..8b069a7046e9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -551,20 +551,14 @@ int setup_irq(unsigned int irq, struct irqaction *act) } /** - * free_irq - free an interrupt + * remove_irq - free an interrupt * @irq: Interrupt line to free * @dev_id: Device identity to free * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function must not be called from interrupt context. + * Used to remove interrupts statically setup by the early boot process. */ -void free_irq(unsigned int irq, void *dev_id) + +struct irqaction *remove_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; @@ -573,7 +567,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) - return; + return NULL; spin_lock_irqsave(&desc->lock, flags); @@ -589,7 +583,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); spin_unlock_irqrestore(&desc->lock, flags); - return; + return NULL; } if (action->dev_id == dev_id) @@ -636,7 +630,26 @@ void free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif - kfree(action); + return action; +} + +/** + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. + * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + kfree(remove_irq(irq, dev_id)); } EXPORT_SYMBOL(free_irq); -- cgit v1.2.3-59-g8ed1b From cbf94f06824780183e4bba165c7c29d5c7bd9a51 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:51 +0900 Subject: irq: match remove_irq() args with setup_irq() Modify remove_irq() to match setup_irq(). Signed-off-by: Magnus Damm LKML-Reference: <20090312120551.2926.43942.sendpatchset@rx1.opensource.se> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 +- kernel/irq/manage.c | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 56f9988362ec..737eafbc1f3d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -236,7 +236,7 @@ typedef struct irq_desc irq_desc_t; #include extern int setup_irq(unsigned int irq, struct irqaction *new); -extern struct irqaction *remove_irq(unsigned int irq, void *dev_id); +extern void remove_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b069a7046e9..fc16570c9b46 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -550,15 +550,11 @@ int setup_irq(unsigned int irq, struct irqaction *act) return __setup_irq(irq, desc, act); } -/** - * remove_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Used to remove interrupts statically setup by the early boot process. + /* + * Internal function to unregister an irqaction - used to free + * regular and special interrupts that are part of the architecture. */ - -struct irqaction *remove_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; @@ -633,6 +629,18 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id) return action; } +/** + * remove_irq - free an interrupt + * @irq: Interrupt line to free + * @act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_irq(unsigned int irq, struct irqaction *act) +{ + __free_irq(irq, act->dev_id); +} + /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free @@ -649,7 +657,7 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id) */ void free_irq(unsigned int irq, void *dev_id) { - kfree(remove_irq(irq, dev_id)); + kfree(__free_irq(irq, dev_id)); } EXPORT_SYMBOL(free_irq); -- cgit v1.2.3-59-g8ed1b From eb53b4e8fef10ccccb49a6dbb5e19ca84ba5a305 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:59 +0900 Subject: irq: export remove_irq() and setup_irq() symbols Export the setup_irq() and remove_irq() symbols. I'd like to export these functions since I have timer code that needs to use setup_irq() early on (too early for request_irq()), and the same code can also be compiled as a module. Signed-off-by: Magnus Damm LKML-Reference: <20090312120559.2926.82371.sendpatchset@rx1.opensource.se> [ changed to _GPL as these are special APIs deep inside the irq layer. ] Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fc16570c9b46..e28db0f656ac 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -549,6 +549,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) return __setup_irq(irq, desc, act); } +EXPORT_SYMBOL_GPL(setup_irq); /* * Internal function to unregister an irqaction - used to free @@ -640,6 +641,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) { __free_irq(irq, act->dev_id); } +EXPORT_SYMBOL_GPL(remove_irq); /** * free_irq - free an interrupt allocated with request_irq -- cgit v1.2.3-59-g8ed1b From c8e2aeef0b8ac9fb8821b8b3734c031579d0b77a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Mar 2009 20:26:23 +0100 Subject: genirq: remove redundant if condition Impact: cleanup The code is only compiled if CONFIG_GENERIC_HARDIRQS=y so another check for this define in the code is redundant. Remove it. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e28db0f656ac..4600f877c292 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -15,7 +15,7 @@ #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +#ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; /** -- cgit v1.2.3-59-g8ed1b From 4553573277906901f62f73c0432b332c53de5e2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 22 Feb 2009 23:00:32 +0100 Subject: genirq: use kzalloc instead of explicit zero initialization Impact: simplification Signed-off-by: Thomas Gleixner Reviewed-by: Peter Zijlstra --- kernel/irq/manage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4600f877c292..8a22039a90ba 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -737,15 +737,13 @@ int request_irq(unsigned int irq, irq_handler_t handler, if (!handler) return -EINVAL; - action = kmalloc(sizeof(struct irqaction), GFP_KERNEL); + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags; - cpus_clear(action->mask); action->name = devname; - action->next = NULL; action->dev_id = dev_id; retval = __setup_irq(irq, desc, action); -- cgit v1.2.3-59-g8ed1b From 0e57aa11abb15b70db53d1f95ae70b3c980ac885 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2009 14:34:05 +0100 Subject: genirq: deprecate __do_IRQ Two years migration time is enough. Remove the compability cruft. Add the deprecated warning in kernel/irq/handle.c because marking __do_IRQ itself is way too noisy. Signed-off-by: Thomas Gleixner --- Documentation/feature-removal-schedule.txt | 8 ++++++++ kernel/irq/handle.c | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 20d3b94703a4..63b4550411be 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -344,3 +344,11 @@ Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and Removal is subject to fixing any remaining bugs in ACPI which may cause the thermal throttling not to happen at the right time. Who: Dave Jones , Matthew Garrett + +----------------------------- + +What: __do_IRQ all in one fits nothing interrupt handler +When: 2.6.32 +Why: __do_IRQ was kept for easy migration to the type flow handlers. + More than two years of migration time is enough. +Who: Thomas Gleixner diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a2ee682bca2e..6661704140c7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -349,6 +349,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) } #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ + +#ifdef CONFIG_ENABLE_WARN_DEPRECATED +# warning __do_IRQ is deprecated. Please convert to proper flow handlers +#endif + /** * __do_IRQ - original all in one highlevel IRQ handler * @irq: the interrupt number -- cgit v1.2.3-59-g8ed1b