From c59923a15c12d2b3597af913bf234a0ef264a38b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Jul 2006 04:45:40 -0700 Subject: [PATCH] remove the tasklist_lock export As announced half a year ago this patch will remove the tasklist_lock export. The previous two patches got rid of the remaining modular users. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 56e4e07e45f7..926e5a68ea9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,9 +61,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; - __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -EXPORT_SYMBOL(tasklist_lock); +__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ int nr_processes(void) { -- cgit v1.3-7-g2ca7 From ca74e92b4698276b6696f15a801759f50944f387 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:36 -0700 Subject: [PATCH] per-task-delay-accounting: setup Initialization code related to collection of per-task "delay" statistics which measure how long it had to wait for cpu, sync block io, swapping etc. The collection of statistics and the interface are in other patches. This patch sets up the data structures and allows the statistics collection to be disabled through a kernel boot parameter. 
Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 + include/linux/delayacct.h | 69 +++++++++++++++++++++++++++++ include/linux/sched.h | 20 +++++++++ include/linux/time.h | 12 +++++ init/Kconfig | 10 +++++ init/main.c | 2 + kernel/Makefile | 1 + kernel/delayacct.c | 87 +++++++++++++++++++++++++++++++++++++ kernel/exit.c | 2 + kernel/fork.c | 2 + 10 files changed, 207 insertions(+) create mode 100644 include/linux/delayacct.h create mode 100644 kernel/delayacct.c (limited to 'kernel/fork.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 149f62ba14a5..e11f7728ec6f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -448,6 +448,8 @@ running once the system is up. Format: [,] See also Documentation/networking/decnet.txt. + delayacct [KNL] Enable per-task delay accounting + dhash_entries= [KNL] Set number of hash buckets for dentry cache. diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h new file mode 100644 index 000000000000..9572cfa1f129 --- /dev/null +++ b/include/linux/delayacct.h @@ -0,0 +1,69 @@ +/* delayacct.h - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + */ + +#ifndef _LINUX_DELAYACCT_H +#define _LINUX_DELAYACCT_H + +#include + +#ifdef CONFIG_TASK_DELAY_ACCT + +extern int delayacct_on; /* Delay accounting turned on/off */ +extern kmem_cache_t *delayacct_cache; +extern void delayacct_init(void); +extern void __delayacct_tsk_init(struct task_struct *); +extern void __delayacct_tsk_exit(struct task_struct *); + +static inline void delayacct_set_flag(int flag) +{ + if (current->delays) + current->delays->flags |= flag; +} + +static inline void delayacct_clear_flag(int flag) +{ + if (current->delays) + current->delays->flags &= ~flag; +} + +static inline void delayacct_tsk_init(struct task_struct *tsk) +{ + /* reinitialize in case parent's non-null pointer was dup'ed*/ + tsk->delays = NULL; + if (unlikely(delayacct_on)) + __delayacct_tsk_init(tsk); +} + +static inline void delayacct_tsk_exit(struct task_struct *tsk) +{ + if (tsk->delays) + __delayacct_tsk_exit(tsk); +} + +#else +static inline void delayacct_set_flag(int flag) +{} +static inline void delayacct_clear_flag(int flag) +{} +static inline void delayacct_init(void) +{} +static inline void delayacct_tsk_init(struct task_struct *tsk) +{} +static inline void delayacct_tsk_exit(struct task_struct *tsk) +{} +#endif /* CONFIG_TASK_DELAY_ACCT */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c876e27ff93..7a54e62763c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -552,6 +552,23 @@ struct sched_info { extern struct file_operations proc_schedstat_operations; #endif +#ifdef CONFIG_TASK_DELAY_ACCT +struct task_delay_info { + spinlock_t lock; + unsigned int flags; /* Private per-task flags */ + + /* For each stat XXX, add following, aligned appropriately + * + * struct timespec XXX_start, XXX_end; + * u64 XXX_delay; + * u32 XXX_count; + * + * Atomicity of updates to XXX_delay, XXX_count protected by + * single lock above (split into XXX_lock if contention is an issue). 
+ */ +}; +#endif + enum idle_type { SCHED_IDLE, @@ -945,6 +962,9 @@ struct task_struct { * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; +#ifdef CONFIG_TASK_DELAY_ACCT + struct task_delay_info *delays; +#endif }; static inline pid_t process_group(struct task_struct *tsk) diff --git a/include/linux/time.h b/include/linux/time.h index c05f8bb9a323..a5b739967b74 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -70,6 +70,18 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon, extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec); +/* + * sub = lhs - rhs, in normalized form + */ +static inline struct timespec timespec_sub(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, + lhs.tv_nsec - rhs.tv_nsec); + return ts_delta; +} + /* * Returns true if the timespec is norm, false if denorm: */ diff --git a/init/Kconfig b/init/Kconfig index a5b073a103e7..90498a3e53da 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -158,6 +158,16 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at . +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting (EXPERIMENTAL)" + help + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. + + Say N if unsure. 
+ config SYSCTL bool "Sysctl support" if EMBEDDED default y diff --git a/init/main.c b/init/main.c index 628b8e9e841a..9e8e8c152142 100644 --- a/init/main.c +++ b/init/main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -574,6 +575,7 @@ asmlinkage void __init start_kernel(void) proc_root_init(); #endif cpuset_init(); + delayacct_init(); check_bugs(); diff --git a/kernel/Makefile b/kernel/Makefile index 47dbcd570cd8..87bb34cc8938 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..fbf7f2284952 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,87 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include + +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; + return 1; +} +__setup("delayacct", delayacct_setup_enable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +void __delayacct_tsk_exit(struct task_struct *tsk) +{ + kmem_cache_free(delayacct_cache, tsk->delays); + tsk->delays = NULL; +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(&current->delays->lock); + *total += ns; + (*count)++; + spin_unlock(&current->delays->lock); +} + diff --git a/kernel/exit.c b/kernel/exit.c index 6664c084783d..3c2cf91defa7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + delayacct_tsk_exit(tsk); exit_mm(tsk); if (group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index 926e5a68ea9e..451cfd35bf22 100644 --- a/kernel/fork.c 
+++ b/kernel/fork.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1000,6 +1001,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; -- cgit v1.3-7-g2ca7 From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:44 -0700 Subject: [PATCH] delay accounting taskstats interface send tgid once Send per-tgid data only once during exit of a thread group instead of once with each member thread exit. Currently, when a thread exits, besides its per-tid data, the per-tgid data of its thread group is also sent out, if its thread group is non-empty. The per-tgid data sent consists of the sum of per-tid stats for all *remaining* threads of the thread group. This patch modifies this sending in two ways: - the per-tgid data is sent only when the last thread of a thread group exits. This cuts down heavily on the overhead of sending/receiving per-tgid data, especially when other exploiters of the taskstats interface aren't interested in per-tgid stats - the semantics of the per-tgid data sent are changed. Instead of being the sum of per-tid data for remaining threads, the value now sent is the true total accumulated statistics for all threads that are/were part of the thread group. The patch also addresses a minor issue where failure of one accounting subsystem to fill in the taskstats structure was causing taskstats to not be sent at all. The patch has been tested for stability and run cerberus for over 4 hours on an SMP. 
[akpm@osdl.org: bugfixes] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/delay-accounting.txt | 13 ++-- Documentation/accounting/taskstats.txt | 33 ++++----- MAINTAINERS | 12 ++++ include/linux/sched.h | 4 ++ include/linux/taskstats_kern.h | 71 ++++++++++++++----- kernel/exit.c | 8 +-- kernel/fork.c | 4 ++ kernel/taskstats.c | 98 ++++++++++++++++++--------- 8 files changed, 162 insertions(+), 81 deletions(-) (limited to 'kernel/fork.c') diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt index f3dc0ca04fa4..be215e58423b 100644 --- a/Documentation/accounting/delay-accounting.txt +++ b/Documentation/accounting/delay-accounting.txt @@ -48,9 +48,10 @@ counter (say cpu_delay_total) for a task will give the delay experienced by the task waiting for the corresponding resource in that interval. -When a task exits, records containing the per-task and per-process statistics -are sent to userspace without requiring a command. More details are given in -the taskstats interface description. +When a task exits, records containing the per-task statistics +are sent to userspace without requiring a command. If it is the last exiting +task of a thread group, the per-tgid statistics are also sent. More details +are given in the taskstats interface description. The getdelays.c userspace utility in this directory allows simple commands to be run and the corresponding delay statistics to be displayed. 
It also serves @@ -107,9 +108,3 @@ IO count delay total 0 0 MEM count delay total 0 0 - - - - - - diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt index acc6b4f37fc7..efd8f605bcd5 100644 --- a/Documentation/accounting/taskstats.txt +++ b/Documentation/accounting/taskstats.txt @@ -32,12 +32,11 @@ The response contains statistics for a task (if pid is specified) or the sum of statistics for all tasks of the process (if tgid is specified). To obtain statistics for tasks which are exiting, userspace opens a multicast -netlink socket. Each time a task exits, two records are sent by the kernel to -each listener on the multicast socket. The first the per-pid task's statistics -and the second is the sum for all tasks of the process to which the task -belongs (the task does not need to be the thread group leader). The need for -per-tgid stats to be sent for each exiting task is explained in the per-tgid -stats section below. +netlink socket. Each time a task exits, its per-pid statistics is always sent +by the kernel to each listener on the multicast socket. In addition, if it is +the last thread exiting its thread group, an additional record containing the +per-tgid stats are also sent. The latter contains the sum of per-pid stats for +all threads in the thread group, both past and present. getdelays.c is a simple utility demonstrating usage of the taskstats interface for reporting delay accounting statistics. @@ -104,20 +103,14 @@ stats in userspace alone is inefficient and potentially inaccurate (due to lack of atomicity). However, maintaining per-process, in addition to per-task stats, within the -kernel has space and time overheads. Hence the taskstats implementation -dynamically sums up the per-task stats for each task belonging to a process -whenever per-process stats are needed. - -Not maintaining per-tgid stats creates a problem when userspace is interested -in getting these stats when the process dies i.e. 
the last thread of -a process exits. It isn't possible to simply return some aggregated per-process -statistic from the kernel. - -The approach taken by taskstats is to return the per-tgid stats *each* time -a task exits, in addition to the per-pid stats for that task. Userspace can -maintain task<->process mappings and use them to maintain the per-process stats -in userspace, updating the aggregate appropriately as the tasks of a process -exit. +kernel has space and time overheads. To address this, the taskstats code +accumalates each exiting task's statistics into a process-wide data structure. +When the last task of a process exits, the process level data accumalated also +gets sent to userspace (along with the per-task data). + +When a user queries to get per-tgid data, the sum of all other live threads in +the group is added up and added to the accumalated total for previously exited +threads of the same thread group. Extending taskstats ------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 0557cfde053d..e99028ca2f7c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2240,6 +2240,12 @@ M: tsbogend@alpha.franken.de L: netdev@vger.kernel.org S: Maintained +PER-TASK DELAY ACCOUNTING +P: Shailabh Nagar +M: nagar@watson.ibm.com +L: linux-kernel@vger.kernel.org +S: Maintained + PERSONALITY HANDLING P: Christoph Hellwig M: hch@infradead.org @@ -2767,6 +2773,12 @@ P: Deepak Saxena M: dsaxena@plexity.net S: Maintained +TASKSTATS STATISTICS INTERFACE +P: Shailabh Nagar +M: nagar@watson.ibm.com +L: linux-kernel@vger.kernel.org +S: Maintained + TI PARALLEL LINK CABLE DRIVER P: Romain Lievin M: roms@lpg.ticalc.org diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c5610ca0c92..6afa72e080cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -463,6 +463,10 @@ struct signal_struct { #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif +#ifdef CONFIG_TASKSTATS + spinlock_t stats_lock; + 
struct taskstats *stats; +#endif }; /* Context switch must be unlocked if interrupts are to be enabled */ diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index fc9da2e26443..0ae8f67af1fd 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -19,36 +19,75 @@ enum { extern kmem_cache_t *taskstats_cache; extern struct mutex taskstats_exit_mutex; -static inline void taskstats_exit_alloc(struct taskstats **ptidstats, - struct taskstats **ptgidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats) { *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); - *ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); } -static inline void taskstats_exit_free(struct taskstats *tidstats, - struct taskstats *tgidstats) +static inline void taskstats_exit_free(struct taskstats *tidstats) { if (tidstats) kmem_cache_free(taskstats_cache, tidstats); - if (tgidstats) - kmem_cache_free(taskstats_cache, tgidstats); } -extern void taskstats_exit_send(struct task_struct *, struct taskstats *, - struct taskstats *); -extern void taskstats_init_early(void); +static inline void taskstats_tgid_init(struct signal_struct *sig) +{ + spin_lock_init(&sig->stats_lock); + sig->stats = NULL; +} + +static inline void taskstats_tgid_alloc(struct signal_struct *sig) +{ + struct taskstats *stats; + unsigned long flags; + + stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!stats) + return; + + spin_lock_irqsave(&sig->stats_lock, flags); + if (!sig->stats) { + sig->stats = stats; + stats = NULL; + } + spin_unlock_irqrestore(&sig->stats_lock, flags); + + if (stats) + kmem_cache_free(taskstats_cache, stats); +} +static inline void taskstats_tgid_free(struct signal_struct *sig) +{ + struct taskstats *stats = NULL; + unsigned long flags; + + spin_lock_irqsave(&sig->stats_lock, flags); + if (sig->stats) { + stats = sig->stats; + sig->stats = NULL; + } + spin_unlock_irqrestore(&sig->stats_lock, flags); + 
if (stats) + kmem_cache_free(taskstats_cache, stats); +} + +extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); +extern void taskstats_init_early(void); +extern void taskstats_tgid_alloc(struct signal_struct *); #else -static inline void taskstats_exit_alloc(struct taskstats **ptidstats, - struct taskstats **ptgidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats) {} -static inline void taskstats_exit_free(struct taskstats *ptidstats, - struct taskstats *ptgidstats) +static inline void taskstats_exit_free(struct taskstats *ptidstats) {} static inline void taskstats_exit_send(struct task_struct *tsk, - struct taskstats *tidstats, - struct taskstats *tgidstats) + struct taskstats *tidstats, + int group_dead) +{} +static inline void taskstats_tgid_init(struct signal_struct *sig) +{} +static inline void taskstats_tgid_alloc(struct signal_struct *sig) +{} +static inline void taskstats_tgid_free(struct signal_struct *sig) {} static inline void taskstats_init_early(void) {} diff --git a/kernel/exit.c b/kernel/exit.c index 9852ed8c2988..67c1e9a4f812 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - struct taskstats *tidstats, *tgidstats; + struct taskstats *tidstats; int group_dead; profile_task_exit(tsk); @@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats, &tgidstats); + taskstats_exit_alloc(&tidstats); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, tgidstats); - taskstats_exit_free(tidstats, tgidstats); + taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); exit_mm(tsk); diff 
--git a/kernel/fork.c b/kernel/fork.c index 451cfd35bf22..1b0f7b1e0881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(&current->signal->count); atomic_inc(&current->signal->live); + taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea9506de3b85..4a0a5022b299 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, struct taskstats *stats) { - int rc; struct task_struct *tsk, *first; + unsigned long flags; + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ first = tgidtsk; - read_lock(&tasklist_lock); if (!first) { + read_lock(&tasklist_lock); first = find_task_by_pid(tgid); if (!first) { read_unlock(&tasklist_lock); return -ESRCH; } - } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + 
memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + tsk = first; + read_lock(&tasklist_lock); do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; /* - * Each accounting subsystem adds calls its functions to + * Accounting subsystem can call its functions here to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * break; + * per-task-foo(stats, tsk); */ - - rc = delayacct_add_tsk(stats, tsk); - if (rc) - break; + delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); read_unlock(&tasklist_lock); stats->version = TASKSTATS_VERSION; - /* - * Accounting subsytems can also add calls here if they don't - * wish to aggregate statistics for per-tgid stats + * Accounting subsytems can also add calls here to modify + * fields of taskstats. */ - return rc; + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; } + static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) { int rc = 0; @@ -230,7 +263,7 @@ err: /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - struct taskstats *tgidstats) + int group_dead) { int rc; struct sk_buff *rep_skb; @@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; + unsigned long flags; if (!family_registered || !tidstats) return; - is_thread_group = 
!thread_group_empty(tsk); - rc = 0; + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + rc = 0; /* * Size includes space for nested attributes */ @@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, *tidstats); nla_nest_end(rep_skb, na); - if (!is_thread_group || !tgidstats) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + if (!is_thread_group) + goto send; - rc = fill_tgid(tsk->pid, tsk, tgidstats); /* - * If fill_tgid() failed then one probable reason could be that the - * thread group leader has exited. fill_tgid() will fail, send out - * the pid statistics collected earlier. + * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving */ - if (rc < 0) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tgidstats); + *tsk->signal->stats); nla_nest_end(rep_skb, na); +send: send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; + return; nla_put_failure: genlmsg_cancel(rep_skb, reply); -- cgit v1.3-7-g2ca7 From 9f59ce5d0e0dd837853385927b150f5cef3a7f52 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Sat, 5 Aug 2006 12:14:11 -0700 Subject: [PATCH] ptrace: make pid of child process available for PTRACE_EVENT_VFORK_DONE When delivering PTRACE_EVENT_VFORK_DONE, provide pid of the child process when tracer calls ptrace(PTRACE_GETEVENTMSG). This is already (accidentally) available when the tracer is tracing VFORK in addition to VFORK_DONE. 
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Cc: Daniel Jacobowitz Cc: Albert Cahalan Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 1b0f7b1e0881..aa36c43783cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1387,8 +1387,10 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { wait_for_completion(&vfork); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) + if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { + current->ptrace_message = nr; ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + } } } else { free_pid(pid); -- cgit v1.3-7-g2ca7 From 35df17c57cecb08f0120fb18926325f1093dc429 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Thu, 31 Aug 2006 21:27:38 -0700 Subject: [PATCH] task delay accounting fixes Cleanup allocation and freeing of tsk->delays used by delay accounting. This solves two problems reported for delay accounting: 1. oops in __delayacct_blkio_ticks http://www.uwsg.indiana.edu/hypermail/linux/kernel/0608.2/1844.html Currently tsk->delays is getting freed too early in task exit which can cause a NULL tsk->delays to get accessed via reading of /proc//stats. The patch fixes this problem by freeing tsk->delays closer to when task_struct itself is freed up. As a result, it also eliminates the use of tsk->delays_lock which was only being used (inadequately) to safeguard access to tsk->delays while a task was exiting. 2. Possible memory leak in kernel/delayacct.c http://www.uwsg.indiana.edu/hypermail/linux/kernel/0608.2/1389.html The patch cleans up tsk->delays allocations after a bad fork which was missing earlier. 
The patch has been tested to fix the problems listed above and stress tested with rapid calls to delay accounting's taskstats command interface (which is the other path that can access the same data, besides the /proc interface causing the oops above). Signed-off-by: Shailabh Nagar Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 10 +++++++--- include/linux/sched.h | 1 - kernel/delayacct.c | 16 ---------------- kernel/exit.c | 1 - kernel/fork.c | 6 ++++-- 5 files changed, 11 insertions(+), 23 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 11487b6e7127..561e2a77805c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -59,10 +59,14 @@ static inline void delayacct_tsk_init(struct task_struct *tsk) __delayacct_tsk_init(tsk); } -static inline void delayacct_tsk_exit(struct task_struct *tsk) +/* Free tsk->delays. Called from bad fork and __put_task_struct + * where there's no risk of tsk->delays being accessed elsewhere + */ +static inline void delayacct_tsk_free(struct task_struct *tsk) { if (tsk->delays) - __delayacct_tsk_exit(tsk); + kmem_cache_free(delayacct_cache, tsk->delays); + tsk->delays = NULL; } static inline void delayacct_blkio_start(void) @@ -101,7 +105,7 @@ static inline void delayacct_init(void) {} static inline void delayacct_tsk_init(struct task_struct *tsk) {} -static inline void delayacct_tsk_exit(struct task_struct *tsk) +static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index 6674fc1e51bf..34ed0d99b1bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -994,7 +994,6 @@ struct task_struct { */ struct pipe_inode_info *splice_pipe; #ifdef CONFIG_TASK_DELAY_ACCT - spinlock_t delays_lock; struct task_delay_info *delays; #endif }; diff --git a/kernel/delayacct.c 
b/kernel/delayacct.c index 57ca3730205d..36752f124c6a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -41,24 +41,11 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { - spin_lock_init(&tsk->delays_lock); - /* No need to acquire tsk->delays_lock for allocation here unless - __delayacct_tsk_init called after tsk is attached to tasklist - */ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); } -void __delayacct_tsk_exit(struct task_struct *tsk) -{ - struct task_delay_info *delays = tsk->delays; - spin_lock(&tsk->delays_lock); - tsk->delays = NULL; - spin_unlock(&tsk->delays_lock); - kmem_cache_free(delayacct_cache, delays); -} - /* * Start accounting for a delay statistic using * its starting timestamp (@start) @@ -118,8 +105,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) struct timespec ts; unsigned long t1,t2,t3; - spin_lock(&tsk->delays_lock); - /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data */ @@ -161,7 +146,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) spin_unlock(&tsk->delays->lock); done: - spin_unlock(&tsk->delays_lock); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index dba194a8d416..a4c19a52ce46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -908,7 +908,6 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); taskstats_exit_send(tsk, tidstats, group_dead, mycpu); taskstats_exit_free(tidstats); - delayacct_tsk_exit(tsk); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index aa36c43783cc..f9b014e3e700 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,6 +117,7 @@ void __put_task_struct(struct task_struct *tsk) security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); + delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -1011,7 +1012,7 @@ static struct task_struct 
*copy_process(unsigned long clone_flags, retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup; + goto bad_fork_cleanup_delays_binfmt; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); @@ -1277,7 +1278,8 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cpuset: #endif cpuset_exit(p); -bad_fork_cleanup: +bad_fork_cleanup_delays_binfmt: + delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); bad_fork_cleanup_put_domain: -- cgit v1.3-7-g2ca7