From ca74e92b4698276b6696f15a801759f50944f387 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:36 -0700
Subject: [PATCH] per-task-delay-accounting: setup

Initialization code related to collection of per-task "delay" statistics which
measure how long it had to wait for cpu, sync block io, swapping etc.  The
collection of statistics and the interface are in other patches.  This patch
sets up the data structures and allows the statistics collection to be
disabled through a kernel boot parameter.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6664c084783d..3c2cf91defa7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
+	delayacct_tsk_exit(tsk);
 	exit_mm(tsk);
 
 	if (group_dead)
-- 
cgit v1.3-7-g2ca7


From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:40 -0700
Subject: [PATCH] per-task-delay-accounting: taskstats interface

Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC
family), for getting statistics of tasks and thread groups during their
lifetime and when they exit.  The interface is intended for use by multiple
accounting packages though it is being created in the context of delay
accounting.

This patch creates the interface without populating the fields of the data
that is sent to the user in response to a command or upon the exit of a task.
Each accounting package interested in using taskstats has to provide an
additional patch to add its stats to the common structure.

[akpm@osdl.org: cleanups, Kconfig fix]
Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/accounting/taskstats.txt | 146 ++++++++++++++
 include/linux/taskstats.h              |  84 +++++++++
 include/linux/taskstats_kern.h         |  57 ++++++
 init/Kconfig                           |  13 ++
 init/main.c                            |   2 +
 kernel/Makefile                        |   1 +
 kernel/exit.c                          |   7 +
 kernel/taskstats.c                     | 336 +++++++++++++++++++++++++++++++++
 8 files changed, 646 insertions(+)
 create mode 100644 Documentation/accounting/taskstats.txt
 create mode 100644 include/linux/taskstats.h
 create mode 100644 include/linux/taskstats_kern.h
 create mode 100644 kernel/taskstats.c

(limited to 'kernel/exit.c')

diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
new file mode 100644
index 000000000000..ad9b6997e162
--- /dev/null
+++ b/Documentation/accounting/taskstats.txt
@@ -0,0 +1,146 @@
+Per-task statistics interface
+-----------------------------
+
+
+Taskstats is a netlink-based interface for sending per-task and
+per-process statistics from the kernel to userspace.
+
+Taskstats was designed for the following benefits:
+
+- efficiently provide statistics during lifetime of a task and on its exit
+- unified interface for multiple accounting subsystems
+- extensibility for use by future accounting patches
+
+Terminology
+-----------
+
+"pid", "tid" and "task" are used interchangeably and refer to the standard
+Linux task defined by struct task_struct.  per-pid stats are the same as
+per-task stats.
+
+"tgid", "process" and "thread group" are used interchangeably and refer to the
+tasks that share an mm_struct i.e. the traditional Unix process. Despite the
+use of tgid, there is no special treatment for the task that is thread group
+leader - a process is deemed alive as long as it has any task belonging to it.
+
+Usage
+-----
+
+To get statistics during task's lifetime, userspace opens a unicast netlink
+socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
+The response contains statistics for a task (if pid is specified) or the sum of
+statistics for all tasks of the process (if tgid is specified).
+
+To obtain statistics for tasks which are exiting, userspace opens a multicast
+netlink socket. Each time a task exits, two records are sent by the kernel to
+each listener on the multicast socket. The first the per-pid task's statistics
+and the second is the sum for all tasks of the process to which the task
+belongs (the task does not need to be the thread group leader). The need for
+per-tgid stats to be sent for each exiting task is explained in the per-tgid
+stats section below.
+
+
+Interface
+---------
+
+The user-kernel interface is encapsulated in include/linux/taskstats.h
+
+To avoid this documentation becoming obsolete as the interface evolves, only
+an outline of the current version is given. taskstats.h always overrides the
+description here.
+
+struct taskstats is the common accounting structure for both per-pid and
+per-tgid data. It is versioned and can be extended by each accounting subsystem
+that is added to the kernel. The fields and their semantics are defined in the
+taskstats.h file.
+
+The data exchanged between user and kernel space is a netlink message belonging
+to the NETLINK_GENERIC family and using the netlink attributes interface.
+The messages are in the format
+
+    +----------+- - -+-------------+-------------------+
+    | nlmsghdr | Pad |  genlmsghdr | taskstats payload |
+    +----------+- - -+-------------+-------------------+
+
+
+The taskstats payload is one of the following three kinds:
+
+1. Commands: Sent from user to kernel. The payload is one attribute, of type
+TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute
+payload. The pid/tgid denotes the task/process for which userspace wants
+statistics.
+
+2. Response for a command: sent from the kernel in response to a userspace
+command. The payload is a series of three attributes of type:
+
+a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
+a pid/tgid will be followed by some stats.
+
+b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
+is being returned.
+
+c) TASKSTATS_TYPE_STATS: attribute with a struct taskstsats as payload. The
+same structure is used for both per-pid and per-tgid stats.
+
+3. New message sent by kernel whenever a task exits. The payload consists of a
+   series of attributes of the following type:
+
+a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
+b) TASKSTATS_TYPE_PID: contains exiting task's pid
+c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
+d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
+e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
+f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
+
+
+per-tgid stats
+--------------
+
+Taskstats provides per-process stats, in addition to per-task stats, since
+resource management is often done at a process granularity and aggregating task
+stats in userspace alone is inefficient and potentially inaccurate (due to lack
+of atomicity).
+
+However, maintaining per-process, in addition to per-task stats, within the
+kernel has space and time overheads. Hence the taskstats implementation
+dynamically sums up the per-task stats for each task belonging to a process
+whenever per-process stats are needed.
+
+Not maintaining per-tgid stats creates a problem when userspace is interested
+in getting these stats when the process dies i.e. the last thread of
+a process exits. It isn't possible to simply return some aggregated per-process
+statistic from the kernel.
+
+The approach taken by taskstats is to return the per-tgid stats *each* time
+a task exits, in addition to the per-pid stats for that task. Userspace can
+maintain task<->process mappings and use them to maintain the per-process stats
+in userspace, updating the aggregate appropriately as the tasks of a process
+exit.
+
+Extending taskstats
+-------------------
+
+There are two ways to extend the taskstats interface to export more
+per-task/process stats as patches to collect them get added to the kernel
+in future:
+
+1. Adding more fields to the end of the existing struct taskstats. Backward
+   compatibility is ensured by the version number within the
+   structure. Userspace will use only the fields of the struct that correspond
+   to the version its using.
+
+2. Defining separate statistic structs and using the netlink attributes
+   interface to return them. Since userspace processes each netlink attribute
+   independently, it can always ignore attributes whose type it does not
+   understand (because it is using an older version of the interface).
+
+
+Choosing between 1. and 2. is a matter of trading off flexibility and
+overhead. If only a few fields need to be added, then 1. is the preferable
+path since the kernel and userspace don't need to incur the overhead of
+processing new netlink attributes. But if the new fields expand the existing
+struct too much, requiring disparate userspace accounting utilities to
+unnecessarily receive large structures whose fields are of no interest, then
+extending the attributes structure would be worthwhile.
+
+----
diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
new file mode 100644
index 000000000000..51f62759bea9
--- /dev/null
+++ b/include/linux/taskstats.h
@@ -0,0 +1,84 @@
+/* taskstats.h - exporting per-task statistics
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef _LINUX_TASKSTATS_H
+#define _LINUX_TASKSTATS_H
+
+/* Format for per-task data returned to userland when
+ *	- a task exits
+ *	- listener requests stats for a task
+ *
+ * The struct is versioned. Newer versions should only add fields to
+ * the bottom of the struct to maintain backward compatibility.
+ *
+ *
+ * To add new fields
+ *	a) bump up TASKSTATS_VERSION
+ *	b) add comment indicating new version number at end of struct
+ *	c) add new fields after version comment; maintain 64-bit alignment
+ */
+
+#define TASKSTATS_VERSION	1
+
+struct taskstats {
+
+	/* Version 1 */
+	__u64	version;
+};
+
+
+#define TASKSTATS_LISTEN_GROUP	0x1
+
+/*
+ * Commands sent from userspace
+ * Not versioned. New commands should only be inserted at the enum's end
+ * prior to __TASKSTATS_CMD_MAX
+ */
+
+enum {
+	TASKSTATS_CMD_UNSPEC = 0,	/* Reserved */
+	TASKSTATS_CMD_GET,		/* user->kernel request/get-response */
+	TASKSTATS_CMD_NEW,		/* kernel->user event */
+	__TASKSTATS_CMD_MAX,
+};
+
+#define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1)
+
+enum {
+	TASKSTATS_TYPE_UNSPEC = 0,	/* Reserved */
+	TASKSTATS_TYPE_PID,		/* Process id */
+	TASKSTATS_TYPE_TGID,		/* Thread group id */
+	TASKSTATS_TYPE_STATS,		/* taskstats structure */
+	TASKSTATS_TYPE_AGGR_PID,	/* contains pid + stats */
+	TASKSTATS_TYPE_AGGR_TGID,	/* contains tgid + stats */
+	__TASKSTATS_TYPE_MAX,
+};
+
+#define TASKSTATS_TYPE_MAX (__TASKSTATS_TYPE_MAX - 1)
+
+enum {
+	TASKSTATS_CMD_ATTR_UNSPEC = 0,
+	TASKSTATS_CMD_ATTR_PID,
+	TASKSTATS_CMD_ATTR_TGID,
+	__TASKSTATS_CMD_ATTR_MAX,
+};
+
+#define TASKSTATS_CMD_ATTR_MAX (__TASKSTATS_CMD_ATTR_MAX - 1)
+
+/* NETLINK_GENERIC related info */
+
+#define TASKSTATS_GENL_NAME	"TASKSTATS"
+#define TASKSTATS_GENL_VERSION	0x1
+
+#endif /* _LINUX_TASKSTATS_H */
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
new file mode 100644
index 000000000000..bd0ecb969c26
--- /dev/null
+++ b/include/linux/taskstats_kern.h
@@ -0,0 +1,57 @@
+/* taskstats_kern.h - kernel header for per-task statistics interface
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ */
+
+#ifndef _LINUX_TASKSTATS_KERN_H
+#define _LINUX_TASKSTATS_KERN_H
+
+#include <linux/taskstats.h>
+#include <linux/sched.h>
+
+enum {
+	TASKSTATS_MSG_UNICAST,		/* send data only to requester */
+	TASKSTATS_MSG_MULTICAST,	/* send data to a group */
+};
+
+#ifdef CONFIG_TASKSTATS
+extern kmem_cache_t *taskstats_cache;
+
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
+					struct taskstats **ptgidstats)
+{
+	*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	*ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+}
+
+static inline void taskstats_exit_free(struct taskstats *tidstats,
+					struct taskstats *tgidstats)
+{
+	if (tidstats)
+		kmem_cache_free(taskstats_cache, tidstats);
+	if (tgidstats)
+		kmem_cache_free(taskstats_cache, tgidstats);
+}
+
+extern void taskstats_exit_send(struct task_struct *, struct taskstats *,
+				struct taskstats *);
+extern void taskstats_init_early(void);
+
+#else
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
+					struct taskstats **ptgidstats)
+{}
+static inline void taskstats_exit_free(struct taskstats *ptidstats,
+					struct taskstats *ptgidstats)
+{}
+static inline void taskstats_exit_send(struct task_struct *tsk,
+					struct taskstats *tidstats,
+					struct taskstats *tgidstats)
+{}
+static inline void taskstats_init_early(void)
+{}
+#endif /* CONFIG_TASKSTATS */
+
+#endif
+
diff --git a/init/Kconfig b/init/Kconfig
index 90498a3e53da..56a7093b4e4c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -158,6 +158,19 @@ config BSD_PROCESS_ACCT_V3
 	  for processing it. A preliminary version of these tools is available
 	  at <http://www.physik3.uni-rostock.de/tim/kernel/utils/acct/>.
 
+config TASKSTATS
+	bool "Export task/process statistics through netlink (EXPERIMENTAL)"
+	depends on NET
+	default n
+	help
+	  Export selected statistics for tasks/processes through the
+	  generic netlink interface. Unlike BSD process accounting, the
+	  statistics are available during the lifetime of tasks/processes as
+	  responses to commands. Like BSD accounting, they are sent to user
+	  space on task exit.
+
+	  Say N if unsure.
+
 config TASK_DELAY_ACCT
 	bool "Enable per-task delay accounting (EXPERIMENTAL)"
 	help
diff --git a/init/main.c b/init/main.c
index 9e8e8c152142..8651a720a092 100644
--- a/init/main.c
+++ b/init/main.c
@@ -41,6 +41,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/efi.h>
+#include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
@@ -575,6 +576,7 @@ asmlinkage void __init start_kernel(void)
 	proc_root_init();
 #endif
 	cpuset_init();
+	taskstats_init_early();
 	delayacct_init();
 
 	check_bugs();
diff --git a/kernel/Makefile b/kernel/Makefile
index 87bb34cc8938..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index 3c2cf91defa7..9852ed8c2988 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
@@ -844,6 +845,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct taskstats *tidstats, *tgidstats;
 	int group_dead;
 
 	profile_task_exit(tsk);
@@ -882,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
+	taskstats_exit_alloc(&tidstats, &tgidstats);
+
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -901,7 +905,10 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
+	taskstats_exit_send(tsk, tidstats, tgidstats);
+	taskstats_exit_free(tidstats, tgidstats);
 	delayacct_tsk_exit(tsk);
+
 	exit_mm(tsk);
 
 	if (group_dead)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..82ec9137d908
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,336 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+static DEFINE_MUTEX(taskstats_exit_mutex);
+
+static struct genl_family family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= TASKSTATS_GENL_NAME,
+	.version	= TASKSTATS_GENL_VERSION,
+	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+};
+
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+			void **replyp, size_t size)
+{
+	struct sk_buff *skb;
+	void *reply;
+
+	/*
+	 * If new attributes are added, please revisit this allocation
+	 */
+	skb = nlmsg_new(size);
+	if (!skb)
+		return -ENOMEM;
+
+	if (!info) {
+		int seq = get_cpu_var(taskstats_seqnum)++;
+		put_cpu_var(taskstats_seqnum);
+
+		reply = genlmsg_put(skb, 0, seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	} else
+		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	if (reply == NULL) {
+		nlmsg_free(skb);
+		return -EINVAL;
+	}
+
+	*skbp = skb;
+	*replyp = reply;
+	return 0;
+}
+
+static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	void *reply;
+	int rc;
+
+	reply = genlmsg_data(genlhdr);
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	if (event == TASKSTATS_MSG_MULTICAST)
+		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
+	return genlmsg_unicast(skb, pid);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk = pidtsk;
+
+	if (!pidtsk) {
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (!tsk) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+		get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(tsk);
+
+	/*
+	 * Each accounting subsystem adds calls to its functions to
+	 * fill in relevant parts of struct taskstsats as follows
+	 *
+	 *	rc = per-task-foo(stats, tsk);
+	 *	if (rc)
+	 *		goto err;
+	 */
+
+err:
+	put_task_struct(tsk);
+	return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk, *first;
+
+	first = tgidtsk;
+	read_lock(&tasklist_lock);
+	if (!first) {
+		first = find_task_by_pid(tgid);
+		if (!first) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+	}
+	tsk = first;
+	do {
+		/*
+		 * Each accounting subsystem adds calls its functions to
+		 * fill in relevant parts of struct taskstsats as follows
+		 *
+		 *	rc = per-task-foo(stats, tsk);
+		 *	if (rc)
+		 *		break;
+		 */
+
+	} while_each_thread(first, tsk);
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Accounting subsytems can also add calls here if they don't
+	 * wish to aggregate statistics for per-tgid stats
+	 */
+
+	return rc;
+}
+
+static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+{
+	int rc = 0;
+	struct sk_buff *rep_skb;
+	struct taskstats stats;
+	void *reply;
+	size_t size;
+	struct nlattr *na;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	memset(&stats, 0, sizeof(stats));
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		return rc;
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+		rc = fill_pid(pid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+		rc = fill_tgid(tgid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	nla_nest_end(rep_skb, na);
+
+	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+
+nla_put_failure:
+	return genlmsg_cancel(rep_skb, reply);
+err:
+	nlmsg_free(rep_skb);
+	return rc;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+			struct taskstats *tgidstats)
+{
+	int rc;
+	struct sk_buff *rep_skb;
+	void *reply;
+	size_t size;
+	int is_thread_group;
+	struct nlattr *na;
+
+	if (!family_registered || !tidstats)
+		return;
+
+	mutex_lock(&taskstats_exit_mutex);
+
+	is_thread_group = !thread_group_empty(tsk);
+	rc = 0;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	if (is_thread_group)
+		size = 2 * size;	/* PID + STATS + TGID + STATS */
+
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		goto ret;
+
+	rc = fill_pid(tsk->pid, tsk, tidstats);
+	if (rc < 0)
+		goto err_skb;
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tidstats);
+	nla_nest_end(rep_skb, na);
+
+	if (!is_thread_group || !tgidstats) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	rc = fill_tgid(tsk->pid, tsk, tgidstats);
+	/*
+	 * If fill_tgid() failed then one probable reason could be that the
+	 * thread group leader has exited. fill_tgid() will fail, send out
+	 * the pid statistics collected earlier.
+	 */
+	if (rc < 0) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tgidstats);
+	nla_nest_end(rep_skb, na);
+
+	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	goto ret;
+
+nla_put_failure:
+	genlmsg_cancel(rep_skb, reply);
+	goto ret;
+err_skb:
+	nlmsg_free(rep_skb);
+ret:
+	mutex_unlock(&taskstats_exit_mutex);
+	return;
+}
+
+static struct genl_ops taskstats_ops = {
+	.cmd		= TASKSTATS_CMD_GET,
+	.doit		= taskstats_send_stats,
+	.policy		= taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+	taskstats_cache = kmem_cache_create("taskstats_cache",
+						sizeof(struct taskstats),
+						0, SLAB_PANIC, NULL, NULL);
+}
+
+static int __init taskstats_init(void)
+{
+	int rc;
+
+	rc = genl_register_family(&family);
+	if (rc)
+		return rc;
+
+	rc = genl_register_ops(&family, &taskstats_ops);
+	if (rc < 0)
+		goto err;
+
+	family_registered = 1;
+	return 0;
+err:
+	genl_unregister_family(&family);
+	return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
-- 
cgit v1.3-7-g2ca7


From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:44 -0700
Subject: [PATCH] delay accounting taskstats interface send tgid once

Send per-tgid data only once during exit of a thread group instead of once
with each member thread exit.

Currently, when a thread exits, besides its per-tid data, the per-tgid data
of its thread group is also sent out, if its thread group is non-empty.
The per-tgid data sent consists of the sum of per-tid stats for all
*remaining* threads of the thread group.

This patch modifies this sending in two ways:

- the per-tgid data is sent only when the last thread of a thread group
  exits.  This cuts down heavily on the overhead of sending/receiving
  per-tgid data, especially when other exploiters of the taskstats
  interface aren't interested in per-tgid stats

- the semantics of the per-tgid data sent are changed.  Instead of being
  the sum of per-tid data for remaining threads, the value now sent is the
  true total accumalated statistics for all threads that are/were part of
  the thread group.

The patch also addresses a minor issue where failure of one accounting
subsystem to fill in the taskstats structure was causing the send of
taskstats to not be sent at all.

The patch has been tested for stability and run cerberus for over 4 hours
on an SMP.

[akpm@osdl.org: bugfixes]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/accounting/delay-accounting.txt | 13 ++--
 Documentation/accounting/taskstats.txt        | 33 ++++-----
 MAINTAINERS                                   | 12 ++++
 include/linux/sched.h                         |  4 ++
 include/linux/taskstats_kern.h                | 71 ++++++++++++++-----
 kernel/exit.c                                 |  8 +--
 kernel/fork.c                                 |  4 ++
 kernel/taskstats.c                            | 98 ++++++++++++++++++---------
 8 files changed, 162 insertions(+), 81 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt
index f3dc0ca04fa4..be215e58423b 100644
--- a/Documentation/accounting/delay-accounting.txt
+++ b/Documentation/accounting/delay-accounting.txt
@@ -48,9 +48,10 @@ counter (say cpu_delay_total) for a task will give the delay
 experienced by the task waiting for the corresponding resource
 in that interval.
 
-When a task exits, records containing the per-task and per-process statistics
-are sent to userspace without requiring a command. More details are given in
-the taskstats interface description.
+When a task exits, records containing the per-task statistics
+are sent to userspace without requiring a command. If it is the last exiting
+task of a thread group, the per-tgid statistics are also sent. More details
+are given in the taskstats interface description.
 
 The getdelays.c userspace utility in this directory allows simple commands to
 be run and the corresponding delay statistics to be displayed. It also serves
@@ -107,9 +108,3 @@ IO	count	delay total
 	0	0
 MEM	count	delay total
 	0	0
-
-
-
-
-
-
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
index acc6b4f37fc7..efd8f605bcd5 100644
--- a/Documentation/accounting/taskstats.txt
+++ b/Documentation/accounting/taskstats.txt
@@ -32,12 +32,11 @@ The response contains statistics for a task (if pid is specified) or the sum of
 statistics for all tasks of the process (if tgid is specified).
 
 To obtain statistics for tasks which are exiting, userspace opens a multicast
-netlink socket. Each time a task exits, two records are sent by the kernel to
-each listener on the multicast socket. The first the per-pid task's statistics
-and the second is the sum for all tasks of the process to which the task
-belongs (the task does not need to be the thread group leader). The need for
-per-tgid stats to be sent for each exiting task is explained in the per-tgid
-stats section below.
+netlink socket. Each time a task exits, its per-pid statistics is always sent
+by the kernel to each listener on the multicast socket. In addition, if it is
+the last thread exiting its thread group, an additional record containing the
+per-tgid stats are also sent. The latter contains the sum of per-pid stats for
+all threads in the thread group, both past and present.
 
 getdelays.c is a simple utility demonstrating usage of the taskstats interface
 for reporting delay accounting statistics.
@@ -104,20 +103,14 @@ stats in userspace alone is inefficient and potentially inaccurate (due to lack
 of atomicity).
 
 However, maintaining per-process, in addition to per-task stats, within the
-kernel has space and time overheads. Hence the taskstats implementation
-dynamically sums up the per-task stats for each task belonging to a process
-whenever per-process stats are needed.
-
-Not maintaining per-tgid stats creates a problem when userspace is interested
-in getting these stats when the process dies i.e. the last thread of
-a process exits. It isn't possible to simply return some aggregated per-process
-statistic from the kernel.
-
-The approach taken by taskstats is to return the per-tgid stats *each* time
-a task exits, in addition to the per-pid stats for that task. Userspace can
-maintain task<->process mappings and use them to maintain the per-process stats
-in userspace, updating the aggregate appropriately as the tasks of a process
-exit.
+kernel has space and time overheads. To address this, the taskstats code
+accumalates each exiting task's statistics into a process-wide data structure.
+When the last task of a process exits, the process level data accumalated also
+gets sent to userspace (along with the per-task data).
+
+When a user queries to get per-tgid data, the sum of all other live threads in
+the group is added up and added to the accumalated total for previously exited
+threads of the same thread group.
 
 Extending taskstats
 -------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 0557cfde053d..e99028ca2f7c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2240,6 +2240,12 @@ M:	tsbogend@alpha.franken.de
 L:	netdev@vger.kernel.org
 S:	Maintained
 
+PER-TASK DELAY ACCOUNTING
+P:	Shailabh Nagar
+M:	nagar@watson.ibm.com
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 PERSONALITY HANDLING
 P:	Christoph Hellwig
 M:	hch@infradead.org
@@ -2767,6 +2773,12 @@ P:	Deepak Saxena
 M:	dsaxena@plexity.net
 S:	Maintained
 
+TASKSTATS STATISTICS INTERFACE
+P:	Shailabh Nagar
+M:	nagar@watson.ibm.com
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 TI PARALLEL LINK CABLE DRIVER
 P:     Romain Lievin
 M:     roms@lpg.ticalc.org
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c5610ca0c92..6afa72e080cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -463,6 +463,10 @@ struct signal_struct {
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct pacct_struct pacct;	/* per-process accounting information */
 #endif
+#ifdef CONFIG_TASKSTATS
+	spinlock_t stats_lock;
+	struct taskstats *stats;
+#endif
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index fc9da2e26443..0ae8f67af1fd 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -19,36 +19,75 @@ enum {
 extern kmem_cache_t *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
-					struct taskstats **ptgidstats)
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
 {
 	*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-	*ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
 }
 
-static inline void taskstats_exit_free(struct taskstats *tidstats,
-					struct taskstats *tgidstats)
+static inline void taskstats_exit_free(struct taskstats *tidstats)
 {
 	if (tidstats)
 		kmem_cache_free(taskstats_cache, tidstats);
-	if (tgidstats)
-		kmem_cache_free(taskstats_cache, tgidstats);
 }
 
-extern void taskstats_exit_send(struct task_struct *, struct taskstats *,
-				struct taskstats *);
-extern void taskstats_init_early(void);
+static inline void taskstats_tgid_init(struct signal_struct *sig)
+{
+	spin_lock_init(&sig->stats_lock);
+	sig->stats = NULL;
+}
+
+static inline void taskstats_tgid_alloc(struct signal_struct *sig)
+{
+	struct taskstats *stats;
+	unsigned long flags;
+
+	stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!stats)
+		return;
+
+	spin_lock_irqsave(&sig->stats_lock, flags);
+	if (!sig->stats) {
+		sig->stats = stats;
+		stats = NULL;
+	}
+	spin_unlock_irqrestore(&sig->stats_lock, flags);
+
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+}
 
+static inline void taskstats_tgid_free(struct signal_struct *sig)
+{
+	struct taskstats *stats = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sig->stats_lock, flags);
+	if (sig->stats) {
+		stats = sig->stats;
+		sig->stats = NULL;
+	}
+	spin_unlock_irqrestore(&sig->stats_lock, flags);
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+}
+
+extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int);
+extern void taskstats_init_early(void);
+extern void taskstats_tgid_alloc(struct signal_struct *);
 #else
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
-					struct taskstats **ptgidstats)
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
 {}
-static inline void taskstats_exit_free(struct taskstats *ptidstats,
-					struct taskstats *ptgidstats)
+static inline void taskstats_exit_free(struct taskstats *ptidstats)
 {}
 static inline void taskstats_exit_send(struct task_struct *tsk,
-					struct taskstats *tidstats,
-					struct taskstats *tgidstats)
+				       struct taskstats *tidstats,
+				       int group_dead)
+{}
+static inline void taskstats_tgid_init(struct signal_struct *sig)
+{}
+static inline void taskstats_tgid_alloc(struct signal_struct *sig)
+{}
+static inline void taskstats_tgid_free(struct signal_struct *sig)
 {}
 static inline void taskstats_init_early(void)
 {}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9852ed8c2988..67c1e9a4f812 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats, *tgidstats;
+	struct taskstats *tidstats;
 	int group_dead;
 
 	profile_task_exit(tsk);
@@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &tgidstats);
+	taskstats_exit_alloc(&tidstats);
 
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
@@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, tgidstats);
-	taskstats_exit_free(tidstats, tgidstats);
+	taskstats_exit_send(tsk, tidstats, group_dead);
+	taskstats_exit_free(tidstats);
 	delayacct_tsk_exit(tsk);
 
 	exit_mm(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 451cfd35bf22..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -44,6 +44,7 @@
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
+		taskstats_tgid_alloc(current->signal);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
+	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea9506de3b85..4a0a5022b299 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		struct taskstats *stats)
 {
-	int rc;
 	struct task_struct *tsk, *first;
+	unsigned long flags;
 
+	/*
+	 * Add additional stats from live tasks except zombie thread group
+	 * leaders who are already counted with the dead tasks
+	 */
 	first = tgidtsk;
-	read_lock(&tasklist_lock);
 	if (!first) {
+		read_lock(&tasklist_lock);
 		first = find_task_by_pid(tgid);
 		if (!first) {
 			read_unlock(&tasklist_lock);
 			return -ESRCH;
 		}
-	}
+		get_task_struct(first);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(first);
+
+	/* Start with stats from dead tasks */
+	spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
+	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
 	tsk = first;
+	read_lock(&tasklist_lock);
 	do {
+		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+			continue;
 		/*
-		 * Each accounting subsystem adds calls its functions to
+		 * Accounting subsystem can call its functions here to
 		 * fill in relevant parts of struct taskstsats as follows
 		 *
-		 *	rc = per-task-foo(stats, tsk);
-		 *	if (rc)
-		 *		break;
+		 *	per-task-foo(stats, tsk);
 		 */
-
-		rc = delayacct_add_tsk(stats, tsk);
-		if (rc)
-			break;
+		delayacct_add_tsk(stats, tsk);
 
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
 	stats->version = TASKSTATS_VERSION;
 
-
 	/*
-	 * Accounting subsytems can also add calls here if they don't
-	 * wish to aggregate statistics for per-tgid stats
+	 * Accounting subsytems can also add calls here to modify
+	 * fields of taskstats.
 	 */
 
-	return rc;
+	return 0;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	if (!tsk->signal->stats)
+		goto ret;
+
+	/*
+	 * Each accounting subsystem calls its functions here to
+	 * accumalate its per-task stats for tsk, into the per-tgid structure
+	 *
+	 *	per-task-foo(tsk->signal->stats, tsk);
+	 */
+	delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+	return;
 }
 
+
 static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
@@ -230,7 +263,7 @@ err:
 
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			struct taskstats *tgidstats)
+			int group_dead)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size_t size;
 	int is_thread_group;
 	struct nlattr *na;
+	unsigned long flags;
 
 	if (!family_registered || !tidstats)
 		return;
 
-	is_thread_group = !thread_group_empty(tsk);
-	rc = 0;
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	is_thread_group = tsk->signal->stats ? 1 : 0;
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
 
+	rc = 0;
 	/*
 	 * Size includes space for nested attributes
 	 */
@@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 			*tidstats);
 	nla_nest_end(rep_skb, na);
 
-	if (!is_thread_group || !tgidstats) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+	if (!is_thread_group)
+		goto send;
 
-	rc = fill_tgid(tsk->pid, tsk, tgidstats);
 	/*
-	 * If fill_tgid() failed then one probable reason could be that the
-	 * thread group leader has exited. fill_tgid() will fail, send out
-	 * the pid statistics collected earlier.
+	 * tsk has/had a thread group so fill the tsk->signal->stats structure
+	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (rc < 0) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+
+	fill_tgid_exit(tsk);
+	if (!group_dead)
+		goto send;
 
 	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
 	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	/* No locking needed for tsk->signal->stats since group is dead */
 	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tgidstats);
+			*tsk->signal->stats);
 	nla_nest_end(rep_skb, na);
 
+send:
 	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-	goto ret;
+	return;
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
-- 
cgit v1.3-7-g2ca7


From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] per-task delay accounting taskstats interface: control exit
 data through cpumasks

On systems with a large number of cpus, with even a modest rate of tasks
exiting per cpu, the volume of taskstats data sent on thread exit can
overflow a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for a
limited and specific set of cpus.  By scaling the number of listeners
and/or the cpus they monitor, userspace can handle the statistical data
overload more gracefully.

In this patch, each listener registers to listen to a specific set of cpus
by specifying a cpumask.  The interest is recorded per-cpu.  When a task
exits on a cpu, its taskstats data is unicast to each listener interested
in that cpu.

Thanks to Andrew Morton for pointing out the various scalability and
general concerns of previous attempts and for suggesting this design.

[akpm@osdl.org: build fix]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/taskstats.h      |   4 +-
 include/linux/taskstats_kern.h |  27 +-----
 kernel/exit.c                  |   5 +-
 kernel/taskstats.c             | 200 ++++++++++++++++++++++++++++++++++++++---
 4 files changed, 198 insertions(+), 38 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index c6aeca32348e..f1cb6cddd19d 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -91,8 +91,6 @@ struct taskstats {
 };
 
 
-#define TASKSTATS_LISTEN_GROUP	0x1
-
 /*
  * Commands sent from userspace
  * Not versioned. New commands should only be inserted at the enum's end
@@ -124,6 +122,8 @@ enum {
 	TASKSTATS_CMD_ATTR_UNSPEC = 0,
 	TASKSTATS_CMD_ATTR_PID,
 	TASKSTATS_CMD_ATTR_TGID,
+	TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+	TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
 	__TASKSTATS_CMD_ATTR_MAX,
 };
 
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 2b6adec3a2e4..16894b7edcc8 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -11,30 +11,10 @@
 #include <linux/sched.h>
 #include <net/genetlink.h>
 
-enum {
-	TASKSTATS_MSG_UNICAST,		/* send data only to requester */
-	TASKSTATS_MSG_MULTICAST,	/* send data to a group */
-};
-
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
-static inline int taskstats_has_listeners(void)
-{
-	if (!genl_sock)
-		return 0;
-	return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP);
-}
-
-
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
-{
-	*ptidstats = NULL;
-	if (taskstats_has_listeners())
-		*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-}
-
 static inline void taskstats_exit_free(struct taskstats *tidstats)
 {
 	if (tidstats)
@@ -82,17 +62,18 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)
 		kmem_cache_free(taskstats_cache, stats);
 }
 
-extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int);
+extern void taskstats_exit_alloc(struct taskstats **, unsigned int *);
+extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int);
 extern void taskstats_init_early(void);
 extern void taskstats_tgid_alloc(struct signal_struct *);
 #else
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 {}
 static inline void taskstats_exit_free(struct taskstats *ptidstats)
 {}
 static inline void taskstats_exit_send(struct task_struct *tsk,
 				       struct taskstats *tidstats,
-				       int group_dead)
+				       int group_dead, unsigned int cpu)
 {}
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {}
diff --git a/kernel/exit.c b/kernel/exit.c
index 67c1e9a4f812..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	struct task_struct *tsk = current;
 	struct taskstats *tidstats;
 	int group_dead;
+	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats);
+	taskstats_exit_alloc(&tidstats, &mycpu);
 
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
@@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead);
+	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
 	taskstats_exit_free(tidstats);
 	delayacct_tsk_exit(tsk);
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a0a5022b299..abb59e323544 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -19,9 +19,17 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)
+
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
@@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
 __read_mostly = {
 	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
 	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+	struct list_head list;
+	pid_t pid;
 };
 
+struct listener_list {
+	struct rw_semaphore sem;
+	struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+	REGISTER,
+	DEREGISTER,
+	CPU_DONT_CARE
+};
 
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 			void **replyp, size_t size)
@@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	return 0;
 }
 
-static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	void *reply;
+	void *reply = genlmsg_data(genlhdr);
 	int rc;
 
-	reply = genlmsg_data(genlhdr);
-
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
 		return rc;
 	}
 
-	if (event == TASKSTATS_MSG_MULTICAST)
-		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
 	return genlmsg_unicast(skb, pid);
 }
 
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	struct sk_buff *skb_next, *skb_cur = skb;
+	void *reply = genlmsg_data(genlhdr);
+	int rc, ret;
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	rc = 0;
+	listeners = &per_cpu(listener_array, cpu);
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		skb_next = NULL;
+		if (!list_is_last(&s->list, &listeners->list)) {
+			skb_next = skb_clone(skb_cur, GFP_KERNEL);
+			if (!skb_next) {
+				nlmsg_free(skb_cur);
+				rc = -ENOMEM;
+				break;
+			}
+		}
+		ret = genlmsg_unicast(skb_cur, s->pid);
+		if (ret == -ECONNREFUSED) {
+			list_del(&s->list);
+			kfree(s);
+			rc = ret;
+		}
+		skb_cur = skb_next;
+	}
+	up_write(&listeners->sem);
+
+	return rc;
+}
+
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
@@ -204,8 +272,73 @@ ret:
 	return;
 }
 
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	unsigned int cpu;
+	cpumask_t mask = *maskp;
 
-static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+	if (!cpus_subset(mask, cpu_possible_map))
+		return -EINVAL;
+
+	if (isadd == REGISTER) {
+		for_each_cpu_mask(cpu, mask) {
+			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+					 cpu_to_node(cpu));
+			if (!s)
+				goto cleanup;
+			s->pid = pid;
+			INIT_LIST_HEAD(&s->list);
+
+			listeners = &per_cpu(listener_array, cpu);
+			down_write(&listeners->sem);
+			list_add(&s->list, &listeners->list);
+			up_write(&listeners->sem);
+		}
+		return 0;
+	}
+
+	/* Deregister or cleanup */
+cleanup:
+	for_each_cpu_mask(cpu, mask) {
+		listeners = &per_cpu(listener_array, cpu);
+		down_write(&listeners->sem);
+		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+			if (s->pid == pid) {
+				list_del(&s->list);
+				kfree(s);
+				break;
+			}
+		}
+		up_write(&listeners->sem);
+	}
+	return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+	char *data;
+	int len;
+	int ret;
+
+	if (na == NULL)
+		return 1;
+	len = nla_len(na);
+	if (len > TASKSTATS_CPUMASK_MAXLEN)
+		return -E2BIG;
+	if (len < 1)
+		return -EINVAL;
+	data = kmalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	nla_strlcpy(data, na, len);
+	ret = cpulist_parse(data, *mask);
+	kfree(data);
+	return ret;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
@@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 	void *reply;
 	size_t size;
 	struct nlattr *na;
+	cpumask_t mask;
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
 
 	/*
 	 * Size includes space for nested attributes
@@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 
 	nla_nest_end(rep_skb, na);
 
-	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
 	return genlmsg_cancel(rep_skb, reply);
@@ -261,9 +407,35 @@ err:
 	return rc;
 }
 
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+	struct listener_list *listeners;
+	struct taskstats *tmp;
+	/*
+	 * This is the cpu on which the task is exiting currently and will
+	 * be the one for which the exit event is sent, even if the cpu
+	 * on which this function is running changes later.
+	 */
+	*mycpu = raw_smp_processor_id();
+
+	*ptidstats = NULL;
+	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!tmp)
+		return;
+
+	listeners = &per_cpu(listener_array, *mycpu);
+	down_read(&listeners->sem);
+	if (!list_empty(&listeners->list)) {
+		*ptidstats = tmp;
+		tmp = NULL;
+	}
+	up_read(&listeners->sem);
+	kfree(tmp);
+}
+
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			int group_dead)
+			int group_dead, unsigned int mycpu)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	send_cpu_listeners(rep_skb, mycpu);
 	return;
 
 nla_put_failure:
@@ -338,16 +510,22 @@ ret:
 
 static struct genl_ops taskstats_ops = {
 	.cmd		= TASKSTATS_CMD_GET,
-	.doit		= taskstats_send_stats,
+	.doit		= taskstats_user_cmd,
 	.policy		= taskstats_cmd_get_policy,
 };
 
 /* Needed early in initialization */
 void __init taskstats_init_early(void)
 {
+	unsigned int i;
+
 	taskstats_cache = kmem_cache_create("taskstats_cache",
 						sizeof(struct taskstats),
 						0, SLAB_PANIC, NULL, NULL);
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+		init_rwsem(&(per_cpu(listener_array, i).sem));
+	}
 }
 
 static int __init taskstats_init(void)
-- 
cgit v1.3-7-g2ca7


From 35df17c57cecb08f0120fb18926325f1093dc429 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Thu, 31 Aug 2006 21:27:38 -0700
Subject: [PATCH] task delay accounting fixes

Cleanup allocation and freeing of tsk->delays used by delay accounting.
This solves two problems reported for delay accounting:

1. oops in __delayacct_blkio_ticks
http://www.uwsg.indiana.edu/hypermail/linux/kernel/0608.2/1844.html

Currently tsk->delays is getting freed too early in task exit which can
cause a NULL tsk->delays to get accessed via reading of /proc/<tgid>/stats.
 The patch fixes this problem by freeing tsk->delays closer to when
task_struct itself is freed up.  As a result, it also eliminates the use of
tsk->delays_lock which was only being used (inadequately) to safeguard
access to tsk->delays while a task was exiting.

2. Possible memory leak in kernel/delayacct.c
http://www.uwsg.indiana.edu/hypermail/linux/kernel/0608.2/1389.html

The patch cleans up tsk->delays allocations after a bad fork which was
missing earlier.

The patch has been tested to fix the problems listed above and stress
tested with rapid calls to delay accounting's taskstats command interface
(which is the other path that can access the same data, besides the /proc
interface causing the oops above).

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/delayacct.h | 10 +++++++---
 include/linux/sched.h     |  1 -
 kernel/delayacct.c        | 16 ----------------
 kernel/exit.c             |  1 -
 kernel/fork.c             |  6 ++++--
 5 files changed, 11 insertions(+), 23 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 11487b6e7127..561e2a77805c 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -59,10 +59,14 @@ static inline void delayacct_tsk_init(struct task_struct *tsk)
 		__delayacct_tsk_init(tsk);
 }
 
-static inline void delayacct_tsk_exit(struct task_struct *tsk)
+/* Free tsk->delays. Called from bad fork and __put_task_struct
+ * where there's no risk of tsk->delays being accessed elsewhere
+ */
+static inline void delayacct_tsk_free(struct task_struct *tsk)
 {
 	if (tsk->delays)
-		__delayacct_tsk_exit(tsk);
+		kmem_cache_free(delayacct_cache, tsk->delays);
+	tsk->delays = NULL;
 }
 
 static inline void delayacct_blkio_start(void)
@@ -101,7 +105,7 @@ static inline void delayacct_init(void)
 {}
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {}
-static inline void delayacct_tsk_exit(struct task_struct *tsk)
+static inline void delayacct_tsk_free(struct task_struct *tsk)
 {}
 static inline void delayacct_blkio_start(void)
 {}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6674fc1e51bf..34ed0d99b1bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -994,7 +994,6 @@ struct task_struct {
 	 */
 	struct pipe_inode_info *splice_pipe;
 #ifdef	CONFIG_TASK_DELAY_ACCT
-	spinlock_t delays_lock;
 	struct task_delay_info *delays;
 #endif
 };
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 57ca3730205d..36752f124c6a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -41,24 +41,11 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
-	spin_lock_init(&tsk->delays_lock);
-	/* No need to acquire tsk->delays_lock for allocation here unless
-	   __delayacct_tsk_init called after tsk is attached to tasklist
-	*/
 	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
 }
 
-void __delayacct_tsk_exit(struct task_struct *tsk)
-{
-	struct task_delay_info *delays = tsk->delays;
-	spin_lock(&tsk->delays_lock);
-	tsk->delays = NULL;
-	spin_unlock(&tsk->delays_lock);
-	kmem_cache_free(delayacct_cache, delays);
-}
-
 /*
  * Start accounting for a delay statistic using
  * its starting timestamp (@start)
@@ -118,8 +105,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	struct timespec ts;
 	unsigned long t1,t2,t3;
 
-	spin_lock(&tsk->delays_lock);
-
 	/* Though tsk->delays accessed later, early exit avoids
 	 * unnecessary returning of other data
 	 */
@@ -161,7 +146,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	spin_unlock(&tsk->delays->lock);
 
 done:
-	spin_unlock(&tsk->delays_lock);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index dba194a8d416..a4c19a52ce46 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,7 +908,6 @@ fastcall NORET_TYPE void do_exit(long code)
 		audit_free(tsk);
 	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
 	taskstats_exit_free(tidstats);
-	delayacct_tsk_exit(tsk);
 
 	exit_mm(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index aa36c43783cc..f9b014e3e700 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -117,6 +117,7 @@ void __put_task_struct(struct task_struct *tsk)
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
+	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
@@ -1011,7 +1012,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	retval = -EFAULT;
 	if (clone_flags & CLONE_PARENT_SETTID)
 		if (put_user(p->pid, parent_tidptr))
-			goto bad_fork_cleanup;
+			goto bad_fork_cleanup_delays_binfmt;
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
@@ -1277,7 +1278,8 @@ bad_fork_cleanup_policy:
 bad_fork_cleanup_cpuset:
 #endif
 	cpuset_exit(p);
-bad_fork_cleanup:
+bad_fork_cleanup_delays_binfmt:
+	delayacct_tsk_free(p);
 	if (p->binfmt)
 		module_put(p->binfmt->module);
 bad_fork_cleanup_put_domain:
-- 
cgit v1.3-7-g2ca7


From 3b6362b833b9f7a9d4222cf1bb35f99c411abb31 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 2 Sep 2006 21:22:16 +0400
Subject: [PATCH] eligible_child: remove an obsolete ->tgid check

It is not possible to find a sub-thread in ->children/->ptrace_children
lists, ptrace_attach() does not allow to attach to sub-threads.

Even if it was possible to ptrace the task from the same thread group,
we can't allow to release ->group_leader while there are others (ptracer)
threads in the same group.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index a4c19a52ce46..d891883420f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1053,7 +1053,7 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
 	 * Do not consider thread group leaders that are
 	 * in a non-empty thread group:
 	 */
-	if (current->tgid != p->tgid && delay_group_leader(p))
+	if (delay_group_leader(p))
 		return 2;
 
 	if (security_task_wait(p))
-- 
cgit v1.3-7-g2ca7